1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.lang3.text; 18 19 import java.util.ArrayList; 20 import java.util.Arrays; 21 import java.util.Collections; 22 import java.util.List; 23 import java.util.ListIterator; 24 import java.util.NoSuchElementException; 25 import java.util.StringTokenizer; 26 27 import org.apache.commons.lang3.ArrayUtils; 28 import org.apache.commons.lang3.StringUtils; 29 30 /** 31 * Tokenizes a string based on delimiters (separators) 32 * and supporting quoting and ignored character concepts. 33 * <p> 34 * This class can split a String into many smaller strings. It aims 35 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 36 * however it offers much more control and flexibility including implementing 37 * the {@link ListIterator} interface. By default, it is set up 38 * like {@link StringTokenizer}. 39 * </p> 40 * <p> 41 * The input String is split into a number of <i>tokens</i>. 42 * Each token is separated from the next String by a <i>delimiter</i>. 43 * One or more delimiter characters must be specified. 44 * </p> 45 * <p> 46 * Each token may be surrounded by quotes. 47 * The <i>quote</i> matcher specifies the quote character(s). 48 * A quote may be escaped within a quoted section by duplicating itself. 49 * </p> 50 * <p> 51 * Between each token and the delimiter are potentially characters that need trimming. 52 * The <i>trimmer</i> matcher specifies these characters. 53 * One usage might be to trim whitespace characters. 54 * </p> 55 * <p> 56 * At any point outside the quotes there might potentially be invalid characters. 57 * The <i>ignored</i> matcher specifies these characters to be removed. 58 * One usage might be to remove new line characters. 59 * </p> 60 * <p> 61 * Empty tokens may be removed or returned as null. 62 * </p> 63 * <pre> 64 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 65 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 66 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 67 * </pre> 68 * 69 * <table> 70 * <caption>StrTokenizer properties and options</caption> 71 * <tr> 72 * <th>Property</th><th>Type</th><th>Default</th> 73 * </tr> 74 * <tr> 75 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 76 * </tr> 77 * <tr> 78 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 79 * </tr> 80 * <tr> 81 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 82 * </tr> 83 * <tr> 84 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 85 * </tr> 86 * <tr> 87 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 88 * </tr> 89 * </table> 90 * 91 * @since 2.2 92 * @deprecated As of 3.6, use Apache Commons Text 93 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html"> 94 * StringTokenizer</a> instead 95 */ 96 @Deprecated 97 public class StrTokenizer implements ListIterator<String>, Cloneable { 98 99 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE; 100 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE; 101 static { 102 CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); 103 CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher()); 104 CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); 105 CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); 106 CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); 107 CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); 108 CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); 109 110 TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); 111 TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher()); 112 TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); 113 TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); 114 TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); 115 TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); 116 TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); 117 } 118 119 /** 120 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 121 * 122 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 123 */ 124 private static StrTokenizer getCSVClone() { 125 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 126 } 127 /** 128 * Gets a new tokenizer instance which parses Comma Separated Value strings 129 * initializing it with the given input. The default for CSV processing 130 * will be trim whitespace from both ends (which can be overridden with 131 * the setTrimmer method). 132 * <p> 133 * You must call a "reset" method to set the string which you want to parse. 134 * </p> 135 * @return a new tokenizer instance which parses Comma Separated Value strings 136 */ 137 public static StrTokenizer getCSVInstance() { 138 return getCSVClone(); 139 } 140 /** 141 * Gets a new tokenizer instance which parses Comma Separated Value strings 142 * initializing it with the given input. The default for CSV processing 143 * will be trim whitespace from both ends (which can be overridden with 144 * the setTrimmer method). 145 * 146 * @param input the text to parse 147 * @return a new tokenizer instance which parses Comma Separated Value strings 148 */ 149 public static StrTokenizer getCSVInstance(final char[] input) { 150 final StrTokenizer tok = getCSVClone(); 151 tok.reset(input); 152 return tok; 153 } 154 155 /** 156 * Gets a new tokenizer instance which parses Comma Separated Value strings 157 * initializing it with the given input. The default for CSV processing 158 * will be trim whitespace from both ends (which can be overridden with 159 * the setTrimmer method). 160 * 161 * @param input the text to parse 162 * @return a new tokenizer instance which parses Comma Separated Value strings 163 */ 164 public static StrTokenizer getCSVInstance(final String input) { 165 final StrTokenizer tok = getCSVClone(); 166 tok.reset(input); 167 return tok; 168 } 169 /** 170 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 171 * 172 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 173 */ 174 private static StrTokenizer getTSVClone() { 175 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 176 } 177 /** 178 * Gets a new tokenizer instance which parses Tab Separated Value strings. 179 * The default for CSV processing will be trim whitespace from both ends 180 * (which can be overridden with the setTrimmer method). 181 * <p> 182 * You must call a "reset" method to set the string which you want to parse. 183 * </p> 184 * @return a new tokenizer instance which parses Tab Separated Value strings. 185 */ 186 public static StrTokenizer getTSVInstance() { 187 return getTSVClone(); 188 } 189 /** 190 * Gets a new tokenizer instance which parses Tab Separated Value strings. 191 * The default for CSV processing will be trim whitespace from both ends 192 * (which can be overridden with the setTrimmer method). 193 * @param input the string to parse 194 * @return a new tokenizer instance which parses Tab Separated Value strings. 195 */ 196 public static StrTokenizer getTSVInstance(final char[] input) { 197 final StrTokenizer tok = getTSVClone(); 198 tok.reset(input); 199 return tok; 200 } 201 202 /** 203 * Gets a new tokenizer instance which parses Tab Separated Value strings. 204 * The default for CSV processing will be trim whitespace from both ends 205 * (which can be overridden with the setTrimmer method). 206 * @param input the string to parse 207 * @return a new tokenizer instance which parses Tab Separated Value strings. 208 */ 209 public static StrTokenizer getTSVInstance(final String input) { 210 final StrTokenizer tok = getTSVClone(); 211 tok.reset(input); 212 return tok; 213 } 214 /** The text to work on. */ 215 private char[] chars; 216 217 218 /** The parsed tokens */ 219 private String[] tokens; 220 221 /** The current iteration position */ 222 private int tokenPos; 223 224 /** The delimiter matcher */ 225 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 226 227 /** The quote matcher */ 228 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 229 230 /** The ignored matcher */ 231 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 232 233 234 /** The trimmer matcher */ 235 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 236 237 /** Whether to return empty tokens as null */ 238 private boolean emptyAsNull; 239 240 /** Whether to ignore empty tokens */ 241 private boolean ignoreEmptyTokens = true; 242 243 /** 244 * Constructs a tokenizer splitting on space, tab, newline and formfeed 245 * as per StringTokenizer, but with no text to tokenize. 246 * <p> 247 * This constructor is normally used with {@link #reset(String)}. 248 * </p> 249 */ 250 public StrTokenizer() { 251 this.chars = null; 252 } 253 254 /** 255 * Constructs a tokenizer splitting on space, tab, newline and formfeed 256 * as per StringTokenizer. 257 * 258 * @param input the string which is to be parsed, not cloned 259 */ 260 public StrTokenizer(final char[] input) { 261 this.chars = ArrayUtils.clone(input); 262 } 263 264 /** 265 * Constructs a tokenizer splitting on the specified character. 266 * 267 * @param input the string which is to be parsed, not cloned 268 * @param delim the field delimiter character 269 */ 270 public StrTokenizer(final char[] input, final char delim) { 271 this(input); 272 setDelimiterChar(delim); 273 } 274 275 /** 276 * Constructs a tokenizer splitting on the specified delimiter character 277 * and handling quotes using the specified quote character. 278 * 279 * @param input the string which is to be parsed, not cloned 280 * @param delim the field delimiter character 281 * @param quote the field quoted string character 282 */ 283 public StrTokenizer(final char[] input, final char delim, final char quote) { 284 this(input, delim); 285 setQuoteChar(quote); 286 } 287 288 /** 289 * Constructs a tokenizer splitting on the specified string. 290 * 291 * @param input the string which is to be parsed, not cloned 292 * @param delim the field delimiter string 293 */ 294 public StrTokenizer(final char[] input, final String delim) { 295 this(input); 296 setDelimiterString(delim); 297 } 298 299 /** 300 * Constructs a tokenizer splitting using the specified delimiter matcher. 301 * 302 * @param input the string which is to be parsed, not cloned 303 * @param delim the field delimiter matcher 304 */ 305 public StrTokenizer(final char[] input, final StrMatcher delim) { 306 this(input); 307 setDelimiterMatcher(delim); 308 } 309 310 /** 311 * Constructs a tokenizer splitting using the specified delimiter matcher 312 * and handling quotes using the specified quote matcher. 313 * 314 * @param input the string which is to be parsed, not cloned 315 * @param delim the field delimiter character 316 * @param quote the field quoted string character 317 */ 318 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 319 this(input, delim); 320 setQuoteMatcher(quote); 321 } 322 323 /** 324 * Constructs a tokenizer splitting on space, tab, newline and formfeed 325 * as per StringTokenizer. 326 * 327 * @param input the string which is to be parsed 328 */ 329 public StrTokenizer(final String input) { 330 if (input != null) { 331 chars = input.toCharArray(); 332 } else { 333 chars = null; 334 } 335 } 336 337 /** 338 * Constructs a tokenizer splitting on the specified delimiter character. 339 * 340 * @param input the string which is to be parsed 341 * @param delim the field delimiter character 342 */ 343 public StrTokenizer(final String input, final char delim) { 344 this(input); 345 setDelimiterChar(delim); 346 } 347 348 /** 349 * Constructs a tokenizer splitting on the specified delimiter character 350 * and handling quotes using the specified quote character. 351 * 352 * @param input the string which is to be parsed 353 * @param delim the field delimiter character 354 * @param quote the field quoted string character 355 */ 356 public StrTokenizer(final String input, final char delim, final char quote) { 357 this(input, delim); 358 setQuoteChar(quote); 359 } 360 361 /** 362 * Constructs a tokenizer splitting on the specified delimiter string. 363 * 364 * @param input the string which is to be parsed 365 * @param delim the field delimiter string 366 */ 367 public StrTokenizer(final String input, final String delim) { 368 this(input); 369 setDelimiterString(delim); 370 } 371 372 /** 373 * Constructs a tokenizer splitting using the specified delimiter matcher. 374 * 375 * @param input the string which is to be parsed 376 * @param delim the field delimiter matcher 377 */ 378 public StrTokenizer(final String input, final StrMatcher delim) { 379 this(input); 380 setDelimiterMatcher(delim); 381 } 382 383 /** 384 * Constructs a tokenizer splitting using the specified delimiter matcher 385 * and handling quotes using the specified quote matcher. 386 * 387 * @param input the string which is to be parsed 388 * @param delim the field delimiter matcher 389 * @param quote the field quoted string matcher 390 */ 391 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 392 this(input, delim); 393 setQuoteMatcher(quote); 394 } 395 396 /** 397 * Unsupported ListIterator operation. 398 * @param obj this parameter ignored. 399 * @throws UnsupportedOperationException always 400 */ 401 @Override 402 public void add(final String obj) { 403 throw new UnsupportedOperationException("add() is unsupported"); 404 } 405 406 /** 407 * Adds a token to a list, paying attention to the parameters we've set. 408 * 409 * @param list the list to add to 410 * @param tok the token to add 411 */ 412 private void addToken(final List<String> list, String tok) { 413 if (StringUtils.isEmpty(tok)) { 414 if (isIgnoreEmptyTokens()) { 415 return; 416 } 417 if (isEmptyTokenAsNull()) { 418 tok = null; 419 } 420 } 421 list.add(tok); 422 } 423 424 /** 425 * Checks if tokenization has been done, and if not then do it. 426 */ 427 private void checkTokenized() { 428 if (tokens == null) { 429 if (chars == null) { 430 // still call tokenize as subclass may do some work 431 final List<String> split = tokenize(null, 0, 0); 432 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 433 } else { 434 final List<String> split = tokenize(chars, 0, chars.length); 435 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY); 436 } 437 } 438 } 439 440 /** 441 * Creates a new instance of this Tokenizer. The new instance is reset so 442 * that it will be at the start of the token list. 443 * If a {@link CloneNotSupportedException} is caught, return {@code null}. 444 * 445 * @return a new instance of this Tokenizer which has been reset. 446 */ 447 @Override 448 public Object clone() { 449 try { 450 return cloneReset(); 451 } catch (final CloneNotSupportedException ex) { 452 return null; 453 } 454 } 455 456 /** 457 * Creates a new instance of this Tokenizer. The new instance is reset so that 458 * it will be at the start of the token list. 459 * 460 * @return a new instance of this Tokenizer which has been reset. 461 * @throws CloneNotSupportedException if there is a problem cloning 462 */ 463 Object cloneReset() throws CloneNotSupportedException { 464 // this method exists to enable 100% test coverage 465 final StrTokenizer cloned = (StrTokenizer) super.clone(); 466 if (cloned.chars != null) { 467 cloned.chars = cloned.chars.clone(); 468 } 469 cloned.reset(); 470 return cloned; 471 } 472 473 /** 474 * Gets the String content that the tokenizer is parsing. 475 * 476 * @return the string content being parsed 477 */ 478 public String getContent() { 479 if (chars == null) { 480 return null; 481 } 482 return new String(chars); 483 } 484 485 /** 486 * Gets the field delimiter matcher. 487 * 488 * @return the delimiter matcher in use 489 */ 490 public StrMatcher getDelimiterMatcher() { 491 return this.delimMatcher; 492 } 493 494 // Ignored 495 /** 496 * Gets the ignored character matcher. 497 * <p> 498 * These characters are ignored when parsing the String, unless they are 499 * within a quoted region. 500 * The default value is not to ignore anything. 501 * </p> 502 * 503 * @return the ignored matcher in use 504 */ 505 public StrMatcher getIgnoredMatcher() { 506 return ignoredMatcher; 507 } 508 509 /** 510 * Gets the quote matcher currently in use. 511 * <p> 512 * The quote character is used to wrap data between the tokens. 513 * This enables delimiters to be entered as data. 514 * The default value is '"' (double quote). 515 * </p> 516 * 517 * @return the quote matcher in use 518 */ 519 public StrMatcher getQuoteMatcher() { 520 return quoteMatcher; 521 } 522 523 /** 524 * Gets a copy of the full token list as an independent modifiable array. 525 * 526 * @return the tokens as a String array 527 */ 528 public String[] getTokenArray() { 529 checkTokenized(); 530 return tokens.clone(); 531 } 532 533 /** 534 * Gets a copy of the full token list as an independent modifiable list. 535 * 536 * @return the tokens as a String array 537 */ 538 public List<String> getTokenList() { 539 checkTokenized(); 540 final List<String> list = new ArrayList<>(tokens.length); 541 list.addAll(Arrays.asList(tokens)); 542 return list; 543 } 544 545 /** 546 * Gets the trimmer character matcher. 547 * <p> 548 * These characters are trimmed off on each side of the delimiter 549 * until the token or quote is found. 550 * The default value is not to trim anything. 551 * </p> 552 * 553 * @return the trimmer matcher in use 554 */ 555 public StrMatcher getTrimmerMatcher() { 556 return trimmerMatcher; 557 } 558 559 /** 560 * Checks whether there are any more tokens. 561 * 562 * @return true if there are more tokens 563 */ 564 @Override 565 public boolean hasNext() { 566 checkTokenized(); 567 return tokenPos < tokens.length; 568 } 569 570 /** 571 * Checks whether there are any previous tokens that can be iterated to. 572 * 573 * @return true if there are previous tokens 574 */ 575 @Override 576 public boolean hasPrevious() { 577 checkTokenized(); 578 return tokenPos > 0; 579 } 580 581 /** 582 * Gets whether the tokenizer currently returns empty tokens as null. 583 * The default for this property is false. 584 * 585 * @return true if empty tokens are returned as null 586 */ 587 public boolean isEmptyTokenAsNull() { 588 return this.emptyAsNull; 589 } 590 591 /** 592 * Gets whether the tokenizer currently ignores empty tokens. 593 * The default for this property is true. 594 * 595 * @return true if empty tokens are not returned 596 */ 597 public boolean isIgnoreEmptyTokens() { 598 return ignoreEmptyTokens; 599 } 600 601 /** 602 * Checks if the characters at the index specified match the quote 603 * already matched in readNextToken(). 604 * 605 * @param srcChars the character array being tokenized 606 * @param pos the position to check for a quote 607 * @param len the length of the character array being tokenized 608 * @param quoteStart the start position of the matched quote, 0 if no quoting 609 * @param quoteLen the length of the matched quote, 0 if no quoting 610 * @return true if a quote is matched 611 */ 612 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { 613 for (int i = 0; i < quoteLen; i++) { 614 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 615 return false; 616 } 617 } 618 return true; 619 } 620 621 /** 622 * Gets the next token. 623 * 624 * @return the next String token 625 * @throws NoSuchElementException if there are no more elements 626 */ 627 @Override 628 public String next() { 629 if (hasNext()) { 630 return tokens[tokenPos++]; 631 } 632 throw new NoSuchElementException(); 633 } 634 635 /** 636 * Gets the index of the next token to return. 637 * 638 * @return the next token index 639 */ 640 @Override 641 public int nextIndex() { 642 return tokenPos; 643 } 644 645 /** 646 * Gets the next token from the String. 647 * Equivalent to {@link #next()} except it returns null rather than 648 * throwing {@link NoSuchElementException} when no tokens remain. 649 * 650 * @return the next sequential token, or null when no more tokens are found 651 */ 652 public String nextToken() { 653 if (hasNext()) { 654 return tokens[tokenPos++]; 655 } 656 return null; 657 } 658 659 /** 660 * Gets the token previous to the last returned token. 661 * 662 * @return the previous token 663 */ 664 @Override 665 public String previous() { 666 if (hasPrevious()) { 667 return tokens[--tokenPos]; 668 } 669 throw new NoSuchElementException(); 670 } 671 672 /** 673 * Gets the index of the previous token. 674 * 675 * @return the previous token index 676 */ 677 @Override 678 public int previousIndex() { 679 return tokenPos - 1; 680 } 681 682 /** 683 * Gets the previous token from the String. 684 * 685 * @return the previous sequential token, or null when no more tokens are found 686 */ 687 public String previousToken() { 688 if (hasPrevious()) { 689 return tokens[--tokenPos]; 690 } 691 return null; 692 } 693 694 /** 695 * Reads character by character through the String to get the next token. 696 * 697 * @param srcChars the character array being tokenized 698 * @param start the first character of field 699 * @param len the length of the character array being tokenized 700 * @param workArea a temporary work area 701 * @param tokenList the list of parsed tokens 702 * @return the starting position of the next field (the character 703 * immediately after the delimiter), or -1 if end of string found 704 */ 705 private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) { 706 // skip all leading whitespace, unless it is the 707 // field delimiter or the quote character 708 while (start < len) { 709 final int removeLen = Math.max( 710 getIgnoredMatcher().isMatch(srcChars, start, start, len), 711 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 712 if (removeLen == 0 || 713 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 || 714 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 715 break; 716 } 717 start += removeLen; 718 } 719 720 // handle reaching end 721 if (start >= len) { 722 addToken(tokenList, StringUtils.EMPTY); 723 return -1; 724 } 725 726 // handle empty token 727 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 728 if (delimLen > 0) { 729 addToken(tokenList, StringUtils.EMPTY); 730 return start + delimLen; 731 } 732 733 // handle found token 734 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 735 if (quoteLen > 0) { 736 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 737 } 738 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 739 } 740 741 /** 742 * Reads a possibly quoted string token. 743 * 744 * @param srcChars the character array being tokenized 745 * @param start the first character of field 746 * @param len the length of the character array being tokenized 747 * @param workArea a temporary work area 748 * @param tokenList the list of parsed tokens 749 * @param quoteStart the start position of the matched quote, 0 if no quoting 750 * @param quoteLen the length of the matched quote, 0 if no quoting 751 * @return the starting position of the next field (the character 752 * immediately after the delimiter, or if end of string found, 753 * then the length of string 754 */ 755 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 756 final List<String> tokenList, final int quoteStart, final int quoteLen) { 757 // Loop until we've found the end of the quoted 758 // string or the end of the input 759 workArea.clear(); 760 int pos = start; 761 boolean quoting = quoteLen > 0; 762 int trimStart = 0; 763 764 while (pos < len) { 765 // quoting mode can occur several times throughout a string 766 // we must switch between quoting and non-quoting until we 767 // encounter a non-quoted delimiter, or end of string 768 if (quoting) { 769 // In quoting mode 770 771 // If we've found a quote character, see if it's 772 // followed by a second quote. If so, then we need 773 // to actually put the quote character into the token 774 // rather than end the token. 775 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 776 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 777 // matched pair of quotes, thus an escaped quote 778 workArea.append(srcChars, pos, quoteLen); 779 pos += quoteLen * 2; 780 trimStart = workArea.size(); 781 continue; 782 } 783 784 // end of quoting 785 quoting = false; 786 pos += quoteLen; 787 continue; 788 } 789 790 } else { 791 // Not in quoting mode 792 793 // check for delimiter, and thus end of token 794 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 795 if (delimLen > 0) { 796 // return condition when end of token found 797 addToken(tokenList, workArea.substring(0, trimStart)); 798 return pos + delimLen; 799 } 800 801 // check for quote, and thus back into quoting mode 802 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 803 quoting = true; 804 pos += quoteLen; 805 continue; 806 } 807 808 // check for ignored (outside quotes), and ignore 809 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 810 if (ignoredLen > 0) { 811 pos += ignoredLen; 812 continue; 813 } 814 815 // check for trimmed character 816 // don't yet know if it's at the end, so copy to workArea 817 // use trimStart to keep track of trim at the end 818 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 819 if (trimmedLen > 0) { 820 workArea.append(srcChars, pos, trimmedLen); 821 pos += trimmedLen; 822 continue; 823 } 824 } 825 // copy regular character from inside quotes 826 workArea.append(srcChars[pos++]); 827 trimStart = workArea.size(); 828 } 829 830 // return condition when end of string found 831 addToken(tokenList, workArea.substring(0, trimStart)); 832 return -1; 833 } 834 835 /** 836 * Unsupported ListIterator operation. 837 * 838 * @throws UnsupportedOperationException always 839 */ 840 @Override 841 public void remove() { 842 throw new UnsupportedOperationException("remove() is unsupported"); 843 } 844 845 /** 846 * Resets this tokenizer, forgetting all parsing and iteration already completed. 847 * <p> 848 * This method allows the same tokenizer to be reused for the same String. 849 * </p> 850 * 851 * @return this, to enable chaining 852 */ 853 public StrTokenizer reset() { 854 tokenPos = 0; 855 tokens = null; 856 return this; 857 } 858 859 /** 860 * Reset this tokenizer, giving it a new input string to parse. 861 * In this manner you can re-use a tokenizer with the same settings 862 * on multiple input lines. 863 * 864 * @param input the new character array to tokenize, not cloned, null sets no text to parse 865 * @return this, to enable chaining 866 */ 867 public StrTokenizer reset(final char[] input) { 868 reset(); 869 this.chars = ArrayUtils.clone(input); 870 return this; 871 } 872 873 /** 874 * Reset this tokenizer, giving it a new input string to parse. 875 * In this manner you can re-use a tokenizer with the same settings 876 * on multiple input lines. 877 * 878 * @param input the new string to tokenize, null sets no text to parse 879 * @return this, to enable chaining 880 */ 881 public StrTokenizer reset(final String input) { 882 reset(); 883 if (input != null) { 884 this.chars = input.toCharArray(); 885 } else { 886 this.chars = null; 887 } 888 return this; 889 } 890 891 /** 892 * Unsupported ListIterator operation. 893 * @param obj this parameter ignored. 894 * @throws UnsupportedOperationException always 895 */ 896 @Override 897 public void set(final String obj) { 898 throw new UnsupportedOperationException("set() is unsupported"); 899 } 900 901 /** 902 * Sets the field delimiter character. 903 * 904 * @param delim the delimiter character to use 905 * @return this, to enable chaining 906 */ 907 public StrTokenizer setDelimiterChar(final char delim) { 908 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 909 } 910 911 /** 912 * Sets the field delimiter matcher. 913 * <p> 914 * The delimiter is used to separate one token from another. 915 * </p> 916 * 917 * @param delim the delimiter matcher to use 918 * @return this, to enable chaining 919 */ 920 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 921 if (delim == null) { 922 this.delimMatcher = StrMatcher.noneMatcher(); 923 } else { 924 this.delimMatcher = delim; 925 } 926 return this; 927 } 928 929 /** 930 * Sets the field delimiter string. 931 * 932 * @param delim the delimiter string to use 933 * @return this, to enable chaining 934 */ 935 public StrTokenizer setDelimiterString(final String delim) { 936 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 937 } 938 939 /** 940 * Sets whether the tokenizer should return empty tokens as null. 941 * The default for this property is false. 942 * 943 * @param emptyAsNull whether empty tokens are returned as null 944 * @return this, to enable chaining 945 */ 946 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 947 this.emptyAsNull = emptyAsNull; 948 return this; 949 } 950 951 /** 952 * Sets the character to ignore. 953 * <p> 954 * This character is ignored when parsing the String, unless it is 955 * within a quoted region. 956 * 957 * @param ignored the ignored character to use 958 * @return this, to enable chaining 959 */ 960 public StrTokenizer setIgnoredChar(final char ignored) { 961 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 962 } 963 964 /** 965 * Sets the matcher for characters to ignore. 966 * <p> 967 * These characters are ignored when parsing the String, unless they are 968 * within a quoted region. 969 * </p> 970 * 971 * @param ignored the ignored matcher to use, null ignored 972 * @return this, to enable chaining 973 */ 974 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 975 if (ignored != null) { 976 this.ignoredMatcher = ignored; 977 } 978 return this; 979 } 980 981 /** 982 * Sets whether the tokenizer should ignore and not return empty tokens. 983 * The default for this property is true. 984 * 985 * @param ignoreEmptyTokens whether empty tokens are not returned 986 * @return this, to enable chaining 987 */ 988 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 989 this.ignoreEmptyTokens = ignoreEmptyTokens; 990 return this; 991 } 992 993 /** 994 * Sets the quote character to use. 995 * <p> 996 * The quote character is used to wrap data between the tokens. 997 * This enables delimiters to be entered as data. 998 * </p> 999 * 1000 * @param quote the quote character to use 1001 * @return this, to enable chaining 1002 */ 1003 public StrTokenizer setQuoteChar(final char quote) { 1004 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 1005 } 1006 1007 /** 1008 * Sets the quote matcher to use. 1009 * <p> 1010 * The quote character is used to wrap data between the tokens. 1011 * This enables delimiters to be entered as data. 1012 * </p> 1013 * 1014 * @param quote the quote matcher to use, null ignored 1015 * @return this, to enable chaining 1016 */ 1017 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 1018 if (quote != null) { 1019 this.quoteMatcher = quote; 1020 } 1021 return this; 1022 } 1023 1024 /** 1025 * Sets the matcher for characters to trim. 1026 * <p> 1027 * These characters are trimmed off on each side of the delimiter 1028 * until the token or quote is found. 1029 * </p> 1030 * 1031 * @param trimmer the trimmer matcher to use, null ignored 1032 * @return this, to enable chaining 1033 */ 1034 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1035 if (trimmer != null) { 1036 this.trimmerMatcher = trimmer; 1037 } 1038 return this; 1039 } 1040 1041 // API 1042 /** 1043 * Gets the number of tokens found in the String. 1044 * 1045 * @return the number of matched tokens 1046 */ 1047 public int size() { 1048 checkTokenized(); 1049 return tokens.length; 1050 } 1051 1052 /** 1053 * Internal method to performs the tokenization. 1054 * <p> 1055 * Most users of this class do not need to call this method. This method 1056 * will be called automatically by other (public) methods when required. 1057 * </p> 1058 * <p> 1059 * This method exists to allow subclasses to add code before or after the 1060 * tokenization. For example, a subclass could alter the character array, 1061 * offset or count to be parsed, or call the tokenizer multiple times on 1062 * multiple strings. It is also be possible to filter the results. 1063 * </p> 1064 * <p> 1065 * {@link StrTokenizer} will always pass a zero offset and a count 1066 * equal to the length of the array to this method, however a subclass 1067 * may pass other values, or even an entirely different array. 1068 * </p> 1069 * 1070 * @param srcChars the character array being tokenized, may be null 1071 * @param offset the start position within the character array, must be valid 1072 * @param count the number of characters to tokenize, must be valid 1073 * @return the modifiable list of String tokens, unmodifiable if null array or zero count 1074 */ 1075 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 1076 if (ArrayUtils.isEmpty(srcChars)) { 1077 return Collections.emptyList(); 1078 } 1079 final StrBuilder buf = new StrBuilder(); 1080 final List<String> tokenList = new ArrayList<>(); 1081 int pos = offset; 1082 1083 // loop around the entire buffer 1084 while (pos >= 0 && pos < count) { 1085 // find next token 1086 pos = readNextToken(srcChars, pos, count, buf, tokenList); 1087 1088 // handle case where end of string is a delimiter 1089 if (pos >= count) { 1090 addToken(tokenList, StringUtils.EMPTY); 1091 } 1092 } 1093 return tokenList; 1094 } 1095 1096 /** 1097 * Gets the String content that the tokenizer is parsing. 1098 * 1099 * @return the string content being parsed 1100 */ 1101 @Override 1102 public String toString() { 1103 if (tokens == null) { 1104 return "StrTokenizer[not tokenized yet]"; 1105 } 1106 return "StrTokenizer" + getTokenList(); 1107 } 1108 1109 }