001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.util.ArrayList; 020import java.util.Collections; 021import java.util.List; 022import java.util.ListIterator; 023import java.util.NoSuchElementException; 024 025import org.apache.commons.lang3.StringUtils; 026 027/** 028 * Tokenizes a string based on delimiters (separators) 029 * and supporting quoting and ignored character concepts. 030 * <p> 031 * This class can split a String into many smaller strings. It aims 032 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, 033 * however it offers much more control and flexibility including implementing 034 * the {@code ListIterator} interface. By default, it is set up 035 * like {@code StringTokenizer}. 036 * <p> 037 * The input String is split into a number of <i>tokens</i>. 038 * Each token is separated from the next String by a <i>delimiter</i>. 039 * One or more delimiter characters must be specified. 040 * <p> 041 * Each token may be surrounded by quotes. 042 * The <i>quote</i> matcher specifies the quote character(s). 043 * A quote may be escaped within a quoted section by duplicating itself. 044 * <p> 045 * Between each token and the delimiter are potentially characters that need trimming. 046 * The <i>trimmer</i> matcher specifies these characters. 047 * One usage might be to trim whitespace characters. 048 * <p> 049 * At any point outside the quotes there might potentially be invalid characters. 050 * The <i>ignored</i> matcher specifies these characters to be removed. 051 * One usage might be to remove new line characters. 052 * <p> 053 * Empty tokens may be removed or returned as null. 054 * <pre> 055 * "a,b,c" - Three tokens "a","b","c" (comma delimiter) 056 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) 057 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) 058 * </pre> 059 * 060 * <table> 061 * <caption>StrTokenizer properties and options</caption> 062 * <tr> 063 * <th>Property</th><th>Type</th><th>Default</th> 064 * </tr> 065 * <tr> 066 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td> 067 * </tr> 068 * <tr> 069 * <td>quote</td><td>NoneMatcher</td><td>{}</td> 070 * </tr> 071 * <tr> 072 * <td>ignore</td><td>NoneMatcher</td><td>{}</td> 073 * </tr> 074 * <tr> 075 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td> 076 * </tr> 077 * <tr> 078 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td> 079 * </tr> 080 * </table> 081 * 082 * @since 1.0 083 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0. 084 */ 085@Deprecated 086public class StrTokenizer implements ListIterator<String>, Cloneable { 087 088 /** Comma separated values tokenizer internal variable. */ 089 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE; 090 /** Tab separated values tokenizer internal variable. */ 091 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE; 092 static { 093 CSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); 094 CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher()); 095 CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); 096 CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); 097 CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); 098 CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); 099 CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); 100 101 TSV_TOKENIZER_PROTOTYPE = new StrTokenizer(); 102 TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher()); 103 TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher()); 104 TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher()); 105 TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher()); 106 TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false); 107 TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false); 108 } 109 110 /** The text to work on. */ 111 private char[] chars; 112 /** The parsed tokens. */ 113 private String[] tokens; 114 /** The current iteration position. */ 115 private int tokenPos; 116 117 /** The delimiter matcher. */ 118 private StrMatcher delimMatcher = StrMatcher.splitMatcher(); 119 /** The quote matcher. */ 120 private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); 121 /** The ignored matcher. */ 122 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); 123 /** The trimmer matcher. */ 124 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); 125 126 /** Whether to return empty tokens as null. */ 127 private boolean emptyAsNull = false; 128 /** Whether to ignore empty tokens. */ 129 private boolean ignoreEmptyTokens = true; 130 131 //----------------------------------------------------------------------- 132 133 /** 134 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 135 * 136 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}. 137 */ 138 private static StrTokenizer getCSVClone() { 139 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone(); 140 } 141 142 /** 143 * Gets a new tokenizer instance which parses Comma Separated Value strings 144 * initializing it with the given input. The default for CSV processing 145 * will be trim whitespace from both ends (which can be overridden with 146 * the setTrimmer method). 147 * <p> 148 * You must call a "reset" method to set the string which you want to parse. 149 * @return a new tokenizer instance which parses Comma Separated Value strings 150 */ 151 public static StrTokenizer getCSVInstance() { 152 return getCSVClone(); 153 } 154 155 /** 156 * Gets a new tokenizer instance which parses Comma Separated Value strings 157 * initializing it with the given input. The default for CSV processing 158 * will be trim whitespace from both ends (which can be overridden with 159 * the setTrimmer method). 160 * 161 * @param input the text to parse 162 * @return a new tokenizer instance which parses Comma Separated Value strings 163 */ 164 public static StrTokenizer getCSVInstance(final String input) { 165 final StrTokenizer tok = getCSVClone(); 166 tok.reset(input); 167 return tok; 168 } 169 170 /** 171 * Gets a new tokenizer instance which parses Comma Separated Value strings 172 * initializing it with the given input. The default for CSV processing 173 * will be trim whitespace from both ends (which can be overridden with 174 * the setTrimmer method). 175 * 176 * @param input the text to parse 177 * @return a new tokenizer instance which parses Comma Separated Value strings 178 */ 179 public static StrTokenizer getCSVInstance(final char[] input) { 180 final StrTokenizer tok = getCSVClone(); 181 tok.reset(input); 182 return tok; 183 } 184 185 /** 186 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 187 * 188 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. 189 */ 190 private static StrTokenizer getTSVClone() { 191 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); 192 } 193 194 195 /** 196 * Gets a new tokenizer instance which parses Tab Separated Value strings. 197 * The default for CSV processing will be trim whitespace from both ends 198 * (which can be overridden with the setTrimmer method). 199 * <p> 200 * You must call a "reset" method to set the string which you want to parse. 201 * @return a new tokenizer instance which parses Tab Separated Value strings. 202 */ 203 public static StrTokenizer getTSVInstance() { 204 return getTSVClone(); 205 } 206 207 /** 208 * Gets a new tokenizer instance which parses Tab Separated Value strings. 209 * The default for CSV processing will be trim whitespace from both ends 210 * (which can be overridden with the setTrimmer method). 211 * @param input the string to parse 212 * @return a new tokenizer instance which parses Tab Separated Value strings. 213 */ 214 public static StrTokenizer getTSVInstance(final String input) { 215 final StrTokenizer tok = getTSVClone(); 216 tok.reset(input); 217 return tok; 218 } 219 220 /** 221 * Gets a new tokenizer instance which parses Tab Separated Value strings. 222 * The default for CSV processing will be trim whitespace from both ends 223 * (which can be overridden with the setTrimmer method). 224 * @param input the string to parse 225 * @return a new tokenizer instance which parses Tab Separated Value strings. 226 */ 227 public static StrTokenizer getTSVInstance(final char[] input) { 228 final StrTokenizer tok = getTSVClone(); 229 tok.reset(input); 230 return tok; 231 } 232 233 //----------------------------------------------------------------------- 234 /** 235 * Constructs a tokenizer splitting on space, tab, newline and form feed 236 * as per StringTokenizer, but with no text to tokenize. 237 * <p> 238 * This constructor is normally used with {@link #reset(String)}. 239 */ 240 public StrTokenizer() { 241 super(); 242 this.chars = null; 243 } 244 245 /** 246 * Constructs a tokenizer splitting on space, tab, newline and form feed 247 * as per StringTokenizer. 248 * 249 * @param input the string which is to be parsed 250 */ 251 public StrTokenizer(final String input) { 252 super(); 253 if (input != null) { 254 chars = input.toCharArray(); 255 } else { 256 chars = null; 257 } 258 } 259 260 /** 261 * Constructs a tokenizer splitting on the specified delimiter character. 262 * 263 * @param input the string which is to be parsed 264 * @param delim the field delimiter character 265 */ 266 public StrTokenizer(final String input, final char delim) { 267 this(input); 268 setDelimiterChar(delim); 269 } 270 271 /** 272 * Constructs a tokenizer splitting on the specified delimiter string. 273 * 274 * @param input the string which is to be parsed 275 * @param delim the field delimiter string 276 */ 277 public StrTokenizer(final String input, final String delim) { 278 this(input); 279 setDelimiterString(delim); 280 } 281 282 /** 283 * Constructs a tokenizer splitting using the specified delimiter matcher. 284 * 285 * @param input the string which is to be parsed 286 * @param delim the field delimiter matcher 287 */ 288 public StrTokenizer(final String input, final StrMatcher delim) { 289 this(input); 290 setDelimiterMatcher(delim); 291 } 292 293 /** 294 * Constructs a tokenizer splitting on the specified delimiter character 295 * and handling quotes using the specified quote character. 296 * 297 * @param input the string which is to be parsed 298 * @param delim the field delimiter character 299 * @param quote the field quoted string character 300 */ 301 public StrTokenizer(final String input, final char delim, final char quote) { 302 this(input, delim); 303 setQuoteChar(quote); 304 } 305 306 /** 307 * Constructs a tokenizer splitting using the specified delimiter matcher 308 * and handling quotes using the specified quote matcher. 309 * 310 * @param input the string which is to be parsed 311 * @param delim the field delimiter matcher 312 * @param quote the field quoted string matcher 313 */ 314 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { 315 this(input, delim); 316 setQuoteMatcher(quote); 317 } 318 319 /** 320 * Constructs a tokenizer splitting on space, tab, newline and form feed 321 * as per StringTokenizer. 322 * 323 * @param input the string which is to be parsed, not cloned 324 */ 325 public StrTokenizer(final char[] input) { 326 super(); 327 if (input == null) { 328 this.chars = null; 329 } else { 330 this.chars = input.clone(); 331 } 332 } 333 334 /** 335 * Constructs a tokenizer splitting on the specified character. 336 * 337 * @param input the string which is to be parsed, not cloned 338 * @param delim the field delimiter character 339 */ 340 public StrTokenizer(final char[] input, final char delim) { 341 this(input); 342 setDelimiterChar(delim); 343 } 344 345 /** 346 * Constructs a tokenizer splitting on the specified string. 347 * 348 * @param input the string which is to be parsed, not cloned 349 * @param delim the field delimiter string 350 */ 351 public StrTokenizer(final char[] input, final String delim) { 352 this(input); 353 setDelimiterString(delim); 354 } 355 356 /** 357 * Constructs a tokenizer splitting using the specified delimiter matcher. 358 * 359 * @param input the string which is to be parsed, not cloned 360 * @param delim the field delimiter matcher 361 */ 362 public StrTokenizer(final char[] input, final StrMatcher delim) { 363 this(input); 364 setDelimiterMatcher(delim); 365 } 366 367 /** 368 * Constructs a tokenizer splitting on the specified delimiter character 369 * and handling quotes using the specified quote character. 370 * 371 * @param input the string which is to be parsed, not cloned 372 * @param delim the field delimiter character 373 * @param quote the field quoted string character 374 */ 375 public StrTokenizer(final char[] input, final char delim, final char quote) { 376 this(input, delim); 377 setQuoteChar(quote); 378 } 379 380 /** 381 * Constructs a tokenizer splitting using the specified delimiter matcher 382 * and handling quotes using the specified quote matcher. 383 * 384 * @param input the string which is to be parsed, not cloned 385 * @param delim the field delimiter character 386 * @param quote the field quoted string character 387 */ 388 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { 389 this(input, delim); 390 setQuoteMatcher(quote); 391 } 392 393 // API 394 //----------------------------------------------------------------------- 395 /** 396 * Gets the number of tokens found in the String. 397 * 398 * @return The number of matched tokens 399 */ 400 public int size() { 401 checkTokenized(); 402 return tokens.length; 403 } 404 405 /** 406 * Gets the next token from the String. 407 * Equivalent to {@link #next()} except it returns null rather than 408 * throwing {@link NoSuchElementException} when no tokens remain. 409 * 410 * @return The next sequential token, or null when no more tokens are found 411 */ 412 public String nextToken() { 413 if (hasNext()) { 414 return tokens[tokenPos++]; 415 } 416 return null; 417 } 418 419 /** 420 * Gets the previous token from the String. 421 * 422 * @return The previous sequential token, or null when no more tokens are found 423 */ 424 public String previousToken() { 425 if (hasPrevious()) { 426 return tokens[--tokenPos]; 427 } 428 return null; 429 } 430 431 /** 432 * Gets a copy of the full token list as an independent modifiable array. 433 * 434 * @return The tokens as a String array 435 */ 436 public String[] getTokenArray() { 437 checkTokenized(); 438 return tokens.clone(); 439 } 440 441 /** 442 * Gets a copy of the full token list as an independent modifiable list. 443 * 444 * @return The tokens as a String array 445 */ 446 public List<String> getTokenList() { 447 checkTokenized(); 448 final List<String> list = new ArrayList<>(tokens.length); 449 Collections.addAll(list, tokens); 450 451 return list; 452 } 453 454 /** 455 * Resets this tokenizer, forgetting all parsing and iteration already completed. 456 * <p> 457 * This method allows the same tokenizer to be reused for the same String. 458 * 459 * @return this, to enable chaining 460 */ 461 public StrTokenizer reset() { 462 tokenPos = 0; 463 tokens = null; 464 return this; 465 } 466 467 /** 468 * Reset this tokenizer, giving it a new input string to parse. 469 * In this manner you can re-use a tokenizer with the same settings 470 * on multiple input lines. 471 * 472 * @param input the new string to tokenize, null sets no text to parse 473 * @return this, to enable chaining 474 */ 475 public StrTokenizer reset(final String input) { 476 reset(); 477 if (input != null) { 478 this.chars = input.toCharArray(); 479 } else { 480 this.chars = null; 481 } 482 return this; 483 } 484 485 /** 486 * Reset this tokenizer, giving it a new input string to parse. 487 * In this manner you can re-use a tokenizer with the same settings 488 * on multiple input lines. 489 * 490 * @param input the new character array to tokenize, not cloned, null sets no text to parse 491 * @return this, to enable chaining 492 */ 493 public StrTokenizer reset(final char[] input) { 494 reset(); 495 if (input != null) { 496 this.chars = input.clone(); 497 } else { 498 this.chars = null; 499 } 500 return this; 501 } 502 503 // ListIterator 504 //----------------------------------------------------------------------- 505 /** 506 * Checks whether there are any more tokens. 507 * 508 * @return true if there are more tokens 509 */ 510 @Override 511 public boolean hasNext() { 512 checkTokenized(); 513 return tokenPos < tokens.length; 514 } 515 516 /** 517 * Gets the next token. 518 * 519 * @return The next String token 520 * @throws NoSuchElementException if there are no more elements 521 */ 522 @Override 523 public String next() { 524 if (hasNext()) { 525 return tokens[tokenPos++]; 526 } 527 throw new NoSuchElementException(); 528 } 529 530 /** 531 * Gets the index of the next token to return. 532 * 533 * @return The next token index 534 */ 535 @Override 536 public int nextIndex() { 537 return tokenPos; 538 } 539 540 /** 541 * Checks whether there are any previous tokens that can be iterated to. 542 * 543 * @return true if there are previous tokens 544 */ 545 @Override 546 public boolean hasPrevious() { 547 checkTokenized(); 548 return tokenPos > 0; 549 } 550 551 /** 552 * Gets the token previous to the last returned token. 553 * 554 * @return The previous token 555 */ 556 @Override 557 public String previous() { 558 if (hasPrevious()) { 559 return tokens[--tokenPos]; 560 } 561 throw new NoSuchElementException(); 562 } 563 564 /** 565 * Gets the index of the previous token. 566 * 567 * @return The previous token index 568 */ 569 @Override 570 public int previousIndex() { 571 return tokenPos - 1; 572 } 573 574 /** 575 * Unsupported ListIterator operation. 576 * 577 * @throws UnsupportedOperationException always 578 */ 579 @Override 580 public void remove() { 581 throw new UnsupportedOperationException("remove() is unsupported"); 582 } 583 584 /** 585 * Unsupported ListIterator operation. 586 * @param obj this parameter ignored. 587 * @throws UnsupportedOperationException always 588 */ 589 @Override 590 public void set(final String obj) { 591 throw new UnsupportedOperationException("set() is unsupported"); 592 } 593 594 /** 595 * Unsupported ListIterator operation. 596 * @param obj this parameter ignored. 597 * @throws UnsupportedOperationException always 598 */ 599 @Override 600 public void add(final String obj) { 601 throw new UnsupportedOperationException("add() is unsupported"); 602 } 603 604 // Implementation 605 //----------------------------------------------------------------------- 606 /** 607 * Checks if tokenization has been done, and if not then do it. 608 */ 609 private void checkTokenized() { 610 if (tokens == null) { 611 if (chars == null) { 612 // still call tokenize as subclass may do some work 613 final List<String> split = tokenize(null, 0, 0); 614 tokens = split.toArray(new String[split.size()]); 615 } else { 616 final List<String> split = tokenize(chars, 0, chars.length); 617 tokens = split.toArray(new String[split.size()]); 618 } 619 } 620 } 621 622 /** 623 * Internal method to performs the tokenization. 624 * <p> 625 * Most users of this class do not need to call this method. This method 626 * will be called automatically by other (public) methods when required. 627 * <p> 628 * This method exists to allow subclasses to add code before or after the 629 * tokenization. For example, a subclass could alter the character array, 630 * offset or count to be parsed, or call the tokenizer multiple times on 631 * multiple strings. It is also be possible to filter the results. 632 * <p> 633 * {@code StrTokenizer} will always pass a zero offset and a count 634 * equal to the length of the array to this method, however a subclass 635 * may pass other values, or even an entirely different array. 636 * 637 * @param srcChars the character array being tokenized, may be null 638 * @param offset the start position within the character array, must be valid 639 * @param count the number of characters to tokenize, must be valid 640 * @return The modifiable list of String tokens, unmodifiable if null array or zero count 641 */ 642 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) { 643 if (srcChars == null || count == 0) { 644 return Collections.emptyList(); 645 } 646 final StrBuilder buf = new StrBuilder(); 647 final List<String> tokenList = new ArrayList<>(); 648 int pos = offset; 649 650 // loop around the entire buffer 651 while (pos >= 0 && pos < count) { 652 // find next token 653 pos = readNextToken(srcChars, pos, count, buf, tokenList); 654 655 // handle case where end of string is a delimiter 656 if (pos >= count) { 657 addToken(tokenList, StringUtils.EMPTY); 658 } 659 } 660 return tokenList; 661 } 662 663 /** 664 * Adds a token to a list, paying attention to the parameters we've set. 665 * 666 * @param list the list to add to 667 * @param tok the token to add 668 */ 669 private void addToken(final List<String> list, String tok) { 670 if (tok == null || tok.length() == 0) { 671 if (isIgnoreEmptyTokens()) { 672 return; 673 } 674 if (isEmptyTokenAsNull()) { 675 tok = null; 676 } 677 } 678 list.add(tok); 679 } 680 681 /** 682 * Reads character by character through the String to get the next token. 683 * 684 * @param srcChars the character array being tokenized 685 * @param start the first character of field 686 * @param len the length of the character array being tokenized 687 * @param workArea a temporary work area 688 * @param tokenList the list of parsed tokens 689 * @return The starting position of the next field (the character 690 * immediately after the delimiter), or -1 if end of string found 691 */ 692 private int readNextToken(final char[] srcChars, 693 int start, 694 final int len, 695 final StrBuilder workArea, 696 final List<String> tokenList) { 697 // skip all leading whitespace, unless it is the 698 // field delimiter or the quote character 699 while (start < len) { 700 final int removeLen = Math.max( 701 getIgnoredMatcher().isMatch(srcChars, start, start, len), 702 getTrimmerMatcher().isMatch(srcChars, start, start, len)); 703 if (removeLen == 0 704 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 705 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) { 706 break; 707 } 708 start += removeLen; 709 } 710 711 // handle reaching end 712 if (start >= len) { 713 addToken(tokenList, StringUtils.EMPTY); 714 return -1; 715 } 716 717 // handle empty token 718 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len); 719 if (delimLen > 0) { 720 addToken(tokenList, StringUtils.EMPTY); 721 return start + delimLen; 722 } 723 724 // handle found token 725 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len); 726 if (quoteLen > 0) { 727 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen); 728 } 729 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0); 730 } 731 732 /** 733 * Reads a possibly quoted string token. 734 * 735 * @param srcChars the character array being tokenized 736 * @param start the first character of field 737 * @param len the length of the character array being tokenized 738 * @param workArea a temporary work area 739 * @param tokenList the list of parsed tokens 740 * @param quoteStart the start position of the matched quote, 0 if no quoting 741 * @param quoteLen the length of the matched quote, 0 if no quoting 742 * @return The starting position of the next field (the character 743 * immediately after the delimiter, or if end of string found, 744 * then the length of string 745 */ 746 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea, 747 final List<String> tokenList, final int quoteStart, final int quoteLen) { 748 // Loop until we've found the end of the quoted 749 // string or the end of the input 750 workArea.clear(); 751 int pos = start; 752 boolean quoting = quoteLen > 0; 753 int trimStart = 0; 754 755 while (pos < len) { 756 // quoting mode can occur several times throughout a string 757 // we must switch between quoting and non-quoting until we 758 // encounter a non-quoted delimiter, or end of string 759 if (quoting) { 760 // In quoting mode 761 762 // If we've found a quote character, see if it's 763 // followed by a second quote. If so, then we need 764 // to actually put the quote character into the token 765 // rather than end the token. 766 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 767 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) { 768 // matched pair of quotes, thus an escaped quote 769 workArea.append(srcChars, pos, quoteLen); 770 pos += quoteLen * 2; 771 trimStart = workArea.size(); 772 continue; 773 } 774 775 // end of quoting 776 quoting = false; 777 pos += quoteLen; 778 continue; 779 } 780 781 // copy regular character from inside quotes 782 workArea.append(srcChars[pos++]); 783 trimStart = workArea.size(); 784 785 } else { 786 // Not in quoting mode 787 788 // check for delimiter, and thus end of token 789 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len); 790 if (delimLen > 0) { 791 // return condition when end of token found 792 addToken(tokenList, workArea.substring(0, trimStart)); 793 return pos + delimLen; 794 } 795 796 // check for quote, and thus back into quoting mode 797 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) { 798 quoting = true; 799 pos += quoteLen; 800 continue; 801 } 802 803 // check for ignored (outside quotes), and ignore 804 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len); 805 if (ignoredLen > 0) { 806 pos += ignoredLen; 807 continue; 808 } 809 810 // check for trimmed character 811 // don't yet know if its at the end, so copy to workArea 812 // use trimStart to keep track of trim at the end 813 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len); 814 if (trimmedLen > 0) { 815 workArea.append(srcChars, pos, trimmedLen); 816 pos += trimmedLen; 817 continue; 818 } 819 820 // copy regular character from outside quotes 821 workArea.append(srcChars[pos++]); 822 trimStart = workArea.size(); 823 } 824 } 825 826 // return condition when end of string found 827 addToken(tokenList, workArea.substring(0, trimStart)); 828 return -1; 829 } 830 831 /** 832 * Checks if the characters at the index specified match the quote 833 * already matched in readNextToken(). 834 * 835 * @param srcChars the character array being tokenized 836 * @param pos the position to check for a quote 837 * @param len the length of the character array being tokenized 838 * @param quoteStart the start position of the matched quote, 0 if no quoting 839 * @param quoteLen the length of the matched quote, 0 if no quoting 840 * @return true if a quote is matched 841 */ 842 private boolean isQuote(final char[] srcChars, 843 final int pos, 844 final int len, 845 final int quoteStart, 846 final int quoteLen) { 847 for (int i = 0; i < quoteLen; i++) { 848 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { 849 return false; 850 } 851 } 852 return true; 853 } 854 855 // Delimiter 856 //----------------------------------------------------------------------- 857 /** 858 * Gets the field delimiter matcher. 859 * 860 * @return The delimiter matcher in use 861 */ 862 public StrMatcher getDelimiterMatcher() { 863 return this.delimMatcher; 864 } 865 866 /** 867 * Sets the field delimiter matcher. 868 * <p> 869 * The delimiter is used to separate one token from another. 870 * 871 * @param delim the delimiter matcher to use 872 * @return this, to enable chaining 873 */ 874 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) { 875 if (delim == null) { 876 this.delimMatcher = StrMatcher.noneMatcher(); 877 } else { 878 this.delimMatcher = delim; 879 } 880 return this; 881 } 882 883 /** 884 * Sets the field delimiter character. 885 * 886 * @param delim the delimiter character to use 887 * @return this, to enable chaining 888 */ 889 public StrTokenizer setDelimiterChar(final char delim) { 890 return setDelimiterMatcher(StrMatcher.charMatcher(delim)); 891 } 892 893 /** 894 * Sets the field delimiter string. 895 * 896 * @param delim the delimiter string to use 897 * @return this, to enable chaining 898 */ 899 public StrTokenizer setDelimiterString(final String delim) { 900 return setDelimiterMatcher(StrMatcher.stringMatcher(delim)); 901 } 902 903 // Quote 904 //----------------------------------------------------------------------- 905 /** 906 * Gets the quote matcher currently in use. 907 * <p> 908 * The quote character is used to wrap data between the tokens. 909 * This enables delimiters to be entered as data. 910 * The default value is '"' (double quote). 911 * 912 * @return The quote matcher in use 913 */ 914 public StrMatcher getQuoteMatcher() { 915 return quoteMatcher; 916 } 917 918 /** 919 * Set the quote matcher to use. 920 * <p> 921 * The quote character is used to wrap data between the tokens. 922 * This enables delimiters to be entered as data. 923 * 924 * @param quote the quote matcher to use, null ignored 925 * @return this, to enable chaining 926 */ 927 public StrTokenizer setQuoteMatcher(final StrMatcher quote) { 928 if (quote != null) { 929 this.quoteMatcher = quote; 930 } 931 return this; 932 } 933 934 /** 935 * Sets the quote character to use. 936 * <p> 937 * The quote character is used to wrap data between the tokens. 938 * This enables delimiters to be entered as data. 939 * 940 * @param quote the quote character to use 941 * @return this, to enable chaining 942 */ 943 public StrTokenizer setQuoteChar(final char quote) { 944 return setQuoteMatcher(StrMatcher.charMatcher(quote)); 945 } 946 947 // Ignored 948 //----------------------------------------------------------------------- 949 /** 950 * Gets the ignored character matcher. 951 * <p> 952 * These characters are ignored when parsing the String, unless they are 953 * within a quoted region. 954 * The default value is not to ignore anything. 955 * 956 * @return The ignored matcher in use 957 */ 958 public StrMatcher getIgnoredMatcher() { 959 return ignoredMatcher; 960 } 961 962 /** 963 * Set the matcher for characters to ignore. 964 * <p> 965 * These characters are ignored when parsing the String, unless they are 966 * within a quoted region. 967 * 968 * @param ignored the ignored matcher to use, null ignored 969 * @return this, to enable chaining 970 */ 971 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) { 972 if (ignored != null) { 973 this.ignoredMatcher = ignored; 974 } 975 return this; 976 } 977 978 /** 979 * Set the character to ignore. 980 * <p> 981 * This character is ignored when parsing the String, unless it is 982 * within a quoted region. 983 * 984 * @param ignored the ignored character to use 985 * @return this, to enable chaining 986 */ 987 public StrTokenizer setIgnoredChar(final char ignored) { 988 return setIgnoredMatcher(StrMatcher.charMatcher(ignored)); 989 } 990 991 // Trimmer 992 //----------------------------------------------------------------------- 993 /** 994 * Gets the trimmer character matcher. 995 * <p> 996 * These characters are trimmed off on each side of the delimiter 997 * until the token or quote is found. 998 * The default value is not to trim anything. 999 * 1000 * @return The trimmer matcher in use 1001 */ 1002 public StrMatcher getTrimmerMatcher() { 1003 return trimmerMatcher; 1004 } 1005 1006 /** 1007 * Sets the matcher for characters to trim. 1008 * <p> 1009 * These characters are trimmed off on each side of the delimiter 1010 * until the token or quote is found. 1011 * 1012 * @param trimmer the trimmer matcher to use, null ignored 1013 * @return this, to enable chaining 1014 */ 1015 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) { 1016 if (trimmer != null) { 1017 this.trimmerMatcher = trimmer; 1018 } 1019 return this; 1020 } 1021 1022 //----------------------------------------------------------------------- 1023 /** 1024 * Gets whether the tokenizer currently returns empty tokens as null. 1025 * The default for this property is false. 1026 * 1027 * @return true if empty tokens are returned as null 1028 */ 1029 public boolean isEmptyTokenAsNull() { 1030 return this.emptyAsNull; 1031 } 1032 1033 /** 1034 * Sets whether the tokenizer should return empty tokens as null. 1035 * The default for this property is false. 1036 * 1037 * @param emptyAsNull whether empty tokens are returned as null 1038 * @return this, to enable chaining 1039 */ 1040 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) { 1041 this.emptyAsNull = emptyAsNull; 1042 return this; 1043 } 1044 1045 //----------------------------------------------------------------------- 1046 /** 1047 * Gets whether the tokenizer currently ignores empty tokens. 1048 * The default for this property is true. 1049 * 1050 * @return true if empty tokens are not returned 1051 */ 1052 public boolean isIgnoreEmptyTokens() { 1053 return ignoreEmptyTokens; 1054 } 1055 1056 /** 1057 * Sets whether the tokenizer should ignore and not return empty tokens. 1058 * The default for this property is true. 1059 * 1060 * @param ignoreEmptyTokens whether empty tokens are not returned 1061 * @return this, to enable chaining 1062 */ 1063 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) { 1064 this.ignoreEmptyTokens = ignoreEmptyTokens; 1065 return this; 1066 } 1067 1068 //----------------------------------------------------------------------- 1069 /** 1070 * Gets the String content that the tokenizer is parsing. 1071 * 1072 * @return The string content being parsed 1073 */ 1074 public String getContent() { 1075 if (chars == null) { 1076 return null; 1077 } 1078 return new String(chars); 1079 } 1080 1081 //----------------------------------------------------------------------- 1082 /** 1083 * Creates a new instance of this Tokenizer. The new instance is reset so 1084 * that it will be at the start of the token list. 1085 * If a {@link CloneNotSupportedException} is caught, return {@code null}. 1086 * 1087 * @return a new instance of this Tokenizer which has been reset. 1088 */ 1089 @Override 1090 public Object clone() { 1091 try { 1092 return cloneReset(); 1093 } catch (final CloneNotSupportedException ex) { 1094 return null; 1095 } 1096 } 1097 1098 /** 1099 * Creates a new instance of this Tokenizer. The new instance is reset so that 1100 * it will be at the start of the token list. 1101 * 1102 * @return a new instance of this Tokenizer which has been reset. 1103 * @throws CloneNotSupportedException if there is a problem cloning 1104 */ 1105 Object cloneReset() throws CloneNotSupportedException { 1106 // this method exists to enable 100% test coverage 1107 final StrTokenizer cloned = (StrTokenizer) super.clone(); 1108 if (cloned.chars != null) { 1109 cloned.chars = cloned.chars.clone(); 1110 } 1111 cloned.reset(); 1112 return cloned; 1113 } 1114 1115 //----------------------------------------------------------------------- 1116 /** 1117 * Gets the String content that the tokenizer is parsing. 1118 * 1119 * @return The string content being parsed 1120 */ 1121 @Override 1122 public String toString() { 1123 if (tokens == null) { 1124 return "StrTokenizer[not tokenized yet]"; 1125 } 1126 return "StrTokenizer" + getTokenList(); 1127 } 1128 1129}