/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"          - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "     - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c"  - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated as of 3.6, use commons-text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
 * StringTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

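    // Illustrative sketch: one way the CSV factory methods below might be used,
    // given the prototype configuration above (comma delimiter, double-quote
    // quoting, whitespace trimming, empty tokens kept as ""):
    //
    //     StrTokenizer tok = StrTokenizer.getCSVInstance("a, \"b,c\" , d");
    //     List<String> tokens = tok.getTokenList();   // ["a", "b,c", "d"]
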
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

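    // Illustrative sketch: the no-argument factories above return a pre-configured
    // tokenizer with no input, so a reset(...) call supplies the text to parse:
    //
    //     StrTokenizer tok = StrTokenizer.getTSVInstance();
    //     tok.reset("one\ttwo\tthree");
    //     String[] fields = tok.getTokenArray();      // ["one", "two", "three"]
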
    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed, cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

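    // Illustrative sketch: with the delimiter/quote constructors, delimiters may
    // appear inside quoted sections, and a quote escapes itself by doubling:
    //
    //     StrTokenizer tok = new StrTokenizer("one;'two;2''s';three", ';', '\'');
    //     // tokens: "one", "two;2's", "three"
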
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed, cloned
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        list.addAll(Arrays.asList(tokens));
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

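    // Illustrative sketch: reset(String) below lets one configured tokenizer be
    // reused line by line ('lines' is a hypothetical Iterable<String>):
    //
    //     StrTokenizer tok = new StrTokenizer().setDelimiterChar(':');
    //     for (String line : lines) {
    //         List<String> fields = tok.reset(line).getTokenList();
    //         // process fields ...
    //     }
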
    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

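    // Illustrative sketch: the ListIterator methods above allow bidirectional
    // iteration, while the mutating operations always throw
    // UnsupportedOperationException:
    //
    //     StrTokenizer tok = new StrTokenizer("a b c");
    //     tok.next();          // "a"
    //     tok.next();          // "b"
    //     tok.previous();      // "b" again, moving the cursor back
    //     tok.nextIndex();     // 1
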
    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

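    // Illustrative sketch: the protected tokenize() hook above can be overridden
    // to filter or transform the parsed tokens, for example in a hypothetical
    // subclass that upper-cases every token:
    //
    //     class UpperCaseTokenizer extends StrTokenizer {
    //         UpperCaseTokenizer(final String input) {
    //             super(input);
    //         }
    //         @Override
    //         protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
    //             final List<String> result = new ArrayList<>();
    //             for (final String token : super.tokenize(srcChars, offset, count)) {
    //                 result.add(token == null ? null : token.toUpperCase());
    //             }
    //             return result;
    //         }
    //     }
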
    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string was found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

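    // Illustrative sketch: a matcher-based delimiter allows several delimiter
    // characters at once, e.g. splitting on either comma or semicolon:
    //
    //     StrTokenizer tok = new StrTokenizer("a,b;c")
    //             .setDelimiterMatcher(StrMatcher.charSetMatcher(",;"));
    //     // tokens: "a", "b", "c"
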
    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character (none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

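    // Illustrative sketch: ignored characters are dropped outside quoted sections,
    // so stray carriage returns can be stripped while tokenizing:
    //
    //     StrTokenizer tok = new StrTokenizer("a,b\r,c", ',').setIgnoredChar('\r');
    //     // tokens: "a", "b", "c"
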
    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

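    // Illustrative sketch: combining the two empty-token properties above changes
    // how "a,,c" is reported:
    //
    //     new StrTokenizer("a,,c", ',').getTokenList();
    //     // default settings: ["a", "c"]        (empty tokens ignored)
    //
    //     new StrTokenizer("a,,c", ',')
    //             .setIgnoreEmptyTokens(false)
    //             .setEmptyTokenAsNull(true)
    //             .getTokenList();
    //     // ["a", null, "c"]                    (empty token returned as null)
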
    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a debug string representation of the tokenizer, including the parsed
     * token list once tokenization has occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}