/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated as of 3.6, use commons-text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StrTokenizer.html">
 * StrTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }
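    /*
     * A minimal, illustrative usage sketch (not part of the original source;
     * the input strings below are hypothetical):
     *
     *   StrTokenizer basic = new StrTokenizer("one two\tthree");   // default: splits on whitespace
     *   basic.getTokenList();                                      // ["one", "two", "three"]
     *
     *   StrTokenizer csv = StrTokenizer.getCSVInstance("a, \"b,c\", d");
     *   csv.getTokenArray();                                       // ["a", "b,c", "d"]
     */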

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }


    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings,
     * initializing it with the given input. The default for TSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings,
     * initializing it with the given input. The default for TSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }
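    /*
     * Illustrative sketch of the factory methods above (not part of the
     * original source; the input string is hypothetical):
     *
     *   StrTokenizer csv = StrTokenizer.getCSVInstance();  // no input yet
     *   csv.reset("x, y, z");                              // supply the line to parse
     *   csv.getTokenArray();                               // ["x", "y", "z"]
     */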

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
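    /*
     * Illustrative sketch of the quoting constructors (not part of the
     * original source; the input string is hypothetical):
     *
     *   StrTokenizer tok = new StrTokenizer("one:\"two:three\":four", ':', '"');
     *   tok.getTokenList();   // ["one", "two:three", "four"] - the quoted delimiter is kept as data
     */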

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed; the array is cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the string which is to be parsed; the array is cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the string which is to be parsed; the array is cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed; the array is cloned
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed; the array is cloned
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed; the array is cloned
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }
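    /*
     * Illustrative sketch of reusing one configured tokenizer across many
     * lines (not part of the original source; "lines" is a hypothetical
     * collection of Strings):
     *
     *   StrTokenizer tok = new StrTokenizer("", ';');
     *   for (String line : lines) {
     *       tok.reset(line);
     *       List<String> fields = tok.getTokenList();
     *       // process fields...
     *   }
     */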

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }
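    /*
     * Illustrative sketch of bidirectional iteration via the ListIterator
     * methods (not part of the original source; the input string is
     * hypothetical):
     *
     *   StrTokenizer tok = new StrTokenizer("a b c");
     *   tok.next();       // "a"
     *   tok.next();       // "b"
     *   tok.previous();   // "b" - steps the cursor back to the last returned token
     *   tok.nextIndex();  // 1
     */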

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }
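    /*
     * Illustrative sketch of overriding tokenize() in a subclass to filter
     * the results, as the javadoc above suggests (not part of the original
     * source; upper-casing is an arbitrary example transformation):
     *
     *   StrTokenizer upper = new StrTokenizer("a,b", ',') {
     *       @Override
     *       protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
     *           final List<String> result = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *           result.replaceAll(String::toUpperCase);
     *           return result;
     *       }
     *   };
     *   upper.getTokenList();   // ["A", "B"]
     */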

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }
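    /*
     * Illustrative sketch of the quote handling implemented below (not part
     * of the original source; the input string is hypothetical). A doubled
     * quote inside a quoted section is kept as a literal quote character:
     *
     *   StrTokenizer tok = StrTokenizer.getCSVInstance("a,\"b\"\"c\",d");
     *   tok.getTokenList();   // ["a", "b\"c", "d"]
     */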

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or if end of string found,
     *  then the length of string
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }
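    /*
     * Illustrative sketch of the fluent configuration style (not part of the
     * original source; the input string is hypothetical). Each setter
     * returns this, so settings can be chained:
     *
     *   StrTokenizer tok = new StrTokenizer("a=1; b=2")
     *           .setDelimiterChar(';')
     *           .setTrimmerMatcher(StrMatcher.trimMatcher());
     *   tok.getTokenList();   // ["a=1", "b=2"]
     */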

    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quotes (the none matcher); the CSV and TSV
     * instances use '"' (double quote).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }
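    /*
     * Illustrative sketch of the ignored-character setting (not part of the
     * original source; the input string is hypothetical). Here embedded
     * newlines are dropped before tokens are formed:
     *
     *   StrTokenizer tok = new StrTokenizer("a,b\n,c", ',');
     *   tok.setIgnoredChar('\n');
     *   tok.getTokenList();   // ["a", "b", "c"]
     */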

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }
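    /*
     * Illustrative sketch of the empty-token options above (not part of the
     * original source; the input string is hypothetical):
     *
     *   StrTokenizer tok = new StrTokenizer("a,,b", ',');
     *   tok.getTokenList();               // ["a", "b"]        (empty tokens ignored by default)
     *
     *   tok.reset("a,,b").setIgnoreEmptyTokens(false);
     *   tok.getTokenList();               // ["a", "", "b"]
     *
     *   tok.reset("a,,b").setEmptyTokenAsNull(true);
     *   tok.getTokenList();               // ["a", null, "b"]  (ignoreEmptyTokens is still false)
     */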

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of the tokenizer, showing the parsed token list.
     *
     * @return the string representation of the parsed tokens
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}