/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators), supporting quoting
 * and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
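 * For example, a sketch of how the empty-token options interact
 * (results shown as comments):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,,c", ',');
 * tok.getTokenList();                  // ["a", "c"]       (empty tokens ignored by default)
 * tok.setIgnoreEmptyTokens(false);
 * tok.reset("a,,c").getTokenList();    // ["a", "", "c"]
 * tok.setEmptyTokenAsNull(true);
 * tok.reset("a,,c").getTokenList();    // ["a", null, "c"]
 * </pre>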
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 * <tr>
 *  <th>Property</th><th>Type</th><th>Default</th>
 * </tr>
 * <tr>
 *  <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 *  <td>quote</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 *  <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 *  <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 * </tr>
 * <tr>
 *  <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 * </tr>
 * </table>
 *
 * @since 2.2
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
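     * <p>
     * For example (a minimal usage sketch):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, \"b,c\", d");
     * tok.getTokenList();   // ["a", "b,c", "d"]
     * </pre>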
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
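     * <p>
     * For example (a brief usage sketch):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two\tthree");
     * tok.getTokenList();   // ["one", "two", "three"]
     * </pre>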
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
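     * <p>
     * For example (a brief sketch):
     * <pre>
     * new StrTokenizer("a b;c".toCharArray(), StrMatcher.charMatcher(';')).getTokenList();   // ["a b", "c"]
     * </pre>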
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when there are no previous tokens
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<String>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
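     * <p>
     * For example (a sketch of per-line re-use; {@code lines} and {@code process}
     * stand in for caller-supplied code):
     * <pre>
     * StrTokenizer tok = new StrTokenizer().setDelimiterChar(',');
     * for (String line : lines) {
     *     process(tok.reset(line).getTokenList());
     * }
     * </pre>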
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize (the array is cloned), null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then do it.
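     * <p>
     * Tokenization is performed lazily: the public accessors (such as {@link #size()},
     * {@link #hasNext()} and {@link #getTokenList()}) call this method before reading
     * the parsed tokens.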
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
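     * <p>
     * Leading characters matched by the ignored or trimmer matcher are skipped first
     * (unless the same position also matches the delimiter or quote matcher); a delimiter
     * match then produces an empty token, a quote match starts a quoted token, and
     * otherwise an unquoted token is read.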
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading ignored/trimmed characters, unless they
        // match the field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
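                // For example, with double-quote quoting the input "a""b"
                // is read as the single token: a"b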
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use; null is treated as no delimiter (the none matcher)
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
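     * <p>
     * For example (a brief sketch):
     * <pre>
     * new StrTokenizer("a;b;c").setDelimiterChar(';').getTokenList();   // ["a", "b", "c"]
     * </pre>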
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
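     * <p>
     * For example (a sketch using the whitespace trim matcher):
     * <pre>
     * new StrTokenizer(" a , b ", ',').setTrimmerMatcher(StrMatcher.trimMatcher()).getTokenList();   // ["a", "b"]
     * </pre>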
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including the parsed
     * tokens if tokenization has already occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}