/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @version $Id: StrTokenizer.java 1088899 2011-04-05 05:31:27Z bayard $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    /** Shared immutable-by-convention prototype for CSV parsing; always cloned before use. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Shared immutable-by-convention prototype for TSV parsing; always cloned before use. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input.  The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input.  The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed; the array is cloned,
     *  so later changes to it do not affect this tokenizer
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed; the array is cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        List<String> list = new ArrayList<String>(tokens.length);
        Collections.addAll(list, tokens);
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize; the array is cloned,
     *  null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     */
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then do it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param chars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.emptyList();
        }
        StrBuilder buf = new StrBuilder();
        List<String> tokens = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(chars, pos, count, buf, tokens);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokens, "");
            }
        }
        return tokens;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        // trimStart tracks the size of the token excluding any trailing
        // trimmable characters that have been speculatively copied
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default value is '"' (double quote).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}