/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>trimmer</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @author Apache Software Foundation
 * @author Matthew Inger
 * @author Gary D. Gregory
 * @since 2.2
 * @version $Id: StrTokenizer.java 907631 2010-02-08 12:22:48Z sebb $
 */
public class StrTokenizer implements ListIterator, Cloneable {

    /** Pre-configured CSV tokenizer; never handed out directly, only cloned. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Pre-configured TSV tokenizer; never handed out directly, only cloned. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens, null until tokenization has been performed */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * with no text to tokenize yet. The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the {@link #setTrimmerMatcher(StrMatcher)} method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input. The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the {@link #setTrimmerMatcher(StrMatcher)} method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings
     * initializing it with the given input. The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the {@link #setTrimmerMatcher(StrMatcher)} method).
     * <p>
     * The input character array is not cloned (see {@link #reset(char[])}).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings,
     * with no text to tokenize yet.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with the {@link #setTrimmerMatcher(StrMatcher)} method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings
     * initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with the {@link #setTrimmerMatcher(StrMatcher)} method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings
     * initializing it with the given input.
     * The default for TSV processing will be trim whitespace from both ends
     * (which can be overridden with the {@link #setTrimmerMatcher(StrMatcher)} method).
     * <p>
     * The input character array is not cloned (see {@link #reset(char[])}).
     *
     * @param input  the character array to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     * <p>
     * The input character array is not cloned, and must not be altered after
     * passing in to this method.
     *
     * @param input  the string which is to be parsed, not cloned
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = input;
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     * <p>
     * The input character array is not cloned, and must not be altered after
     * passing in to this method.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     * <p>
     * The input character array is not cloned, and must not be altered after
     * passing in to this method.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     * <p>
     * The input character array is not cloned, and must not be altered after
     * passing in to this method.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     * <p>
     * The input character array is not cloned, and must not be altered after
     * passing in to this method.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     * <p>
     * The input character array is not cloned, and must not be altered after
     * passing in to this method.
     *
     * @param input  the string which is to be parsed, not cloned
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     * Equivalent to {@link #previous()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return (String[]) tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a List of Strings
     */
    public List getTokenList() {
        checkTokenized();
        List list = new ArrayList(tokens.length);
        for (int i = 0; i < tokens.length; i++) {
            list.add(tokens[i]);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     * <p>
     * The input character array is not cloned, and must not be altered after
     * passing in to this method.
     *
     * @param input  the new character array to tokenize, not cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = input;
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public Object next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    public Object previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(Object obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(Object obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then do it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            List split;
            if (chars == null) {
                // still call tokenize as subclass may do some work
                split = tokenize(null, 0, 0);
            } else {
                split = tokenize(chars, 0, chars.length);
            }
            tokens = (String[]) split.toArray(new String[split.size()]);
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     * The region tokenized is <code>[offset, offset + count)</code>.
     *
     * @param chars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.EMPTY_LIST;
        }
        StrBuilder buf = new StrBuilder();
        List tokens = new ArrayList();
        // The reading helpers take an exclusive end index rather than a count,
        // so translate once here; using count directly is only correct when
        // offset is zero.
        int end = offset + count;
        int pos = offset;

        // loop around the entire region
        while (pos >= 0 && pos < end) {
            // find next token
            pos = readNextToken(chars, pos, end, buf, tokens);

            // handle case where end of string is a delimiter
            if (pos >= end) {
                addToken(tokens, "");
            }
        }
        return tokens;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     * <p>
     * A null or empty token is dropped when empty tokens are ignored, and is
     * otherwise stored as null when empty tokens are to be returned as null.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(List list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the exclusive end index of the region being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the exclusive end index of the region being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List tokens, int quoteStart, int quoteLen)
    {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        // length of workArea to keep; anything appended beyond this is
        // trailing trimmable text that is cut off when the token is emitted
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the exclusive end index of the region being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use, null selects the none matcher
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is the none matcher, i.e. no quoting; the CSV and TSV
     * instances are configured with the double quote matcher.
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed, null if no text has been set
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     * The character array being parsed is copied, so the clone does not share
     * it with this instance.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = (char[]) cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a debugging representation of this tokenizer.
     * <p>
     * If tokenization has not happened yet a fixed placeholder is returned
     * (this method never triggers tokenization); otherwise the result is
     * <code>"StrTokenizer"</code> followed by the token list.
     *
     * @return the string representation of this tokenizer
     */
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}