/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
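 * <p>
 * As an illustration, typical use might look like the following sketch
 * (the tokens shown in the comment are only indicative):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,b,c", ',');
 * while (tok.hasNext()) {
 *     String token = tok.next();  // "a", then "b", then "c"
 * }
 * </pre>
 * <p>
 * Some example inputs and the tokens they produce: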
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 2.2
 * @version $Id: StrTokenizer.java 1153241 2011-08-02 18:49:52Z ggregory $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char chars[];
    /** The parsed tokens */
    private String tokens[];
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
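     * <p>
     * A minimal usage sketch (the tokens shown are only indicative):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, b, c");
     * List&lt;String&gt; tokens = tok.getTokenList();  // "a", "b", "c"
     * </pre>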
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * will be to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing will be to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
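     * <p>
     * For example, a sketch of typical use (the tokens shown are only indicative):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two three");
     * String[] parts = tok.getTokenArray();  // "one", "two", "three"
     * </pre>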
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the character array which is to be parsed, cloned on construction
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
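     * <p>
     * For example, an illustrative split on either ';' or ',' (assuming the
     * {@link StrMatcher#charSetMatcher(String)} factory method):
     * <pre>
     * char[] data = "a;b,c".toCharArray();
     * StrTokenizer tok = new StrTokenizer(data, StrMatcher.charSetMatcher(";,"));
     * </pre>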
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the character array which is to be parsed, cloned on construction
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        List<String> list = new ArrayList<String>(tokens.length);
        for (String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
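     * <p>
     * For example, a sketch of reusing one tokenizer across several lines
     * (the <code>lines</code> collection and <code>process</code> method are hypothetical):
     * <pre>
     * StrTokenizer tok = new StrTokenizer().setDelimiterChar(',');
     * for (String line : lines) {
     *     process(tok.reset(line).getTokenList());
     * }
     * </pre>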
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param chars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.emptyList();
        }
        StrBuilder buf = new StrBuilder();
        List<String> tokens = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(chars, pos, count, buf, tokens);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokens, "");
            }
        }
        return tokens;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param chars the character array being tokenized
     * @param start the first character of the field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
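                // For example, with '"' as the quote character, the doubled
                // quote in the input ["a""b"] produces the single token [a"b].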
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
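     * <p>
     * For example, {@code new StrTokenizer("a:b:c").setDelimiterChar(':')}
     * would yield the tokens "a", "b" and "c".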
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quoting (see {@link StrMatcher#noneMatcher()}).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
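     * <p>
     * For example, with {@link StrMatcher#trimMatcher()} as the trimmer and a
     * comma delimiter, the input {@code " a , b "} would yield the tokens "a" and "b".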
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including the list of
     * parsed tokens if tokenization has already taken place.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}