/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
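 * A typical usage sketch, using only constructors and iteration methods declared in this class:
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a,b,c", ',');
 * while (tok.hasNext()) {
 *     String token = tok.next();   // "a", then "b", then "c"
 * }
 * </pre>
 *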
 * @author Apache Software Foundation
 * @author Matthew Inger
 * @author Gary D. Gregory
 * @since 2.2
 * @version $Id: StrTokenizer.java 907630 2010-02-08 12:22:32Z sebb $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends of a token (which can be overridden
     * with the setTrimmer method).
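     * <p>
     * For example (illustrative):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance(" a, \"b, c\", d ");
     * String[] fields = tok.getTokenArray();   // "a", "b, c", "d"
     * </pre>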
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(String input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends of a token (which can be overridden
     * with the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(char[] input) {
        StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(String input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends of a token
     * (which can be overridden with the setTrimmer method).
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(char[] input) {
        StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
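     * <p>
     * For example (illustrative):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a;b;c", ';');
     * // tokens: "a", "b", "c"
     * </pre>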
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed, cloned on construction
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
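     * <p>
     * For example (illustrative):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("'a b' c".toCharArray(), ' ', '\'');
     * // tokens: "a b", "c"
     * </pre>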
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed, cloned on construction
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        List<String> list = new ArrayList<String>(tokens.length);
        for (String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Reset this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
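     * <p>
     * For example (illustrative sketch; <code>lines</code> is assumed to be a <code>char[][]</code>):
     * <pre>
     * StrTokenizer tok = new StrTokenizer((char[]) null, ',');
     * for (char[] line : lines) {
     *     String[] fields = tok.reset(line).getTokenArray();
     * }
     * </pre>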
     *
     * @param input  the new character array to tokenize, cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     */
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void set(String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always
     */
    public void add(String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not, performs it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
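     * <p>
     * For example, a subclass might post-process the token list (an illustrative
     * sketch, not a prescribed pattern):
     * <pre>
     * StrTokenizer upper = new StrTokenizer("a,b", ',') {
     *     protected List&lt;String&gt; tokenize(char[] chars, int offset, int count) {
     *         List&lt;String&gt; result = super.tokenize(chars, offset, count);
     *         for (int i = 0; i &lt; result.size(); i++) {
     *             result.set(i, result.get(i).toUpperCase());
     *         }
     *         return result;
     *     }
     * };
     * </pre>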
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param chars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(char[] chars, int offset, int count) {
        if (chars == null || count == 0) {
            return Collections.emptyList();
        }
        StrBuilder buf = new StrBuilder();
        List<String> tokens = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(chars, pos, count, buf, tokens);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokens, "");
            }
        }
        return tokens;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(chars, start, start, len),
                    getTrimmerMatcher().isMatch(chars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokens, "");
            return -1;
        }

        // handle empty token
        int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
        if (delimLen > 0) {
            addToken(tokens, "");
            return start + delimLen;
        }

        // handle found token
        int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
        }
        return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
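     * A quote is escaped within a quoted section by doubling it; for example,
     * with a double-quote quote character (illustrative):
     * <pre>
     * "a ""b"" c"   becomes the single token   a "b" c
     * </pre>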
     *
     * @param chars  the character array being tokenized
     * @param start  the first character of field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokens  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen)
    {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = (quoteLen > 0);
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param chars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote character (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
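     * <p>
     * For instance (illustrative), ignoring new line characters:
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a,b\nc", ',').setIgnoredChar('\n');
     * // tokens: "a", "bc"
     * </pre>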
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Set the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Set the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of the tokenizer, including the parsed
     * tokens if tokenization has occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}