/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators), with support for
 * quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
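 * <p>
 * A typical usage might look like this (an illustrative sketch only):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("a b c");
 * while (tok.hasNext()) {
 *     String token = tok.next();
 *     // process the token
 * }
 * </pre>
 * <p>
 * Some examples of input and the resulting tokens: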
 * <pre>
 * "a,b,c"          - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "     - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c"  - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table>
 * <tr>
 * <th>Property</th><th>Type</th><th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 * </tr>
 * </table>
 *
 * @since 2.2
 * @version $Id: StrTokenizer.java 1533551 2013-10-18 16:49:15Z sebb $
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
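     * <p>
     * A typical usage might be (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset("a, \"b, b\", c");
     * String[] tokens = tok.getTokenArray();   // ["a", "b, b", "c"]
     * </pre>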
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (this can be overridden with
     * the setTrimmer method).
     *
     * @param input  the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (this can be overridden with the setTrimmer method).
     *
     * @param input  the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
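     * <p>
     * For example (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two three");
     * String[] tokens = tok.getTokenArray();   // ["one", "two", "three"]
     * </pre>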
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     */
    public StrTokenizer(final char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
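     * <p>
     * For example (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a;b;c".toCharArray(), StrMatcher.charMatcher(';'));
     * </pre>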
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter character
     * @param quote  the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the character array which is to be parsed (a clone is used)
     * @param delim  the field delimiter matcher
     * @param quote  the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<String>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
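     * <p>
     * For example, assuming <code>lines</code> is a collection of input lines
     * (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     tok.reset(line);
     *     String[] fields = tok.getTokenArray();
     *     // process fields
     * }
     * </pre>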
     *
     * @param input  the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input  the new character array to tokenize (a clone is used), null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj  this parameter is ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars  the character array being tokenized, may be null
     * @param offset  the start position within the character array, must be valid
     * @param count  the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<String>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list  the list to add to
     * @param tok  the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars  the character array being tokenized
     * @param start  the first character of the field
     * @param len  the length of the character array being tokenized
     * @param workArea  a temporary work area
     * @param tokenList  the list of parsed tokens
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string is found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
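                // For example, with '"' as the quote character, the input
                //     a "b""c" d
                // yields the middle token  b"c  - the doubled quote becomes a literal quote
                // (illustrative example only)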
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars  the character array being tokenized
     * @param pos  the position to check for a quote
     * @param len  the length of the character array being tokenized
     * @param quoteStart  the start position of the matched quote, 0 if no quoting
     * @param quoteLen  the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim  the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
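     * <p>
     * For example, configuring a tokenizer via the fluent setters (an illustrative
     * sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a:'b:b':c")
     *         .setDelimiterChar(':')
     *         .setQuoteChar('\'');
     * String[] tokens = tok.getTokenArray();   // ["a", "b:b", "c"]
     * </pre>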
     *
     * @param delim  the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim  the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote  the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored  the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored  the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
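     * <p>
     * For example (an illustrative sketch only):
     * <pre>
     * StrTokenizer tok = new StrTokenizer(" a , b ", ',');
     * tok.setTrimmerMatcher(StrMatcher.trimMatcher());
     * String[] tokens = tok.getTokenArray();   // ["a", "b"]
     * </pre>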
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer  the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull  whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens  whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, including the list of
     * tokens if tokenization has already occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}