/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

/**
 * Tokenizes a string based on delimiters (separators),
 * supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 * <p>
 *
 * This tokenizer has the following properties and options:
 *
 * <table summary="Tokenizer Properties">
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
 *
 * @since 1.0
 */
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
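     * <p>
     * A typical use might look like the following sketch (illustrative only;
     * output shown assumes the default CSV settings described above):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset(" a, b , c ");
     * // tok.getTokenList() returns ["a", "b", "c"]
     * </pre>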
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
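     * <p>
     * For example (an illustrative sketch; the defaults match
     * <code>StringTokenizer</code>, so whitespace delimits and empty tokens
     * are ignored):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two  three");
     * // tok.getTokenList() returns ["one", "two", "three"]
     * </pre>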
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the character array which is to be parsed, cloned
     */
    public StrTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the character array which is to be parsed, cloned
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the character array which is to be parsed, cloned
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
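     * <p>
     * For example (an illustrative sketch):
     * <pre>
     * char[] data = "a b c".toCharArray();
     * StrTokenizer tok = new StrTokenizer(data, StrMatcher.splitMatcher());
     * // tok.getTokenList() returns ["a", "b", "c"]
     * </pre>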
     *
     * @param input the character array which is to be parsed, cloned
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the character array which is to be parsed, cloned
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the character array which is to be parsed, cloned
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        for (final String element : tokens) {
            list.add(element);
        }
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
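     * <p>
     * For example (an illustrative sketch; <code>lines</code> stands in for
     * whatever source of input strings you already have):
     * <pre>
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     tok.reset(line);
     *     List&lt;String&gt; fields = tok.getTokenList();
     *     // process fields
     * }
     * </pre>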
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, not cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input;
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
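     * <p>
     * Leading ignored and trimmed characters are skipped first. A delimiter at
     * the resulting position produces an empty token; otherwise the token is
     * read by <code>readWithQuotes</code>, entering quoted mode if the position
     * starts with the quote matcher.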
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len,
                              final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
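                // For example, with a double-quote quote character the input
                // "a""b" is read as the single token a"b.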
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len,
                            final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
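     * <p>
     * For example (an illustrative sketch):
     * <pre>
     * StrTokenizer tok = new StrTokenizer("a;b;c").setDelimiterChar(';');
     * // tok.getTokenList() returns ["a", "b", "c"]
     * </pre>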
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quotes; the CSV and TSV factory tokenizers
     * use '"' (double quote).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
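     * <p>
     * For example (an illustrative sketch):
     * <pre>
     * StrTokenizer tok = new StrTokenizer(" a , b ", ',');
     * tok.setTrimmerMatcher(StrMatcher.trimMatcher());
     * // tok.getTokenList() returns ["a", "b"]
     * </pre>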
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of this tokenizer, listing the parsed
     * tokens if tokenization has already occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}