001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.lang3.text;
018
019 import java.util.ArrayList;
020 import java.util.Collections;
021 import java.util.List;
022 import java.util.ListIterator;
023 import java.util.NoSuchElementException;
024
025 import org.apache.commons.lang3.ArrayUtils;
026
027 /**
 * Tokenizes a string based on delimiters (separators)
029 * and supporting quoting and ignored character concepts.
030 * <p>
031 * This class can split a String into many smaller strings. It aims
032 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
033 * however it offers much more control and flexibility including implementing
034 * the <code>ListIterator</code> interface. By default, it is set up
035 * like <code>StringTokenizer</code>.
036 * <p>
037 * The input String is split into a number of <i>tokens</i>.
038 * Each token is separated from the next String by a <i>delimiter</i>.
039 * One or more delimiter characters must be specified.
040 * <p>
041 * Each token may be surrounded by quotes.
042 * The <i>quote</i> matcher specifies the quote character(s).
043 * A quote may be escaped within a quoted section by duplicating itself.
044 * <p>
045 * Between each token and the delimiter are potentially characters that need trimming.
046 * The <i>trimmer</i> matcher specifies these characters.
047 * One usage might be to trim whitespace characters.
048 * <p>
049 * At any point outside the quotes there might potentially be invalid characters.
050 * The <i>ignored</i> matcher specifies these characters to be removed.
051 * One usage might be to remove new line characters.
052 * <p>
053 * Empty tokens may be removed or returned as null.
054 * <pre>
055 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
056 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
057 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
058 * </pre>
059 * <p>
060 *
061 * This tokenizer has the following properties and options:
062 *
063 * <table>
064 * <tr>
065 * <th>Property</th><th>Type</th><th>Default</th>
066 * </tr>
067 * <tr>
068 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
069 * </tr>
070 * <tr>
071 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
072 * </tr>
073 * <tr>
074 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
075 * </tr>
076 * <tr>
077 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
078 * </tr>
079 * <tr>
080 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
081 * </tr>
082 * </table>
083 *
084 * @since 2.2
085 * @version $Id: StrTokenizer.java 1088899 2011-04-05 05:31:27Z bayard $
086 */
087 public class StrTokenizer implements ListIterator<String>, Cloneable {
088
089 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
090 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
091 static {
092 CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
093 CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
094 CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
095 CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
096 CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
097 CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
098 CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
099
100 TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
101 TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
102 TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
103 TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
104 TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
105 TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
106 TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
107 }
108
109 /** The text to work on. */
110 private char chars[];
111 /** The parsed tokens */
112 private String tokens[];
113 /** The current iteration position */
114 private int tokenPos;
115
116 /** The delimiter matcher */
117 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
118 /** The quote matcher */
119 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
120 /** The ignored matcher */
121 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
122 /** The trimmer matcher */
123 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
124
125 /** Whether to return empty tokens as null */
126 private boolean emptyAsNull = false;
127 /** Whether to ignore empty tokens */
128 private boolean ignoreEmptyTokens = true;
129
130 //-----------------------------------------------------------------------
131
132 /**
133 * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
134 *
135 * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
136 */
137 private static StrTokenizer getCSVClone() {
138 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
139 }
140
141 /**
142 * Gets a new tokenizer instance which parses Comma Separated Value strings
143 * initializing it with the given input. The default for CSV processing
144 * will be trim whitespace from both ends (which can be overridden with
145 * the setTrimmer method).
146 * <p>
147 * You must call a "reset" method to set the string which you want to parse.
148 * @return a new tokenizer instance which parses Comma Separated Value strings
149 */
150 public static StrTokenizer getCSVInstance() {
151 return getCSVClone();
152 }
153
154 /**
155 * Gets a new tokenizer instance which parses Comma Separated Value strings
156 * initializing it with the given input. The default for CSV processing
157 * will be trim whitespace from both ends (which can be overridden with
158 * the setTrimmer method).
159 *
160 * @param input the text to parse
161 * @return a new tokenizer instance which parses Comma Separated Value strings
162 */
163 public static StrTokenizer getCSVInstance(String input) {
164 StrTokenizer tok = getCSVClone();
165 tok.reset(input);
166 return tok;
167 }
168
169 /**
170 * Gets a new tokenizer instance which parses Comma Separated Value strings
171 * initializing it with the given input. The default for CSV processing
172 * will be trim whitespace from both ends (which can be overridden with
173 * the setTrimmer method).
174 *
175 * @param input the text to parse
176 * @return a new tokenizer instance which parses Comma Separated Value strings
177 */
178 public static StrTokenizer getCSVInstance(char[] input) {
179 StrTokenizer tok = getCSVClone();
180 tok.reset(input);
181 return tok;
182 }
183
184 /**
185 * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
186 *
187 * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
188 */
189 private static StrTokenizer getTSVClone() {
190 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
191 }
192
193
194 /**
195 * Gets a new tokenizer instance which parses Tab Separated Value strings.
196 * The default for CSV processing will be trim whitespace from both ends
197 * (which can be overridden with the setTrimmer method).
198 * <p>
199 * You must call a "reset" method to set the string which you want to parse.
200 * @return a new tokenizer instance which parses Tab Separated Value strings.
201 */
202 public static StrTokenizer getTSVInstance() {
203 return getTSVClone();
204 }
205
206 /**
207 * Gets a new tokenizer instance which parses Tab Separated Value strings.
208 * The default for CSV processing will be trim whitespace from both ends
209 * (which can be overridden with the setTrimmer method).
210 * @param input the string to parse
211 * @return a new tokenizer instance which parses Tab Separated Value strings.
212 */
213 public static StrTokenizer getTSVInstance(String input) {
214 StrTokenizer tok = getTSVClone();
215 tok.reset(input);
216 return tok;
217 }
218
219 /**
220 * Gets a new tokenizer instance which parses Tab Separated Value strings.
221 * The default for CSV processing will be trim whitespace from both ends
222 * (which can be overridden with the setTrimmer method).
223 * @param input the string to parse
224 * @return a new tokenizer instance which parses Tab Separated Value strings.
225 */
226 public static StrTokenizer getTSVInstance(char[] input) {
227 StrTokenizer tok = getTSVClone();
228 tok.reset(input);
229 return tok;
230 }
231
232 //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed, may be null
     */
    public StrTokenizer(String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter character
     */
    public StrTokenizer(String input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter string
     */
    public StrTokenizer(String input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(String input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(String input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed, may be null
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     */
    public StrTokenizer(char[] input) {
        super();
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter character
     */
    public StrTokenizer(char[] input, char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter string
     */
    public StrTokenizer(char[] input, String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(char[] input, char delim, char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     * <p>
     * The input character array is cloned, so later changes to it do not
     * affect this tokenizer.
     *
     * @param input the string which is to be parsed, cloned, may be null
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
387
388 // API
389 //-----------------------------------------------------------------------
390 /**
391 * Gets the number of tokens found in the String.
392 *
393 * @return the number of matched tokens
394 */
395 public int size() {
396 checkTokenized();
397 return tokens.length;
398 }
399
400 /**
401 * Gets the next token from the String.
402 * Equivalent to {@link #next()} except it returns null rather than
403 * throwing {@link NoSuchElementException} when no tokens remain.
404 *
405 * @return the next sequential token, or null when no more tokens are found
406 */
407 public String nextToken() {
408 if (hasNext()) {
409 return tokens[tokenPos++];
410 }
411 return null;
412 }
413
414 /**
415 * Gets the previous token from the String.
416 *
417 * @return the previous sequential token, or null when no more tokens are found
418 */
419 public String previousToken() {
420 if (hasPrevious()) {
421 return tokens[--tokenPos];
422 }
423 return null;
424 }
425
426 /**
427 * Gets a copy of the full token list as an independent modifiable array.
428 *
429 * @return the tokens as a String array
430 */
431 public String[] getTokenArray() {
432 checkTokenized();
433 return tokens.clone();
434 }
435
436 /**
437 * Gets a copy of the full token list as an independent modifiable list.
438 *
439 * @return the tokens as a String array
440 */
441 public List<String> getTokenList() {
442 checkTokenized();
443 List<String> list = new ArrayList<String>(tokens.length);
444 for (String element : tokens) {
445 list.add(element);
446 }
447 return list;
448 }
449
450 /**
451 * Resets this tokenizer, forgetting all parsing and iteration already completed.
452 * <p>
453 * This method allows the same tokenizer to be reused for the same String.
454 *
455 * @return this, to enable chaining
456 */
457 public StrTokenizer reset() {
458 tokenPos = 0;
459 tokens = null;
460 return this;
461 }
462
463 /**
464 * Reset this tokenizer, giving it a new input string to parse.
465 * In this manner you can re-use a tokenizer with the same settings
466 * on multiple input lines.
467 *
468 * @param input the new string to tokenize, null sets no text to parse
469 * @return this, to enable chaining
470 */
471 public StrTokenizer reset(String input) {
472 reset();
473 if (input != null) {
474 this.chars = input.toCharArray();
475 } else {
476 this.chars = null;
477 }
478 return this;
479 }
480
481 /**
482 * Reset this tokenizer, giving it a new input string to parse.
483 * In this manner you can re-use a tokenizer with the same settings
484 * on multiple input lines.
485 *
486 * @param input the new character array to tokenize, not cloned, null sets no text to parse
487 * @return this, to enable chaining
488 */
489 public StrTokenizer reset(char[] input) {
490 reset();
491 this.chars = ArrayUtils.clone(input);
492 return this;
493 }
494
495 // ListIterator
496 //-----------------------------------------------------------------------
497 /**
498 * Checks whether there are any more tokens.
499 *
500 * @return true if there are more tokens
501 */
502 public boolean hasNext() {
503 checkTokenized();
504 return tokenPos < tokens.length;
505 }
506
507 /**
508 * Gets the next token.
509 *
510 * @return the next String token
511 * @throws NoSuchElementException if there are no more elements
512 */
513 public String next() {
514 if (hasNext()) {
515 return tokens[tokenPos++];
516 }
517 throw new NoSuchElementException();
518 }
519
520 /**
521 * Gets the index of the next token to return.
522 *
523 * @return the next token index
524 */
525 public int nextIndex() {
526 return tokenPos;
527 }
528
529 /**
530 * Checks whether there are any previous tokens that can be iterated to.
531 *
532 * @return true if there are previous tokens
533 */
534 public boolean hasPrevious() {
535 checkTokenized();
536 return tokenPos > 0;
537 }
538
539 /**
540 * Gets the token previous to the last returned token.
541 *
542 * @return the previous token
543 */
544 public String previous() {
545 if (hasPrevious()) {
546 return tokens[--tokenPos];
547 }
548 throw new NoSuchElementException();
549 }
550
551 /**
552 * Gets the index of the previous token.
553 *
554 * @return the previous token index
555 */
556 public int previousIndex() {
557 return tokenPos - 1;
558 }
559
560 /**
561 * Unsupported ListIterator operation.
562 *
563 * @throws UnsupportedOperationException always
564 */
565 public void remove() {
566 throw new UnsupportedOperationException("remove() is unsupported");
567 }
568
569 /**
570 * Unsupported ListIterator operation.
571 * @param obj this parameter ignored.
572 * @throws UnsupportedOperationException always
573 */
574 public void set(String obj) {
575 throw new UnsupportedOperationException("set() is unsupported");
576 }
577
578 /**
579 * Unsupported ListIterator operation.
580 * @param obj this parameter ignored.
581 * @throws UnsupportedOperationException always
582 */
583 public void add(String obj) {
584 throw new UnsupportedOperationException("add() is unsupported");
585 }
586
587 // Implementation
588 //-----------------------------------------------------------------------
589 /**
590 * Checks if tokenization has been done, and if not then do it.
591 */
592 private void checkTokenized() {
593 if (tokens == null) {
594 if (chars == null) {
595 // still call tokenize as subclass may do some work
596 List<String> split = tokenize(null, 0, 0);
597 tokens = split.toArray(new String[split.size()]);
598 } else {
599 List<String> split = tokenize(chars, 0, chars.length);
600 tokens = split.toArray(new String[split.size()]);
601 }
602 }
603 }
604
605 /**
606 * Internal method to performs the tokenization.
607 * <p>
608 * Most users of this class do not need to call this method. This method
609 * will be called automatically by other (public) methods when required.
610 * <p>
611 * This method exists to allow subclasses to add code before or after the
612 * tokenization. For example, a subclass could alter the character array,
613 * offset or count to be parsed, or call the tokenizer multiple times on
614 * multiple strings. It is also be possible to filter the results.
615 * <p>
616 * <code>StrTokenizer</code> will always pass a zero offset and a count
617 * equal to the length of the array to this method, however a subclass
618 * may pass other values, or even an entirely different array.
619 *
620 * @param chars the character array being tokenized, may be null
621 * @param offset the start position within the character array, must be valid
622 * @param count the number of characters to tokenize, must be valid
623 * @return the modifiable list of String tokens, unmodifiable if null array or zero count
624 */
625 protected List<String> tokenize(char[] chars, int offset, int count) {
626 if (chars == null || count == 0) {
627 return Collections.emptyList();
628 }
629 StrBuilder buf = new StrBuilder();
630 List<String> tokens = new ArrayList<String>();
631 int pos = offset;
632
633 // loop around the entire buffer
634 while (pos >= 0 && pos < count) {
635 // find next token
636 pos = readNextToken(chars, pos, count, buf, tokens);
637
638 // handle case where end of string is a delimiter
639 if (pos >= count) {
640 addToken(tokens, "");
641 }
642 }
643 return tokens;
644 }
645
646 /**
647 * Adds a token to a list, paying attention to the parameters we've set.
648 *
649 * @param list the list to add to
650 * @param tok the token to add
651 */
652 private void addToken(List<String> list, String tok) {
653 if (tok == null || tok.length() == 0) {
654 if (isIgnoreEmptyTokens()) {
655 return;
656 }
657 if (isEmptyTokenAsNull()) {
658 tok = null;
659 }
660 }
661 list.add(tok);
662 }
663
664 /**
665 * Reads character by character through the String to get the next token.
666 *
667 * @param chars the character array being tokenized
668 * @param start the first character of field
669 * @param len the length of the character array being tokenized
670 * @param workArea a temporary work area
671 * @param tokens the list of parsed tokens
672 * @return the starting position of the next field (the character
673 * immediately after the delimiter), or -1 if end of string found
674 */
675 private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List<String> tokens) {
676 // skip all leading whitespace, unless it is the
677 // field delimiter or the quote character
678 while (start < len) {
679 int removeLen = Math.max(
680 getIgnoredMatcher().isMatch(chars, start, start, len),
681 getTrimmerMatcher().isMatch(chars, start, start, len));
682 if (removeLen == 0 ||
683 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
684 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
685 break;
686 }
687 start += removeLen;
688 }
689
690 // handle reaching end
691 if (start >= len) {
692 addToken(tokens, "");
693 return -1;
694 }
695
696 // handle empty token
697 int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
698 if (delimLen > 0) {
699 addToken(tokens, "");
700 return start + delimLen;
701 }
702
703 // handle found token
704 int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
705 if (quoteLen > 0) {
706 return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
707 }
708 return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
709 }
710
    /**
     * Reads a possibly quoted string token.
     * <p>
     * Alternates between quoted and unquoted mode until an unquoted
     * delimiter or the end of the input is reached. Characters matched by
     * the trimmer are copied into the work area speculatively; {@code trimStart}
     * records the length of the work area up to the last non-trimmable
     * character, so trailing trimmed characters are discarded when the
     * token is emitted.
     *
     * @param chars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokens the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string
     *  was reached
     */
    private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
                               List<String> tokens, int quoteStart, int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        // whether we started inside a quoted section
        boolean quoting = (quoteLen > 0);
        // length of workArea up to the last character that must be kept
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(chars, pos, quoteLen);
                        pos += (quoteLen * 2);
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokens, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0) {
                    if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
                        quoting = true;
                        pos += quoteLen;
                        continue;
                    }
                }

                // check for ignored (outside quotes), and ignore
                int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if its at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(chars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(chars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokens, workArea.substring(0, trimStart));
        return -1;
    }
811
812 /**
813 * Checks if the characters at the index specified match the quote
814 * already matched in readNextToken().
815 *
816 * @param chars the character array being tokenized
817 * @param pos the position to check for a quote
818 * @param len the length of the character array being tokenized
819 * @param quoteStart the start position of the matched quote, 0 if no quoting
820 * @param quoteLen the length of the matched quote, 0 if no quoting
821 * @return true if a quote is matched
822 */
823 private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
824 for (int i = 0; i < quoteLen; i++) {
825 if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
826 return false;
827 }
828 }
829 return true;
830 }
831
832 // Delimiter
833 //-----------------------------------------------------------------------
834 /**
835 * Gets the field delimiter matcher.
836 *
837 * @return the delimiter matcher in use
838 */
839 public StrMatcher getDelimiterMatcher() {
840 return this.delimMatcher;
841 }
842
843 /**
844 * Sets the field delimiter matcher.
845 * <p>
846 * The delimitier is used to separate one token from another.
847 *
848 * @param delim the delimiter matcher to use
849 * @return this, to enable chaining
850 */
851 public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
852 if (delim == null) {
853 this.delimMatcher = StrMatcher.noneMatcher();
854 } else {
855 this.delimMatcher = delim;
856 }
857 return this;
858 }
859
860 /**
861 * Sets the field delimiter character.
862 *
863 * @param delim the delimiter character to use
864 * @return this, to enable chaining
865 */
866 public StrTokenizer setDelimiterChar(char delim) {
867 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
868 }
869
870 /**
871 * Sets the field delimiter string.
872 *
873 * @param delim the delimiter string to use
874 * @return this, to enable chaining
875 */
876 public StrTokenizer setDelimiterString(String delim) {
877 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
878 }
879
880 // Quote
881 //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * By default no quoting is in use (a none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Set the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }
923
924 // Ignored
925 //-----------------------------------------------------------------------
926 /**
927 * Gets the ignored character matcher.
928 * <p>
929 * These characters are ignored when parsing the String, unless they are
930 * within a quoted region.
931 * The default value is not to ignore anything.
932 *
933 * @return the ignored matcher in use
934 */
935 public StrMatcher getIgnoredMatcher() {
936 return ignoredMatcher;
937 }
938
939 /**
940 * Set the matcher for characters to ignore.
941 * <p>
942 * These characters are ignored when parsing the String, unless they are
943 * within a quoted region.
944 *
945 * @param ignored the ignored matcher to use, null ignored
946 * @return this, to enable chaining
947 */
948 public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
949 if (ignored != null) {
950 this.ignoredMatcher = ignored;
951 }
952 return this;
953 }
954
955 /**
956 * Set the character to ignore.
957 * <p>
958 * This character is ignored when parsing the String, unless it is
959 * within a quoted region.
960 *
961 * @param ignored the ignored character to use
962 * @return this, to enable chaining
963 */
964 public StrTokenizer setIgnoredChar(char ignored) {
965 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
966 }
967
968 // Trimmer
969 //-----------------------------------------------------------------------
970 /**
971 * Gets the trimmer character matcher.
972 * <p>
973 * These characters are trimmed off on each side of the delimiter
974 * until the token or quote is found.
975 * The default value is not to trim anything.
976 *
977 * @return the trimmer matcher in use
978 */
979 public StrMatcher getTrimmerMatcher() {
980 return trimmerMatcher;
981 }
982
983 /**
984 * Sets the matcher for characters to trim.
985 * <p>
986 * These characters are trimmed off on each side of the delimiter
987 * until the token or quote is found.
988 *
989 * @param trimmer the trimmer matcher to use, null ignored
990 * @return this, to enable chaining
991 */
992 public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
993 if (trimmer != null) {
994 this.trimmerMatcher = trimmer;
995 }
996 return this;
997 }
998
999 //-----------------------------------------------------------------------
1000 /**
1001 * Gets whether the tokenizer currently returns empty tokens as null.
1002 * The default for this property is false.
1003 *
1004 * @return true if empty tokens are returned as null
1005 */
1006 public boolean isEmptyTokenAsNull() {
1007 return this.emptyAsNull;
1008 }
1009
1010 /**
1011 * Sets whether the tokenizer should return empty tokens as null.
1012 * The default for this property is false.
1013 *
1014 * @param emptyAsNull whether empty tokens are returned as null
1015 * @return this, to enable chaining
1016 */
1017 public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1018 this.emptyAsNull = emptyAsNull;
1019 return this;
1020 }
1021
1022 //-----------------------------------------------------------------------
1023 /**
1024 * Gets whether the tokenizer currently ignores empty tokens.
1025 * The default for this property is true.
1026 *
1027 * @return true if empty tokens are not returned
1028 */
1029 public boolean isIgnoreEmptyTokens() {
1030 return ignoreEmptyTokens;
1031 }
1032
1033 /**
1034 * Sets whether the tokenizer should ignore and not return empty tokens.
1035 * The default for this property is true.
1036 *
1037 * @param ignoreEmptyTokens whether empty tokens are not returned
1038 * @return this, to enable chaining
1039 */
1040 public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1041 this.ignoreEmptyTokens = ignoreEmptyTokens;
1042 return this;
1043 }
1044
1045 //-----------------------------------------------------------------------
1046 /**
1047 * Gets the String content that the tokenizer is parsing.
1048 *
1049 * @return the string content being parsed
1050 */
1051 public String getContent() {
1052 if (chars == null) {
1053 return null;
1054 }
1055 return new String(chars);
1056 }
1057
1058 //-----------------------------------------------------------------------
1059 /**
1060 * Creates a new instance of this Tokenizer. The new instance is reset so
1061 * that it will be at the start of the token list.
1062 * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1063 *
1064 * @return a new instance of this Tokenizer which has been reset.
1065 */
1066 @Override
1067 public Object clone() {
1068 try {
1069 return cloneReset();
1070 } catch (CloneNotSupportedException ex) {
1071 return null;
1072 }
1073 }
1074
1075 /**
1076 * Creates a new instance of this Tokenizer. The new instance is reset so that
1077 * it will be at the start of the token list.
1078 *
1079 * @return a new instance of this Tokenizer which has been reset.
1080 * @throws CloneNotSupportedException if there is a problem cloning
1081 */
1082 Object cloneReset() throws CloneNotSupportedException {
1083 // this method exists to enable 100% test coverage
1084 StrTokenizer cloned = (StrTokenizer) super.clone();
1085 if (cloned.chars != null) {
1086 cloned.chars = cloned.chars.clone();
1087 }
1088 cloned.reset();
1089 return cloned;
1090 }
1091
1092 //-----------------------------------------------------------------------
    /**
     * Gets a debugging String describing this tokenizer, including the
     * token list if tokenization has already been performed.
     *
     * @return a debug string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }
1105
1106 }