1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.lang3.text;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.List;
23 import java.util.ListIterator;
24 import java.util.NoSuchElementException;
25 import java.util.StringTokenizer;
26
27 import org.apache.commons.lang3.ArrayUtils;
28 import org.apache.commons.lang3.StringUtils;
29
30 /**
31 * Tokenizes a string based on delimiters (separators)
32 * and supporting quoting and ignored character concepts.
33 * <p>
34 * This class can split a String into many smaller strings. It aims
35 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
36 * however it offers much more control and flexibility including implementing
37 * the {@link ListIterator} interface. By default, it is set up
38 * like {@link StringTokenizer}.
39 * </p>
40 * <p>
41 * The input String is split into a number of <em>tokens</em>.
42 * Each token is separated from the next String by a <em>delimiter</em>.
43 * One or more delimiter characters must be specified.
44 * </p>
45 * <p>
46 * Each token may be surrounded by quotes.
47 * The <em>quote</em> matcher specifies the quote character(s).
48 * A quote may be escaped within a quoted section by duplicating itself.
49 * </p>
50 * <p>
51 * Between each token and the delimiter are potentially characters that need trimming.
52 * The <em>trimmer</em> matcher specifies these characters.
53 * One usage might be to trim whitespace characters.
54 * </p>
55 * <p>
56 * At any point outside the quotes there might potentially be invalid characters.
57 * The <em>ignored</em> matcher specifies these characters to be removed.
58 * One usage might be to remove new line characters.
59 * </p>
60 * <p>
61 * Empty tokens may be removed or returned as null.
62 * </p>
63 * <pre>
64 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
65 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
66 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
67 * </pre>
68 *
69 * <table>
70 * <caption>StrTokenizer properties and options</caption>
71 * <tr>
72 * <th>Property</th><th>Type</th><th>Default</th>
73 * </tr>
74 * <tr>
75 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
76 * </tr>
77 * <tr>
78 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
79 * </tr>
 * <tr>
 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 * <td>trimmer</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
83 * <tr>
84 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
85 * </tr>
86 * <tr>
87 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
88 * </tr>
89 * </table>
90 *
91 * @since 2.2
92 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
93 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
94 * StringTokenizer</a>.
95 */
96 @Deprecated
97 public class StrTokenizer implements ListIterator<String>, Cloneable {
98
// @formatter:off
/**
 * Prototype configured for Comma Separated Value parsing: comma delimiter,
 * double-quote quoting, no ignored characters, the trim matcher as trimmer,
 * and empty tokens kept (as empty strings, not null).
 * Only used as the source object for {@link #getCSVClone()}.
 */
private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
        .setDelimiterMatcher(StrMatcher.commaMatcher())
        .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
        .setIgnoredMatcher(StrMatcher.noneMatcher())
        .setTrimmerMatcher(StrMatcher.trimMatcher())
        .setEmptyTokenAsNull(false)
        .setIgnoreEmptyTokens(false);
// @formatter:on

// @formatter:off
/**
 * Prototype configured for Tab Separated Value parsing: tab delimiter,
 * double-quote quoting, no ignored characters, the trim matcher as trimmer,
 * and empty tokens kept (as empty strings, not null).
 * Only used as the source object for {@link #getTSVClone()}.
 */
private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
        .setDelimiterMatcher(StrMatcher.tabMatcher())
        .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
        .setIgnoredMatcher(StrMatcher.noneMatcher())
        .setTrimmerMatcher(StrMatcher.trimMatcher())
        .setEmptyTokenAsNull(false)
        .setIgnoreEmptyTokens(false);
// @formatter:on
118
/**
 * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
 * <p>
 * Cloning gives every caller its own tokenizer carrying the CSV settings,
 * so the shared prototype is not modified by callers.
 * </p>
 *
 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
 */
private static StrTokenizer getCSVClone() {
    return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
}
/**
 * Gets a new tokenizer instance which parses Comma Separated Value strings,
 * with no input text set yet. The default for CSV processing
 * will be trim whitespace from both ends (which can be overridden with
 * the {@link #setTrimmerMatcher(StrMatcher)} method).
 * <p>
 * You must call a "reset" method to set the string which you want to parse.
 * </p>
 * @return a new tokenizer instance which parses Comma Separated Value strings
 */
public static StrTokenizer getCSVInstance() {
    return getCSVClone();
}
140 /**
141 * Gets a new tokenizer instance which parses Comma Separated Value strings
142 * initializing it with the given input. The default for CSV processing
143 * will be trim whitespace from both ends (which can be overridden with
144 * the setTrimmer method).
145 *
146 * @param input the text to parse
147 * @return a new tokenizer instance which parses Comma Separated Value strings
148 */
149 public static StrTokenizer getCSVInstance(final char[] input) {
150 final StrTokenizer tok = getCSVClone();
151 tok.reset(input);
152 return tok;
153 }
154
155 /**
156 * Gets a new tokenizer instance which parses Comma Separated Value strings
157 * initializing it with the given input. The default for CSV processing
158 * will be trim whitespace from both ends (which can be overridden with
159 * the setTrimmer method).
160 *
161 * @param input the text to parse
162 * @return a new tokenizer instance which parses Comma Separated Value strings
163 */
164 public static StrTokenizer getCSVInstance(final String input) {
165 final StrTokenizer tok = getCSVClone();
166 tok.reset(input);
167 return tok;
168 }
/**
 * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
 * <p>
 * Cloning gives every caller its own tokenizer carrying the TSV settings,
 * so the shared prototype is not modified by callers.
 * </p>
 *
 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
 */
private static StrTokenizer getTSVClone() {
    return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
}
177
/**
 * Gets a new tokenizer instance which parses Tab Separated Value strings,
 * with no input text set yet.
 * The default for TSV processing will be trim whitespace from both ends
 * (which can be overridden with the {@link #setTrimmerMatcher(StrMatcher)} method).
 * <p>
 * You must call a "reset" method to set the string which you want to parse.
 * </p>
 * @return a new tokenizer instance which parses Tab Separated Value strings.
 */
public static StrTokenizer getTSVInstance() {
    return getTSVClone();
}
190
191 /**
192 * Gets a new tokenizer instance which parses Tab Separated Value strings.
193 * The default for CSV processing will be trim whitespace from both ends
194 * (which can be overridden with the setTrimmer method).
195 * @param input the string to parse
196 * @return a new tokenizer instance which parses Tab Separated Value strings.
197 */
198 public static StrTokenizer getTSVInstance(final char[] input) {
199 final StrTokenizer tok = getTSVClone();
200 tok.reset(input);
201 return tok;
202 }
203
204 /**
205 * Gets a new tokenizer instance which parses Tab Separated Value strings.
206 * The default for CSV processing will be trim whitespace from both ends
207 * (which can be overridden with the setTrimmer method).
208 * @param input the string to parse
209 * @return a new tokenizer instance which parses Tab Separated Value strings.
210 */
211 public static StrTokenizer getTSVInstance(final String input) {
212 final StrTokenizer tok = getTSVClone();
213 tok.reset(input);
214 return tok;
215 }
/** The text to work on, or {@code null} when no input has been set. */
private char[] chars;

/** The parsed tokens, or {@code null} while the current input has not been tokenized yet. */
private String[] tokens;

/** The current iteration position: the index in {@code tokens} of the next token to return. */
private int tokenPos;

/** The delimiter matcher; initially {@link StrMatcher#splitMatcher()}. */
private StrMatcher delimMatcher = StrMatcher.splitMatcher();

/** The quote matcher; initially {@link StrMatcher#noneMatcher()}. */
private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

/** The ignored matcher; initially {@link StrMatcher#noneMatcher()}. */
private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

/** The trimmer matcher; initially {@link StrMatcher#noneMatcher()}. */
private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

/** Whether to return empty tokens as null; initially {@code false}. */
private boolean emptyAsNull;

/** Whether to ignore empty tokens; initially {@code true}. */
private boolean ignoreEmptyTokens = true;
242
/**
 * Constructs a tokenizer using the default delimiter matcher
 * ({@link StrMatcher#splitMatcher()}), in the manner of StringTokenizer,
 * but with no text to tokenize.
 * <p>
 * This constructor is normally used with {@link #reset(String)}.
 * </p>
 */
public StrTokenizer() {
    // no input yet; tokenizing in this state is still delegated to tokenize(null, 0, 0)
    this.chars = null;
}
253
/**
 * Constructs a tokenizer using the default delimiter matcher
 * ({@link StrMatcher#splitMatcher()}), in the manner of StringTokenizer.
 *
 * @param input the characters which are to be parsed; the array is copied with
 *  {@code ArrayUtils.clone}, so the tokenizer does not keep a reference to the caller's array
 */
public StrTokenizer(final char[] input) {
    this.chars = ArrayUtils.clone(input);
}
263
/**
 * Constructs a tokenizer splitting on the specified character.
 *
 * @param input the characters which are to be parsed; copied by {@link #StrTokenizer(char[])}
 * @param delim the field delimiter character
 */
public StrTokenizer(final char[] input, final char delim) {
    this(input);
    setDelimiterChar(delim);
}
274
/**
 * Constructs a tokenizer splitting on the specified delimiter character
 * and handling quotes using the specified quote character.
 *
 * @param input the characters which are to be parsed; copied by {@link #StrTokenizer(char[])}
 * @param delim the field delimiter character
 * @param quote the field quoted string character
 */
public StrTokenizer(final char[] input, final char delim, final char quote) {
    this(input, delim);
    setQuoteChar(quote);
}
287
/**
 * Constructs a tokenizer splitting on the specified string.
 *
 * @param input the characters which are to be parsed; copied by {@link #StrTokenizer(char[])}
 * @param delim the field delimiter string
 */
public StrTokenizer(final char[] input, final String delim) {
    this(input);
    setDelimiterString(delim);
}
298
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher.
 *
 * @param input the characters which are to be parsed; copied by {@link #StrTokenizer(char[])}
 * @param delim the field delimiter matcher
 */
public StrTokenizer(final char[] input, final StrMatcher delim) {
    this(input);
    setDelimiterMatcher(delim);
}
309
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher
 * and handling quotes using the specified quote matcher.
 *
 * @param input the characters which are to be parsed; copied by {@link #StrTokenizer(char[])}
 * @param delim the field delimiter matcher
 * @param quote the field quoted string matcher
 */
public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    setQuoteMatcher(quote);
}
322
/**
 * Constructs a tokenizer over the given string using the default delimiter matcher
 * ({@link StrMatcher#splitMatcher()}), in the manner of StringTokenizer.
 *
 * @param input the string which is to be parsed, {@code null} leaves the tokenizer without text
 */
public StrTokenizer(final String input) {
    this.chars = input == null ? null : input.toCharArray();
}
336
/**
 * Constructs a tokenizer splitting on the specified delimiter character.
 *
 * @param input the string which is to be parsed, may be null (no text to parse)
 * @param delim the field delimiter character
 */
public StrTokenizer(final String input, final char delim) {
    this(input);
    setDelimiterChar(delim);
}
347
/**
 * Constructs a tokenizer splitting on the specified delimiter character
 * and handling quotes using the specified quote character.
 *
 * @param input the string which is to be parsed, may be null (no text to parse)
 * @param delim the field delimiter character
 * @param quote the field quoted string character
 */
public StrTokenizer(final String input, final char delim, final char quote) {
    this(input, delim);
    setQuoteChar(quote);
}
360
/**
 * Constructs a tokenizer splitting on the specified delimiter string.
 *
 * @param input the string which is to be parsed, may be null (no text to parse)
 * @param delim the field delimiter string
 */
public StrTokenizer(final String input, final String delim) {
    this(input);
    setDelimiterString(delim);
}
371
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher.
 *
 * @param input the string which is to be parsed, may be null (no text to parse)
 * @param delim the field delimiter matcher
 */
public StrTokenizer(final String input, final StrMatcher delim) {
    this(input);
    setDelimiterMatcher(delim);
}
382
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher
 * and handling quotes using the specified quote matcher.
 *
 * @param input the string which is to be parsed, may be null (no text to parse)
 * @param delim the field delimiter matcher
 * @param quote the field quoted string matcher
 */
public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    setQuoteMatcher(quote);
}
395
/**
 * Unsupported ListIterator operation; the token list cannot be modified through the iterator.
 *
 * @param obj this parameter ignored.
 * @throws UnsupportedOperationException always
 */
@Override
public void add(final String obj) {
    throw new UnsupportedOperationException("add() is unsupported");
}
405
406 /**
407 * Adds a token to a list, paying attention to the parameters we've set.
408 *
409 * @param list the list to add to
410 * @param tok the token to add
411 */
412 private void addToken(final List<String> list, String tok) {
413 if (StringUtils.isEmpty(tok)) {
414 if (isIgnoreEmptyTokens()) {
415 return;
416 }
417 if (isEmptyTokenAsNull()) {
418 tok = null;
419 }
420 }
421 list.add(tok);
422 }
423
424 /**
425 * Checks if tokenization has been done, and if not then do it.
426 */
427 private void checkTokenized() {
428 if (tokens == null) {
429 if (chars == null) {
430 // still call tokenize as subclass may do some work
431 final List<String> split = tokenize(null, 0, 0);
432 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
433 } else {
434 final List<String> split = tokenize(chars, 0, chars.length);
435 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
436 }
437 }
438 }
439
/**
 * Creates a new instance of this Tokenizer. The new instance is reset so
 * that it will be at the start of the token list.
 * If a {@link CloneNotSupportedException} is caught, return {@code null}.
 * <p>
 * The work is delegated to {@link #cloneReset()}; this method only translates
 * the checked exception into a {@code null} result.
 * </p>
 *
 * @return a new instance of this Tokenizer which has been reset,
 *  or {@code null} if cloning failed
 */
@Override
public Object clone() {
    try {
        return cloneReset();
    } catch (final CloneNotSupportedException ex) {
        // documented contract: report the failure as null rather than propagate it
        return null;
    }
}
455
456 /**
457 * Creates a new instance of this Tokenizer. The new instance is reset so that
458 * it will be at the start of the token list.
459 *
460 * @return a new instance of this Tokenizer which has been reset.
461 * @throws CloneNotSupportedException if there is a problem cloning
462 */
463 Object cloneReset() throws CloneNotSupportedException {
464 // this method exists to enable 100% test coverage
465 final StrTokenizer cloned = (StrTokenizer) super.clone();
466 if (cloned.chars != null) {
467 cloned.chars = cloned.chars.clone();
468 }
469 cloned.reset();
470 return cloned;
471 }
472
473 /**
474 * Gets the String content that the tokenizer is parsing.
475 *
476 * @return the string content being parsed
477 */
478 public String getContent() {
479 if (chars == null) {
480 return null;
481 }
482 return new String(chars);
483 }
484
/**
 * Gets the field delimiter matcher.
 * <p>
 * The delimiter is used to separate one token from another.
 * The initial value is {@link StrMatcher#splitMatcher()}.
 * </p>
 *
 * @return the delimiter matcher in use
 */
public StrMatcher getDelimiterMatcher() {
    return this.delimMatcher;
}
493
// Ignored
/**
 * Gets the ignored character matcher.
 * <p>
 * These characters are ignored when parsing the String, unless they are
 * within a quoted region.
 * The default value is not to ignore anything ({@link StrMatcher#noneMatcher()}).
 * </p>
 *
 * @return the ignored matcher in use
 */
public StrMatcher getIgnoredMatcher() {
    return ignoredMatcher;
}
508
/**
 * Gets the quote matcher currently in use.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 * The initial value is {@link StrMatcher#noneMatcher()}; the CSV and TSV
 * factory methods configure a double-quote matcher instead.
 * </p>
 *
 * @return the quote matcher in use
 */
public StrMatcher getQuoteMatcher() {
    return quoteMatcher;
}
522
/**
 * Gets a copy of the full token list as an independent modifiable array.
 * <p>
 * The input is tokenized first if that has not happened yet. The returned array is a
 * clone, so changes to it do not affect this tokenizer.
 * </p>
 *
 * @return the tokens as a String array
 */
public String[] getTokenArray() {
    checkTokenized();
    return tokens.clone();
}
532
533 /**
534 * Gets a copy of the full token list as an independent modifiable list.
535 *
536 * @return the tokens as a String array
537 */
538 public List<String> getTokenList() {
539 checkTokenized();
540 final List<String> list = new ArrayList<>(tokens.length);
541 list.addAll(Arrays.asList(tokens));
542 return list;
543 }
544
/**
 * Gets the trimmer character matcher.
 * <p>
 * These characters are trimmed off on each side of the delimiter
 * until the token or quote is found.
 * The default value is not to trim anything ({@link StrMatcher#noneMatcher()}).
 * </p>
 *
 * @return the trimmer matcher in use
 */
public StrMatcher getTrimmerMatcher() {
    return trimmerMatcher;
}
558
/**
 * Checks whether there are any more tokens.
 * <p>
 * The input is tokenized first if that has not happened yet.
 * </p>
 *
 * @return true if there are more tokens
 */
@Override
public boolean hasNext() {
    checkTokenized();
    return tokenPos < tokens.length;
}
569
/**
 * Checks whether there are any previous tokens that can be iterated to.
 * <p>
 * The input is tokenized first if that has not happened yet.
 * </p>
 *
 * @return true if there are previous tokens
 */
@Override
public boolean hasPrevious() {
    checkTokenized();
    return tokenPos > 0;
}
580
/**
 * Gets whether the tokenizer currently returns empty tokens as null.
 * The default for this property is false.
 * <p>
 * The setting only takes effect for empty tokens that are not ignored,
 * see {@link #isIgnoreEmptyTokens()}.
 * </p>
 *
 * @return true if empty tokens are returned as null
 */
public boolean isEmptyTokenAsNull() {
    return this.emptyAsNull;
}
590
/**
 * Gets whether the tokenizer currently ignores empty tokens.
 * The default for this property is true.
 * <p>
 * When true, empty tokens are dropped and never appear in the token list.
 * </p>
 *
 * @return true if empty tokens are not returned
 */
public boolean isIgnoreEmptyTokens() {
    return ignoreEmptyTokens;
}
600
601 /**
602 * Checks if the characters at the index specified match the quote
603 * already matched in readNextToken().
604 *
605 * @param srcChars the character array being tokenized
606 * @param pos the position to check for a quote
607 * @param len the length of the character array being tokenized
608 * @param quoteStart the start position of the matched quote, 0 if no quoting
609 * @param quoteLen the length of the matched quote, 0 if no quoting
610 * @return true if a quote is matched
611 */
612 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
613 for (int i = 0; i < quoteLen; i++) {
614 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
615 return false;
616 }
617 }
618 return true;
619 }
620
621 /**
622 * Gets the next token.
623 *
624 * @return the next String token
625 * @throws NoSuchElementException if there are no more elements
626 */
627 @Override
628 public String next() {
629 if (hasNext()) {
630 return tokens[tokenPos++];
631 }
632 throw new NoSuchElementException();
633 }
634
/**
 * Gets the index of the next token to return.
 * <p>
 * This is the current iteration position; reading it does not trigger tokenization.
 * </p>
 *
 * @return the next token index
 */
@Override
public int nextIndex() {
    return tokenPos;
}
644
645 /**
646 * Gets the next token from the String.
647 * Equivalent to {@link #next()} except it returns null rather than
648 * throwing {@link NoSuchElementException} when no tokens remain.
649 *
650 * @return the next sequential token, or null when no more tokens are found
651 */
652 public String nextToken() {
653 if (hasNext()) {
654 return tokens[tokenPos++];
655 }
656 return null;
657 }
658
659 /**
660 * Gets the token previous to the last returned token.
661 *
662 * @return the previous token
663 */
664 @Override
665 public String previous() {
666 if (hasPrevious()) {
667 return tokens[--tokenPos];
668 }
669 throw new NoSuchElementException();
670 }
671
/**
 * Gets the index of the previous token.
 * <p>
 * This is the current iteration position minus one, so it is -1 at the start of the
 * token list; reading it does not trigger tokenization.
 * </p>
 *
 * @return the previous token index
 */
@Override
public int previousIndex() {
    return tokenPos - 1;
}
681
682 /**
683 * Gets the previous token from the String.
684 *
685 * @return the previous sequential token, or null when no more tokens are found
686 */
687 public String previousToken() {
688 if (hasPrevious()) {
689 return tokens[--tokenPos];
690 }
691 return null;
692 }
693
/**
 * Reads character by character through the String to get the next token.
 * <p>
 * Leading ignored/trimmed characters are skipped first. The token is then either
 * empty (end of input or an immediate delimiter) or read by
 * {@link #readWithQuotes(char[], int, int, StrBuilder, List, int, int)}, in quoting
 * mode if the field starts with a quote. The token found is added to
 * {@code tokenList} through {@code addToken}.
 * </p>
 *
 * @param srcChars the character array being tokenized
 * @param start the first character of field
 * @param len the length of the character array being tokenized
 * @param workArea a temporary work area
 * @param tokenList the list of parsed tokens
 * @return the starting position of the next field (the character
 *  immediately after the delimiter), or -1 if end of string found
 */
private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
    // skip all leading whitespace, unless it is the
    // field delimiter or the quote character
    while (start < len) {
        // the longer of the ignored match and the trimmer match decides how far to skip
        final int removeLen = Math.max(
                getIgnoredMatcher().isMatch(srcChars, start, start, len),
                getTrimmerMatcher().isMatch(srcChars, start, start, len));
        // stop at the first character that is not removable, or that is removable
        // but also counts as a delimiter or a quote
        if (removeLen == 0 ||
            getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
            getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
            break;
        }
        start += removeLen;
    }

    // handle reaching end: only removable characters remained, so the field is empty
    if (start >= len) {
        addToken(tokenList, StringUtils.EMPTY);
        return -1;
    }

    // handle empty token: the field starts directly with a delimiter
    final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
    if (delimLen > 0) {
        addToken(tokenList, StringUtils.EMPTY);
        return start + delimLen;
    }

    // handle found token
    final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
    if (quoteLen > 0) {
        // start reading after the opening quote, remembering where that quote sits
        return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
    }
    // unquoted field: a quote length of 0 starts the read outside quoting mode
    return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
}
740
/**
 * Reads a possibly quoted string token.
 * <p>
 * Characters are accumulated in {@code workArea}. {@code trimStart} tracks the length
 * of the accumulated text up to the last character that must be kept, so that trimmer
 * characters copied at the end of the token are cut off when the token is added.
 * </p>
 *
 * @param srcChars the character array being tokenized
 * @param start the first character of field
 * @param len the length of the character array being tokenized
 * @param workArea a temporary work area
 * @param tokenList the list of parsed tokens
 * @param quoteStart the start position of the matched quote, 0 if no quoting
 * @param quoteLen the length of the matched quote, 0 if no quoting
 * @return the starting position of the next field (the character
 *  immediately after the delimiter), or -1 if end of string found
 */
private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                           final List<String> tokenList, final int quoteStart, final int quoteLen) {
    // Loop until we've found the end of the quoted
    // string or the end of the input
    workArea.clear();
    int pos = start;
    boolean quoting = quoteLen > 0;
    int trimStart = 0;

    while (pos < len) {
        // quoting mode can occur several times throughout a string
        // we must switch between quoting and non-quoting until we
        // encounter a non-quoted delimiter, or end of string
        if (quoting) {
            // In quoting mode

            // If we've found a quote character, see if it's
            // followed by a second quote. If so, then we need
            // to actually put the quote character into the token
            // rather than end the token.
            if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                    // matched pair of quotes, thus an escaped quote
                    workArea.append(srcChars, pos, quoteLen);
                    pos += quoteLen * 2;
                    // the escaped quote is real content and must survive trimming
                    trimStart = workArea.size();
                    continue;
                }

                // end of quoting
                quoting = false;
                pos += quoteLen;
                continue;
            }

        } else {
            // Not in quoting mode

            // check for delimiter, and thus end of token
            final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
            if (delimLen > 0) {
                // return condition when end of token found
                addToken(tokenList, workArea.substring(0, trimStart));
                return pos + delimLen;
            }

            // check for quote, and thus back into quoting mode
            if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                quoting = true;
                pos += quoteLen;
                continue;
            }

            // check for ignored (outside quotes), and ignore
            final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
            if (ignoredLen > 0) {
                pos += ignoredLen;
                continue;
            }

            // check for trimmed character
            // don't yet know if it's at the end, so copy to workArea
            // use trimStart to keep track of trim at the end
            final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
            if (trimmedLen > 0) {
                workArea.append(srcChars, pos, trimmedLen);
                pos += trimmedLen;
                continue;
            }
        }
        // copy regular character: reached inside quotes when no quote matched, and
        // outside quotes when nothing else matched
        workArea.append(srcChars[pos++]);
        trimStart = workArea.size();
    }

    // return condition when end of string found
    addToken(tokenList, workArea.substring(0, trimStart));
    return -1;
}
834
/**
 * Unsupported ListIterator operation; the token list cannot be modified through the iterator.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public void remove() {
    throw new UnsupportedOperationException("remove() is unsupported");
}
844
/**
 * Resets this tokenizer, forgetting all parsing and iteration already completed.
 * <p>
 * This method allows the same tokenizer to be reused for the same String.
 * The input text is kept; it is tokenized again on the next access.
 * </p>
 *
 * @return this, to enable chaining
 */
public StrTokenizer reset() {
    tokenPos = 0;
    // a null token array marks the input as not tokenized yet
    tokens = null;
    return this;
}
858
/**
 * Reset this tokenizer, giving it a new input string to parse.
 * In this manner you can re-use a tokenizer with the same settings
 * on multiple input lines.
 *
 * @param input the new character array to tokenize; the array is copied with
 *  {@code ArrayUtils.clone}, null sets no text to parse
 * @return this, to enable chaining
 */
public StrTokenizer reset(final char[] input) {
    reset();
    this.chars = ArrayUtils.clone(input);
    return this;
}
872
873 /**
874 * Reset this tokenizer, giving it a new input string to parse.
875 * In this manner you can re-use a tokenizer with the same settings
876 * on multiple input lines.
877 *
878 * @param input the new string to tokenize, null sets no text to parse
879 * @return this, to enable chaining
880 */
881 public StrTokenizer reset(final String input) {
882 reset();
883 if (input != null) {
884 this.chars = input.toCharArray();
885 } else {
886 this.chars = null;
887 }
888 return this;
889 }
890
/**
 * Unsupported ListIterator operation; the token list cannot be modified through the iterator.
 *
 * @param obj this parameter ignored.
 * @throws UnsupportedOperationException always
 */
@Override
public void set(final String obj) {
    throw new UnsupportedOperationException("set() is unsupported");
}
900
/**
 * Sets the field delimiter character.
 * <p>
 * Shorthand for {@link #setDelimiterMatcher(StrMatcher)} with a matcher
 * built by {@code StrMatcher.charMatcher(delim)}.
 * </p>
 *
 * @param delim the delimiter character to use
 * @return this, to enable chaining
 */
public StrTokenizer setDelimiterChar(final char delim) {
    return setDelimiterMatcher(StrMatcher.charMatcher(delim));
}
910
911 /**
912 * Sets the field delimiter matcher.
913 * <p>
914 * The delimiter is used to separate one token from another.
915 * </p>
916 *
917 * @param delim the delimiter matcher to use
918 * @return this, to enable chaining
919 */
920 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
921 if (delim == null) {
922 this.delimMatcher = StrMatcher.noneMatcher();
923 } else {
924 this.delimMatcher = delim;
925 }
926 return this;
927 }
928
/**
 * Sets the field delimiter string.
 * <p>
 * Shorthand for {@link #setDelimiterMatcher(StrMatcher)} with a matcher
 * built by {@code StrMatcher.stringMatcher(delim)}.
 * </p>
 *
 * @param delim the delimiter string to use
 * @return this, to enable chaining
 */
public StrTokenizer setDelimiterString(final String delim) {
    return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
}
938
/**
 * Sets whether the tokenizer should return empty tokens as null.
 * The default for this property is false.
 * <p>
 * The new value is stored only; tokens that were already parsed are not
 * re-evaluated until the tokenizer is reset.
 * </p>
 *
 * @param emptyAsNull whether empty tokens are returned as null
 * @return this, to enable chaining
 */
public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
    this.emptyAsNull = emptyAsNull;
    return this;
}
950
/**
 * Sets the character to ignore.
 * <p>
 * This character is ignored when parsing the String, unless it is
 * within a quoted region.
 * Shorthand for {@link #setIgnoredMatcher(StrMatcher)} with a matcher
 * built by {@code StrMatcher.charMatcher(ignored)}.
 * </p>
 *
 * @param ignored the ignored character to use
 * @return this, to enable chaining
 */
public StrTokenizer setIgnoredChar(final char ignored) {
    return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
}
963
964 /**
965 * Sets the matcher for characters to ignore.
966 * <p>
967 * These characters are ignored when parsing the String, unless they are
968 * within a quoted region.
969 * </p>
970 *
971 * @param ignored the ignored matcher to use, null ignored
972 * @return this, to enable chaining
973 */
974 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
975 if (ignored != null) {
976 this.ignoredMatcher = ignored;
977 }
978 return this;
979 }
980
/**
 * Sets whether the tokenizer should ignore and not return empty tokens.
 * The default for this property is true.
 * <p>
 * The new value is stored only; tokens that were already parsed are not
 * re-evaluated until the tokenizer is reset.
 * </p>
 *
 * @param ignoreEmptyTokens whether empty tokens are not returned
 * @return this, to enable chaining
 */
public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
    this.ignoreEmptyTokens = ignoreEmptyTokens;
    return this;
}
992
/**
 * Sets the quote character to use.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 * Shorthand for {@link #setQuoteMatcher(StrMatcher)} with a matcher
 * built by {@code StrMatcher.charMatcher(quote)}.
 * </p>
 *
 * @param quote the quote character to use
 * @return this, to enable chaining
 */
public StrTokenizer setQuoteChar(final char quote) {
    return setQuoteMatcher(StrMatcher.charMatcher(quote));
}
1006
1007 /**
1008 * Sets the quote matcher to use.
1009 * <p>
1010 * The quote character is used to wrap data between the tokens.
1011 * This enables delimiters to be entered as data.
1012 * </p>
1013 *
1014 * @param quote the quote matcher to use, null ignored
1015 * @return this, to enable chaining
1016 */
1017 public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1018 if (quote != null) {
1019 this.quoteMatcher = quote;
1020 }
1021 return this;
1022 }
1023
1024 /**
1025 * Sets the matcher for characters to trim.
1026 * <p>
1027 * These characters are trimmed off on each side of the delimiter
1028 * until the token or quote is found.
1029 * </p>
1030 *
1031 * @param trimmer the trimmer matcher to use, null ignored
1032 * @return this, to enable chaining
1033 */
1034 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1035 if (trimmer != null) {
1036 this.trimmerMatcher = trimmer;
1037 }
1038 return this;
1039 }
1040
// API
/**
 * Gets the number of tokens found in the String.
 * <p>
 * The input is tokenized first if that has not happened yet.
 * </p>
 *
 * @return the number of matched tokens
 */
public int size() {
    checkTokenized();
    return tokens.length;
}
1051
1052 /**
1053 * Internal method to performs the tokenization.
1054 * <p>
1055 * Most users of this class do not need to call this method. This method
1056 * will be called automatically by other (public) methods when required.
1057 * </p>
1058 * <p>
1059 * This method exists to allow subclasses to add code before or after the
1060 * tokenization. For example, a subclass could alter the character array,
1061 * offset or count to be parsed, or call the tokenizer multiple times on
1062 * multiple strings. It is also be possible to filter the results.
1063 * </p>
1064 * <p>
1065 * {@link StrTokenizer} will always pass a zero offset and a count
1066 * equal to the length of the array to this method, however a subclass
1067 * may pass other values, or even an entirely different array.
1068 * </p>
1069 *
1070 * @param srcChars the character array being tokenized, may be null
1071 * @param offset the start position within the character array, must be valid
1072 * @param count the number of characters to tokenize, must be valid
1073 * @return the modifiable list of String tokens, unmodifiable if null array or zero count
1074 */
1075 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1076 if (ArrayUtils.isEmpty(srcChars)) {
1077 return Collections.emptyList();
1078 }
1079 final StrBuilder buf = new StrBuilder();
1080 final List<String> tokenList = new ArrayList<>();
1081 int pos = offset;
1082
1083 // loop around the entire buffer
1084 while (pos >= 0 && pos < count) {
1085 // find next token
1086 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1087
1088 // handle case where end of string is a delimiter
1089 if (pos >= count) {
1090 addToken(tokenList, StringUtils.EMPTY);
1091 }
1092 }
1093 return tokenList;
1094 }
1095
1096 /**
1097 * Gets the String content that the tokenizer is parsing.
1098 *
1099 * @return the string content being parsed
1100 */
1101 @Override
1102 public String toString() {
1103 if (tokens == null) {
1104 return "StrTokenizer[not tokenized yet]";
1105 }
1106 return "StrTokenizer" + getTokenList();
1107 }
1108
1109 }