/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.similarity;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;

/**
 * A simple word {@link Tokenizer} that utilizes a regex to find words. It applies the regex {@code \w+} over the input text to extract words from a given
 * character sequence.
 * <p>
 * The pattern is compiled without any flags, so {@code \w} has its default meaning in {@link Pattern}: the ASCII letters, the ASCII digits and the
 * underscore. Every other character acts as a token separator and is discarded.
 * </p>
 * <p>
 * Instances of this class are immutable and are safe for use by multiple concurrent threads.
 * </p>
 *
 * @since 1.0
 */
final class RegexTokenizer implements CharSequenceTokenizer<CharSequence> {

    /**
     * The word pattern: a run of one or more word characters. No capturing group is used because only the whole match is ever read.
     */
    private static final Pattern PATTERN = Pattern.compile("\\w+");

    /**
     * Singleton instance.
     */
    static final RegexTokenizer INSTANCE = new RegexTokenizer();

    /**
     * Splits the given text into word tokens, in order of appearance.
     *
     * @param text the character sequence to tokenize; it must pass {@link StringUtils#isNotBlank(CharSequence)}
     * @return the matched words, one array element per match; an empty array if the text contains no word characters
     * @throws IllegalArgumentException if the input text is blank
     */
    @Override
    public CharSequence[] apply(final CharSequence text) {
        Validate.isTrue(StringUtils.isNotBlank(text), "Invalid text");
        final Matcher matcher = PATTERN.matcher(text);
        final List<String> tokens = new ArrayList<>();
        while (matcher.find()) {
            // group() is the whole match, i.e. one complete word.
            tokens.add(matcher.group());
        }
        return tokens.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
    }

}