View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import static org.junit.jupiter.api.Assertions.assertEquals;
20  import static org.junit.jupiter.api.Assertions.assertThrows;
21  
22  import java.util.ArrayList;
23  import java.util.Collection;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.List;
28  import java.util.Set;
29  import java.util.function.Function;
30  import java.util.regex.Pattern;
31  
32  import org.junit.jupiter.api.Test;
33  
34  /**
35   * Tests {@link IntersectionSimilarity}.
36   */
37  class IntersectionSimilarityTest {
38      private static <T> void assertIntersection(final IntersectionSimilarity<T> similarity, final CharSequence cs1, final CharSequence cs2, final int sizeA,
39              final int sizeB, final int intersection) {
40          final IntersectionResult result = similarity.apply(cs1, cs2);
41          assertEquals(sizeA, result.getSizeA(), "Size A error");
42          assertEquals(sizeB, result.getSizeB(), "Size B error");
43          assertEquals(intersection, result.getIntersection(), "Intersection error");
44      }
45  
46      /**
47       * Convert the {@link CharSequence} to a {@link List} of bigrams (pairs of characters). These are represented using 2 16-bit chars packed into a 32-bit int.
48       *
49       * @param sequence the sequence
50       * @return the list
51       */
52      private static List<Integer> toBigramList(final CharSequence sequence) {
53          final int length = sequence.length();
54          final List<Integer> list = new ArrayList<>(length);
55          if (length > 1) {
56              char ch2 = sequence.charAt(0);
57              for (int i = 1; i < length; i++) {
58                  final char ch1 = ch2;
59                  ch2 = sequence.charAt(i);
60                  list.add(Integer.valueOf(ch1 << 16 | ch2));
61              }
62          }
63          return list;
64      }
65  
66      /**
67       * Convert the {@link CharSequence} to a {@link Set} of bigrams (pairs of characters). These are represented using 2 16-bit chars packed into a 32-bit int.
68       *
69       * @param sequence the sequence
70       * @return the set
71       */
72      private static Set<Integer> toBigramSet(final CharSequence sequence) {
73          final int length = sequence.length();
74          final Set<Integer> set = new HashSet<>(length);
75          if (length > 1) {
76              char ch2 = sequence.charAt(0);
77              for (int i = 1; i < length; i++) {
78                  final char ch1 = ch2;
79                  ch2 = sequence.charAt(i);
80                  set.add(Integer.valueOf(ch1 << 16 | ch2));
81              }
82          }
83          return set;
84      }
85  
86      /**
87       * Convert the {@link CharSequence} to a {@link List} of {@link Character}s.
88       *
89       * @param sequence the sequence
90       * @return the list
91       */
92      private static List<Character> toCharacterList(final CharSequence sequence) {
93          final int length = sequence.length();
94          final List<Character> list = new ArrayList<>(length);
95          for (int i = 0; i < length; i++) {
96              list.add(sequence.charAt(i));
97          }
98          return list;
99      }
100 
101     /**
102      * Convert the {@link CharSequence} to a {@link Set} of {@link Character}s.
103      *
104      * @param sequence the sequence
105      * @return the set
106      */
107     private static Set<Character> toCharacterSet(final CharSequence sequence) {
108         final int length = sequence.length();
109         final Set<Character> set = new HashSet<>(length);
110         for (int i = 0; i < length; i++) {
111             set.add(sequence.charAt(i));
112         }
113         return set;
114     }
115 
116     private static int toF1ScorePercent(final IntersectionResult result) {
117         final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
118         // Convert to percentage
119         return (int) Math.round(value * 100);
120     }
121 
122     @Test
123     void testApplyNullNull() {
124         assertThrows(IllegalArgumentException.class, () -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, null));
125     }
126 
127     @Test
128     void testApplyNullString() {
129         assertThrows(IllegalArgumentException.class,
130                 () -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, "right"));
131     }
132 
133     @Test
134     void testApplyStringNull() {
135         assertThrows(IllegalArgumentException.class,
136                 () -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply("left", null));
137     }
138 
139     @Test
140     void testConstructorWithNullConverterThrows() {
141         assertThrows(IllegalArgumentException.class, () -> new IntersectionSimilarity<>(null));
142     }
143 
144     @Test
145     void testF1ScoreUsingListWordBigrams() {
146         // Example of a word letter pairs algorithm by Simon White:
147         // http://www.catalysoft.com/articles/StrikeAMatch.html
148         // This splits into words using whitespace and then computes uppercase
149         // bigrams for each word.
150 
151         // Split on whitespace
152         final Pattern pattern = Pattern.compile("\\s+");
153 
154         // Compute using pairs of characters (bigrams) for each word.
155         // This can be done using a 32-bit int to store two 16-bit characters
156         final Function<CharSequence, Collection<Integer>> converter = cs -> {
157             final List<Integer> set = new ArrayList<>();
158             for (final String word : pattern.split(cs)) {
159                 if (word.length() > 1) {
160                     // The strings are converted to upper case
161                     char ch2 = Character.toUpperCase(word.charAt(0));
162                     for (int i = 1; i < word.length(); i++) {
163                         final char ch1 = ch2;
164                         ch2 = Character.toUpperCase(word.charAt(i));
165                         set.add(Integer.valueOf(ch1 << 16 | ch2));
166                     }
167                 }
168             }
169             return set;
170         };
171         final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
172 
173         String bookTitle;
174         final String search1 = "Web Database Applications";
175         final String search2 = "PHP Web Applications";
176         final String search3 = "Web Aplications";
177         bookTitle = "Web Database Applications with PHP & MySQL";
178         assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
179         assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
180         assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
181         bookTitle = "Creating Database Web Applications with PHP and ASP";
182         assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
183         assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
184         assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
185         bookTitle = "Building Database Applications on the Web Using PHP3";
186         assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
187         assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
188         assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
189         bookTitle = "Building Web Database Applications with Visual Studio 6";
190         assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
191         assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
192         assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
193         bookTitle = "Web Application Development With PHP";
194         assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
195         assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
196         assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
197         bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
198         assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
199         assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
200         assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
201         bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
202         assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
203         assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
204         assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
205         bookTitle = "How to Find a Scholarship Online";
206         assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
207         assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
208         assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
209     }
210 
211     @Test
212     void testIntersectionUsingListBigrams() {
213         // Compute using pairs of characters (bigrams).
214         // This can be done using a 32-bit int to store two 16-bit characters.
215         // This test uses a list and so duplicates should be matched.
216         final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
217 
218         // Expected:
219         // size A or B = sequence length - 1
220         // intersection = count of matching bigrams (include duplicates)
221         assertIntersection(similarity, "", "", 0, 0, 0);
222         assertIntersection(similarity, "a", "", 0, 0, 0);
223         assertIntersection(similarity, "a", "a", 0, 0, 0);
224         assertIntersection(similarity, "a", "b", 0, 0, 0);
225         assertIntersection(similarity, "aa", "ab", 1, 1, 0);
226         assertIntersection(similarity, "ab", "ab", 1, 1, 1);
227         assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
228         assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
229         assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
230         assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
231         assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
232         assertIntersection(similarity, "the same", "the same", 7, 7, 7);
233         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
234     }
235 
236     @Test
237     void testIntersectionUsingListCharacter() {
238         // Compute using single characters.
239         // This test uses a list and so duplicates should be matched.
240         final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
241 
242         // Expected:
243         // size A or B = sequence length
244         // intersection = count of matching characters (include duplicates)
245         assertIntersection(similarity, "", "", 0, 0, 0);
246         assertIntersection(similarity, "a", "", 1, 0, 0);
247         assertIntersection(similarity, "a", "a", 1, 1, 1);
248         assertIntersection(similarity, "a", "b", 1, 1, 0);
249         assertIntersection(similarity, "aa", "ab", 2, 2, 1);
250         assertIntersection(similarity, "ab", "ab", 2, 2, 2);
251         assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
252         assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
253         assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
254         assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
255         assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
256         assertIntersection(similarity, "the same", "the same", 8, 8, 8);
257         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
258     }
259 
260     @Test
261     void testIntersectionUsingSetBigrams() {
262         // Compute using pairs of characters (bigrams).
263         // This can be done using a 32-bit int to store two 16-bit characters.
264         // This test uses a set and so should not allow duplicates.
265         final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
266 
267         // Expected:
268         // size A or B = count of unique bigrams (exclude duplicates)
269         // intersection = count of unique matching bigrams (exclude duplicates)
270         assertIntersection(similarity, "", "", 0, 0, 0);
271         assertIntersection(similarity, "a", "", 0, 0, 0);
272         assertIntersection(similarity, "a", "a", 0, 0, 0);
273         assertIntersection(similarity, "a", "b", 0, 0, 0);
274         assertIntersection(similarity, "aa", "ab", 1, 1, 0);
275         assertIntersection(similarity, "ab", "ab", 1, 1, 1);
276         assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
277         assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
278         assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
279         assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
280         assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
281         assertIntersection(similarity, "the same", "the same", 7, 7, 7);
282         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
283     }
284 
285     @Test
286     void testIntersectionUsingSetCharacter() {
287         // Compute using single characters.
288         // This test uses a set and so should not allow duplicates.
289         final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
290 
291         // Expected:
292         // size A or B = count of unique characters (exclude duplicates)
293         // intersection = count of unique matching characters (exclude duplicates)
294         assertIntersection(similarity, "", "", 0, 0, 0);
295         assertIntersection(similarity, "a", "", 1, 0, 0);
296         assertIntersection(similarity, "a", "a", 1, 1, 1);
297         assertIntersection(similarity, "a", "b", 1, 1, 0);
298         assertIntersection(similarity, "aa", "ab", 1, 2, 1);
299         assertIntersection(similarity, "ab", "ab", 2, 2, 2);
300         assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
301         assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
302         assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
303         assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
304         assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
305         assertIntersection(similarity, "the same", "the same", 7, 7, 7);
306         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
307     }
308 
309     @Test
310     void testIntersectionUsingSetCharacterListCharacter() {
311         // Compute using a custom converter that returns a Set and a List.
312         // This is an edge-case test.
313         final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
314         final String sequence1 = "aabbccdd";
315         final String sequence2 = "aaaaaabbbfffff";
316         converter.put(sequence1, toCharacterSet(sequence1));
317         converter.put(sequence2, toCharacterList(sequence2));
318         final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(converter::get);
319 
320         // Expected:
321         // size A = count of unique characters (exclude duplicates)
322         // size B = sequence length
323         // intersection = count of matching characters (exclude duplicates)
324         assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
325         assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
326     }
327 }