1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.text.similarity;
18
19 import static org.junit.jupiter.api.Assertions.assertEquals;
20 import static org.junit.jupiter.api.Assertions.assertThrows;
21
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Set;
29 import java.util.function.Function;
30 import java.util.regex.Pattern;
31
32 import org.junit.jupiter.api.Test;
33
34
35
36
37 class IntersectionSimilarityTest {
38 private static <T> void assertIntersection(final IntersectionSimilarity<T> similarity, final CharSequence cs1, final CharSequence cs2, final int sizeA,
39 final int sizeB, final int intersection) {
40 final IntersectionResult result = similarity.apply(cs1, cs2);
41 assertEquals(sizeA, result.getSizeA(), "Size A error");
42 assertEquals(sizeB, result.getSizeB(), "Size B error");
43 assertEquals(intersection, result.getIntersection(), "Intersection error");
44 }
45
46
47
48
49
50
51
52 private static List<Integer> toBigramList(final CharSequence sequence) {
53 final int length = sequence.length();
54 final List<Integer> list = new ArrayList<>(length);
55 if (length > 1) {
56 char ch2 = sequence.charAt(0);
57 for (int i = 1; i < length; i++) {
58 final char ch1 = ch2;
59 ch2 = sequence.charAt(i);
60 list.add(Integer.valueOf(ch1 << 16 | ch2));
61 }
62 }
63 return list;
64 }
65
66
67
68
69
70
71
72 private static Set<Integer> toBigramSet(final CharSequence sequence) {
73 final int length = sequence.length();
74 final Set<Integer> set = new HashSet<>(length);
75 if (length > 1) {
76 char ch2 = sequence.charAt(0);
77 for (int i = 1; i < length; i++) {
78 final char ch1 = ch2;
79 ch2 = sequence.charAt(i);
80 set.add(Integer.valueOf(ch1 << 16 | ch2));
81 }
82 }
83 return set;
84 }
85
86
87
88
89
90
91
92 private static List<Character> toCharacterList(final CharSequence sequence) {
93 final int length = sequence.length();
94 final List<Character> list = new ArrayList<>(length);
95 for (int i = 0; i < length; i++) {
96 list.add(sequence.charAt(i));
97 }
98 return list;
99 }
100
101
102
103
104
105
106
107 private static Set<Character> toCharacterSet(final CharSequence sequence) {
108 final int length = sequence.length();
109 final Set<Character> set = new HashSet<>(length);
110 for (int i = 0; i < length; i++) {
111 set.add(sequence.charAt(i));
112 }
113 return set;
114 }
115
116 private static int toF1ScorePercent(final IntersectionResult result) {
117 final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
118
119 return (int) Math.round(value * 100);
120 }
121
122 @Test
123 void testApplyNullNull() {
124 assertThrows(IllegalArgumentException.class, () -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, null));
125 }
126
127 @Test
128 void testApplyNullString() {
129 assertThrows(IllegalArgumentException.class,
130 () -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, "right"));
131 }
132
133 @Test
134 void testApplyStringNull() {
135 assertThrows(IllegalArgumentException.class,
136 () -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply("left", null));
137 }
138
139 @Test
140 void testConstructorWithNullConverterThrows() {
141 assertThrows(IllegalArgumentException.class, () -> new IntersectionSimilarity<>(null));
142 }
143
144 @Test
145 void testF1ScoreUsingListWordBigrams() {
146
147
148
149
150
151
152 final Pattern pattern = Pattern.compile("\\s+");
153
154
155
156 final Function<CharSequence, Collection<Integer>> converter = cs -> {
157 final List<Integer> set = new ArrayList<>();
158 for (final String word : pattern.split(cs)) {
159 if (word.length() > 1) {
160
161 char ch2 = Character.toUpperCase(word.charAt(0));
162 for (int i = 1; i < word.length(); i++) {
163 final char ch1 = ch2;
164 ch2 = Character.toUpperCase(word.charAt(i));
165 set.add(Integer.valueOf(ch1 << 16 | ch2));
166 }
167 }
168 }
169 return set;
170 };
171 final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
172
173 String bookTitle;
174 final String search1 = "Web Database Applications";
175 final String search2 = "PHP Web Applications";
176 final String search3 = "Web Aplications";
177 bookTitle = "Web Database Applications with PHP & MySQL";
178 assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
179 assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
180 assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
181 bookTitle = "Creating Database Web Applications with PHP and ASP";
182 assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
183 assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
184 assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
185 bookTitle = "Building Database Applications on the Web Using PHP3";
186 assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
187 assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
188 assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
189 bookTitle = "Building Web Database Applications with Visual Studio 6";
190 assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
191 assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
192 assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
193 bookTitle = "Web Application Development With PHP";
194 assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
195 assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
196 assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
197 bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
198 assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
199 assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
200 assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
201 bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
202 assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
203 assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
204 assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
205 bookTitle = "How to Find a Scholarship Online";
206 assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
207 assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
208 assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
209 }
210
211 @Test
212 void testIntersectionUsingListBigrams() {
213
214
215
216 final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
217
218
219
220
221 assertIntersection(similarity, "", "", 0, 0, 0);
222 assertIntersection(similarity, "a", "", 0, 0, 0);
223 assertIntersection(similarity, "a", "a", 0, 0, 0);
224 assertIntersection(similarity, "a", "b", 0, 0, 0);
225 assertIntersection(similarity, "aa", "ab", 1, 1, 0);
226 assertIntersection(similarity, "ab", "ab", 1, 1, 1);
227 assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
228 assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
229 assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
230 assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
231 assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
232 assertIntersection(similarity, "the same", "the same", 7, 7, 7);
233 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
234 }
235
236 @Test
237 void testIntersectionUsingListCharacter() {
238
239
240 final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
241
242
243
244
245 assertIntersection(similarity, "", "", 0, 0, 0);
246 assertIntersection(similarity, "a", "", 1, 0, 0);
247 assertIntersection(similarity, "a", "a", 1, 1, 1);
248 assertIntersection(similarity, "a", "b", 1, 1, 0);
249 assertIntersection(similarity, "aa", "ab", 2, 2, 1);
250 assertIntersection(similarity, "ab", "ab", 2, 2, 2);
251 assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
252 assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
253 assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
254 assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
255 assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
256 assertIntersection(similarity, "the same", "the same", 8, 8, 8);
257 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
258 }
259
260 @Test
261 void testIntersectionUsingSetBigrams() {
262
263
264
265 final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
266
267
268
269
270 assertIntersection(similarity, "", "", 0, 0, 0);
271 assertIntersection(similarity, "a", "", 0, 0, 0);
272 assertIntersection(similarity, "a", "a", 0, 0, 0);
273 assertIntersection(similarity, "a", "b", 0, 0, 0);
274 assertIntersection(similarity, "aa", "ab", 1, 1, 0);
275 assertIntersection(similarity, "ab", "ab", 1, 1, 1);
276 assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
277 assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
278 assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
279 assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
280 assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
281 assertIntersection(similarity, "the same", "the same", 7, 7, 7);
282 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
283 }
284
285 @Test
286 void testIntersectionUsingSetCharacter() {
287
288
289 final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
290
291
292
293
294 assertIntersection(similarity, "", "", 0, 0, 0);
295 assertIntersection(similarity, "a", "", 1, 0, 0);
296 assertIntersection(similarity, "a", "a", 1, 1, 1);
297 assertIntersection(similarity, "a", "b", 1, 1, 0);
298 assertIntersection(similarity, "aa", "ab", 1, 2, 1);
299 assertIntersection(similarity, "ab", "ab", 2, 2, 2);
300 assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
301 assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
302 assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
303 assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
304 assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
305 assertIntersection(similarity, "the same", "the same", 7, 7, 7);
306 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
307 }
308
309 @Test
310 void testIntersectionUsingSetCharacterListCharacter() {
311
312
313 final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
314 final String sequence1 = "aabbccdd";
315 final String sequence2 = "aaaaaabbbfffff";
316 converter.put(sequence1, toCharacterSet(sequence1));
317 converter.put(sequence2, toCharacterList(sequence2));
318 final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(converter::get);
319
320
321
322
323
324 assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
325 assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
326 }
327 }