View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import java.util.Collection;
20  import java.util.HashMap;
21  import java.util.Map;
22  import java.util.Map.Entry;
23  import java.util.Set;
24  import java.util.function.Function;
25  
26  /**
27   * Measures the intersection of two sets created from a pair of character sequences.
28   *
29   * <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage
30   * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
31   * {@link Object#equals(Object)} and {@link Object#hashCode()}.</p>
32   *
33   * @param <T> the type of the elements extracted from the character sequence
34   * @since 1.7
35   * @see Set
36   * @see HashMap
37   */
38  public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> {
39  
40      /**
41       * Mutable counter class for storing the count of elements.
42       */
43      private static final class BagCount {
44  
45          /** Private, mutable but must be used as immutable. */
46          private static final BagCount ZERO = new BagCount();
47  
48          /** The count. */
49          int count;
50  
51          private BagCount() {
52              this.count = 0;
53          }
54      }
55  
56      // The following is adapted from commons-collections for a Bag.
57      // A Bag is a collection that can store the count of the number
58      // of copies of each element.
59  
60      /**
61       * A minimal implementation of a Bag that can store elements and a count.
62       *
63       * <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not
64       * even have to know its own size.
65       */
66      private class TinyBag {
67          /** The backing map. */
68          private final Map<T, BagCount> map;
69  
70          /**
71           * Create a new tiny bag.
72           *
73           * @param initialCapacity the initial capacity
74           */
75          TinyBag(final int initialCapacity) {
76              map = new HashMap<>(initialCapacity);
77          }
78  
79          /**
80           * Adds a new element to the bag, incrementing its count in the underlying map.
81           *
82           * @param object the object to add
83           */
84          void add(final T object) {
85              map.computeIfAbsent(object, k -> new BagCount()).count++;
86          }
87  
88          /**
89           * Returns a Set view of the mappings contained in this bag.
90           *
91           * @return The Set view
92           */
93          Set<Entry<T, BagCount>> entrySet() {
94              return map.entrySet();
95          }
96  
97          /**
98           * Returns the number of occurrence of the given element in this bag by
99           * looking up its count in the underlying map.
100          *
101          * @param object the object to search for
102          * @return The number of occurrences of the object, zero if not found
103          */
104         int getCount(final Object object) {
105             return map.getOrDefault(object, BagCount.ZERO).count;
106         }
107 
108         /**
109          * Gets the number of unique elements in the bag.
110          *
111          * @return The unique element size
112          */
113         int uniqueElementSize() {
114             return map.size();
115         }
116     }
117 
118     /**
119      * Computes the intersection between two sets. This is the count of all the elements
120      * that are within both sets.
121      *
122      * @param <T> the type of the elements in the set
123      * @param setA the set A
124      * @param setB the set B
125      * @return The intersection
126      */
127     private static <T> int getIntersection(final Set<T> setA, final Set<T> setB) {
128         int intersection = 0;
129         for (final T element : setA) {
130             if (setB.contains(element)) {
131                 intersection++;
132             }
133         }
134         return intersection;
135     }
136 
137     /** The converter used to create the elements from the characters. */
138     private final Function<CharSequence, Collection<T>> converter;
139 
140     /**
141      * Create a new intersection similarity using the provided converter.
142      *
143      * <p>
144      * If the converter returns a {@link Set} then the intersection result will
145      * not include duplicates. Any other {@link Collection} is used to produce a result
146      * that will include duplicates in the intersect and union.
147      * </p>
148      *
149      * @param converter the converter used to create the elements from the characters
150      * @throws IllegalArgumentException if the converter is null
151      */
152     public IntersectionSimilarity(final Function<CharSequence, Collection<T>> converter) {
153         if (converter == null) {
154             throw new IllegalArgumentException("Converter must not be null");
155         }
156         this.converter = converter;
157     }
158 
159     /**
160      * Calculates the intersection of two character sequences passed as input.
161      *
162      * @param left first character sequence
163      * @param right second character sequence
164      * @return The intersection result
165      * @throws IllegalArgumentException if either input sequence is {@code null}
166      */
167     @Override
168     public IntersectionResult apply(final CharSequence left, final CharSequence right) {
169         if (left == null || right == null) {
170             throw new IllegalArgumentException("Input cannot be null");
171         }
172 
173         // Create the elements from the sequences
174         final Collection<T> objectsA = converter.apply(left);
175         final Collection<T> objectsB = converter.apply(right);
176         final int sizeA = objectsA.size();
177         final int sizeB = objectsB.size();
178 
179         // Short-cut if either collection is empty
180         if (Math.min(sizeA, sizeB) == 0) {
181             // No intersection
182             return new IntersectionResult(sizeA, sizeB, 0);
183         }
184 
185         // Intersection = count the number of shared elements
186         final int intersection;
187         if (objectsA instanceof Set && objectsB instanceof Set) {
188             // If a Set then the elements will only have a count of 1.
189             // Iterate over the smaller set.
190             intersection = sizeA < sizeB
191                     ? getIntersection((Set<T>) objectsA, (Set<T>) objectsB)
192                     : getIntersection((Set<T>) objectsB, (Set<T>) objectsA);
193         } else  {
194             // Create a bag for each collection
195             final TinyBag bagA = toBag(objectsA);
196             final TinyBag bagB = toBag(objectsB);
197             // Iterate over the smaller number of unique elements
198             intersection = bagA.uniqueElementSize() < bagB.uniqueElementSize()
199                     ? getIntersection(bagA, bagB)
200                     : getIntersection(bagB, bagA);
201         }
202 
203         return new IntersectionResult(sizeA, sizeB, intersection);
204     }
205 
206     /**
207      * Computes the intersection between two bags. This is the sum of the minimum
208      * count of each element that is within both sets.
209      *
210      * @param bagA the bag A
211      * @param bagB the bag B
212      * @return The intersection
213      */
214     private int getIntersection(final TinyBag bagA, final TinyBag bagB) {
215         int intersection = 0;
216         for (final Entry<T, BagCount> entry : bagA.entrySet()) {
217             final T element = entry.getKey();
218             final int count = entry.getValue().count;
219             // The intersection of this entry in both bags is the minimum count
220             intersection += Math.min(count, bagB.getCount(element));
221         }
222         return intersection;
223     }
224 
225     /**
226      * Converts the collection to a bag. The bag will contain the count of each element
227      * in the collection.
228      *
229      * @param objects the objects
230      * @return The bag
231      */
232     private TinyBag toBag(final Collection<T> objects) {
233         final TinyBag bag = new TinyBag(objects.size());
234         objects.forEach(bag::add);
235         return bag;
236     }
237 }