|
| 1 | +""" |
| 2 | +The Jaccard similarity coefficient is a commonly used indicator of the |
| 3 | +similarity between two sets. Let U be a set and A and B be subsets of U, |
| 4 | +then the Jaccard index/similarity is defined to be the ratio of the number |
| 5 | +of elements of their intersection and the number of elements of their union. |
| 6 | +
|
| 7 | +Inspired by Wikipedia and |
| 8 | +the book Mining of Massive Datasets [MMDS 2nd Edition, Chapter 3] |
| 9 | +
|
| 10 | +https://en.wikipedia.org/wiki/Jaccard_index |
| 11 | +https://mmds.org |
| 12 | +
|
| 13 | +Jaccard similarity is widely used with MinHashing. |
| 14 | +""" |
| 15 | + |
| 16 | + |
def jaccard_similariy(setA, setB, alternativeUnion=False):
    """
    Find the Jaccard similarity between two collections.

    Essentially, it's intersection over union.

    The alternative way to calculate this is to take union as sum of the
    number of items in the two sets. This will lead to jaccard similarity
    of a set with itself be 1/2 instead of 1. [MMDS 2nd Edition, Page 77]

    Parameters:
        :setA (set,frozenset,list,tuple): A non-empty collection
        :setB (set,frozenset,list,tuple): A non-empty collection
        :alternativeUnion (boolean): If True, use sum of number of
                                     items as union

    Output:
        (float) The jaccard similarity between the two collections.

    Raises:
        TypeError: if either argument is not a set, frozenset, list or tuple.
        ZeroDivisionError: if the union is empty (both inputs are empty).

    Notes:
        * When both inputs are lists/tuples the elements are compared with
          ``in`` (so unhashable elements are allowed) and duplicates are
          counted as they appear, exactly like the element-wise definition
          used for sequences in this module.
        * When at least one input is a set/frozenset, both inputs are
          treated as sets (elements must then be hashable).

    Examples:
    >>> setA = {'a', 'b', 'c', 'd', 'e'}
    >>> setB = {'c', 'd', 'e', 'f', 'h', 'i'}
    >>> jaccard_similariy(setA,setB)
    0.375

    >>> jaccard_similariy(setA,setA)
    1.0

    >>> jaccard_similariy(setA,setA,True)
    0.5

    >>> setA = ['a', 'b', 'c', 'd', 'e']
    >>> setB = ('c', 'd', 'e', 'f', 'h', 'i')
    >>> jaccard_similariy(setA,setB)
    0.375

    A tuple may also come first, and sets may be mixed with sequences:

    >>> jaccard_similariy(setB,setA)
    0.375
    >>> jaccard_similariy(set(setA),setB)
    0.375
    """
    set_types = (set, frozenset)
    sequence_types = (list, tuple)
    supported_types = set_types + sequence_types

    # Fail loudly instead of silently returning None for unsupported input.
    if not isinstance(setA, supported_types) or not isinstance(
        setB, supported_types
    ):
        raise TypeError(
            "setA and setB must each be a set, frozenset, list or tuple"
        )

    if isinstance(setA, set_types) or isinstance(setB, set_types):
        # At least one real set: normalise the other operand so that mixed
        # set/sequence input is handled rather than falling through.
        first = setA if isinstance(setA, set_types) else set(setA)
        second = setB if isinstance(setB, set_types) else set(setB)

        intersection_size = len(first & second)
        if alternativeUnion:
            union_size = len(first) + len(second)
        else:
            union_size = len(first | second)

        return intersection_size / union_size

    # Both operands are lists/tuples. Only sizes are needed, so count the
    # matching elements instead of concatenating sequences (concatenating a
    # tuple with a list raises TypeError).
    intersection_size = sum(1 for element in setA if element in setB)
    if alternativeUnion:
        union_size = len(setA) + len(setB)
    else:
        union_size = len(setA) + sum(
            1 for element in setB if element not in setA
        )

    return intersection_size / union_size
| 74 | + |
| 75 | + |
if __name__ == "__main__":
    # Small demo: print the similarity of two overlapping letter sets.
    setA, setB = (
        {"a", "b", "c", "d", "e"},
        {"c", "d", "e", "f", "h", "i"},
    )
    similarity = jaccard_similariy(setA, setB)
    print(similarity)
0 commit comments