|
| 1 | +/* The Sørensen–Dice coefficient is a statistic used to gauge the similarity of two samples. |
| 2 | + * Applied to strings, it yields a value between 0 and 1 (inclusive) that tells you how similar they are. |
| 3 | + * The Dice coefficient is calculated by comparing the bigrams of both strings; |
| 4 | + * a bigram is a substring of length 2. |
| 5 | + * read more: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient |
| 6 | + */ |
| 7 | + |
| 8 | +// Time complexity: O(m + n), m and n being the sizes of string A and string B |
| 9 | + |
/**
 * Build a frequency table of the bigrams (two-character substrings) of a string.
 *
 * @param {string} string - Text to split into overlapping bigrams.
 * @returns {Map<string, number>} Map from each bigram to the number of times it occurs,
 *   in order of first appearance. Empty when the string has fewer than two characters.
 */
function mapBigrams (string) {
  const occurrences = new Map()
  // The last bigram starts at the second-to-last character.
  const lastStart = string.length - 2
  for (let start = 0; start <= lastStart; start++) {
    const pair = string.substring(start, start + 2)
    const seen = occurrences.has(pair) ? occurrences.get(pair) : 0
    occurrences.set(pair, seen + 1)
  }
  return occurrences
}
| 20 | + |
/**
 * Count the bigrams shared between a bigram frequency map and a string,
 * respecting multiplicity (multiset intersection).
 *
 * Each occurrence recorded in `bigrams` can be matched by at most one bigram of
 * `string`, so the result never exceeds the total count held in the map nor the
 * number of bigrams in `string`.
 *
 * @param {Map<string, number>} bigrams - Map from bigram to occurrence count. It is not modified.
 * @param {string} string - Text whose bigrams are matched against the map.
 * @returns {number} Number of matched bigram occurrences.
 */
function countCommonBigrams (bigrams, string) {
  // Consume occurrences from a copy so the caller's map stays intact.
  const remaining = new Map(bigrams)
  let count = 0
  for (let i = 0; i < string.length - 1; i++) {
    const bigram = string.substring(i, i + 2)
    const available = remaining.get(bigram)
    // `undefined > 0` is false, so bigrams missing from the map are skipped.
    if (available > 0) {
      remaining.set(bigram, available - 1)
      count++
    }
  }
  return count
}
| 31 | + |
/**
 * Compute the Sørensen–Dice coefficient of two strings from their character bigrams.
 *
 * Identical strings score 1. Otherwise, if either string is shorter than two
 * characters (and so has no bigrams), the score is 0. In every other case the
 * score is 2 * common / (bigrams in A + bigrams in B), truncated to two decimals,
 * where `common` is whatever countCommonBigrams reports for the two inputs.
 *
 * @param {string} stringA - First string to compare.
 * @param {string} stringB - Second string to compare.
 * @returns {number} The coefficient, truncated (not rounded) to two decimal places.
 */
function diceCoefficient (stringA, stringB) {
  if (stringA === stringB) return 1
  if (stringA.length < 2 || stringB.length < 2) return 0

  const bigramsA = mapBigrams(stringA)

  // A string of length n contains n - 1 overlapping bigrams.
  const lengthA = stringA.length - 1
  const lengthB = stringB.length - 1

  const common = countCommonBigrams(bigramsA, stringB)

  // Truncate 0.xxxxxx to 0.xx. The numerator is scaled by 100 *before* dividing so
  // the floor is taken on an integer ratio; flooring `ratio * 100` instead can lose
  // a hundredth to floating-point error (58 / 200 * 100 === 28.999999999999996).
  return Math.floor((200 * common) / (lengthA + lengthB)) / 100
}
| 51 | +export { diceCoefficient } |
0 commit comments