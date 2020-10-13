Hm, isn’t the lookahead assertion kind of redundant at the beginning of the expression, since it just means “anything followed by x”? If I’m not mistaken, the same could be achieved like this:
/(\bbee|\bapple)\S*/g
Anyway, another approach would be to let an actual string-comparison algorithm do the heavy lifting, such as the Levenshtein distance or the Sørensen-Dice coefficient… the latter is probably more useful here, as it gives us a normalized similarity score between 0 and 1 (which is easy to read as a percentage).
This would also allow for fuzzy matches if desired; here’s an example using the Sørensen-Dice-based
string-similarity package (too lazy to reinvent the wheel right now, hehe):
const { findBestMatch } = require('string-similarity')
/**
 * Strip leading and trailing non-word characters from a word.
 *
 * Only the edges are trimmed: "something!" becomes "something", while
 * "someth!ng" is returned unchanged.
 *
 * Letters, combining marks, digits (of any script) and the underscore count
 * as word characters. The ASCII-only `\W` class is deliberately avoided
 * because it would also strip accented or non-Latin letters at the edges
 * (turning "café!" into "caf").
 *
 * @param {string} word - The token to clean up.
 * @returns {string} The token without surrounding punctuation.
 */
function trimPunctuation (word) {
  // `+` instead of `*` avoids pointless empty matches; the result is the same.
  return word.replace(/^[^\p{L}\p{M}\p{N}_]+|[^\p{L}\p{M}\p{N}_]+$/gu, '')
}
/**
 * Find the words of `value` that resemble an entry of `list`.
 *
 * `value` is split on whitespace and each word has its surrounding
 * punctuation removed via `trimPunctuation`. Every word is then rated
 * against a set of candidates with `findBestMatch`:
 *  - default: only the list entries that occur as a substring of the word;
 *  - fuzzy:   the whole list.
 * Words without candidates, or whose best rating is 0, are left out.
 *
 * @param {string[]} list - Terms to look for.
 * @param {string} value - Text to scan.
 * @param {{ fuzzy?: boolean }} [options] - `fuzzy: true` rates every word
 *   against the whole list instead of only the contained terms.
 * @returns {Object<string, { rating: number, target: string }>} Best match
 *   per matching word, keyed by the trimmed word (a repeated word overwrites
 *   its earlier entry).
 */
function getMatches (list, value, { fuzzy = false } = {}) {
  const found = {}

  for (const rawWord of value.split(/\s+/)) {
    const word = trimPunctuation(rawWord)

    let candidates = list
    if (!fuzzy) {
      candidates = list.filter(listed => word.includes(listed))
    }

    // Nothing to compare against; also keeps findBestMatch from being
    // called with an empty target list.
    if (candidates.length === 0) {
      continue
    }

    const best = findBestMatch(word, candidates).bestMatch
    if (best.rating > 0) {
      found[word] = { rating: best.rating, target: best.target }
    }
  }

  return found
}
// Demo data: the terms to look for and a sample sentence to scan.
const blacklist = ['apple', 'bee', 'like']
const input = 'the beer drinking bee does not l!ke apple-cider'

// Default mode: a word is only rated against the terms it literally
// contains, so the obfuscated "l!ke" is not reported.
const strictMatches = getMatches(blacklist, input)
console.log(strictMatches)
// {
//   beer: { rating: 0.8, target: 'bee' },
//   bee: { rating: 1, target: 'bee' },
//   'apple-cider': { rating: 0.5714285714285714, target: 'apple' }
// }

// Fuzzy mode: every word is rated against the whole list, so "l!ke" is now
// picked up as a (weak) match for "like".
const fuzzyMatches = getMatches(blacklist, input, { fuzzy: true })
console.log(fuzzyMatches)
// {
//   beer: { rating: 0.8, target: 'bee' },
//   bee: { rating: 1, target: 'bee' },
//   'l!ke': { rating: 0.3333333333333333, target: 'like' },
//   'apple-cider': { rating: 0.5714285714285714, target: 'apple' }
// }