v/vlib/strings/similarity.v

225 lines
5.4 KiB
V
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

module strings
// min returns the smallest of the three integers `a`, `b` and `c`.
@[inline]
fn min(a int, b int, c int) int {
	smaller_ab := if b < a { b } else { a }
	return if c < smaller_ab { c } else { smaller_ab }
}
// max2 returns the larger of the two integers `a` and `b`.
@[inline]
fn max2(a int, b int) int {
	return if b > a { b } else { a }
}
// min2 returns the smaller of the two integers `a` and `b`.
@[inline]
fn min2(a int, b int) int {
	return if b > a { a } else { b }
}
// abs2 returns the absolute difference between the integers `a` and `b`.
@[inline]
fn abs2(a int, b int) int {
	return if b > a { b - a } else { a - b }
}
// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
// the distance between two strings `a` and `b` (lower is closer).
// The strings are compared byte by byte, and only a single row of
// `a.len + 1` costs is kept in memory instead of the full matrix.
@[direct_array_access]
pub fn levenshtein_distance(a string, b string) int {
	// The distance to an empty string is the length of the other string.
	if a.len == 0 {
		return b.len
	}
	if b.len == 0 {
		return a.len
	}
	if a == b {
		return 0
	}
	width := a.len
	// costs[j] starts as the distance between the first j bytes of `a` and an empty string.
	mut costs := []int{len: width + 1, init: index}
	for bi in 1 .. b.len + 1 {
		b_byte := b[bi - 1]
		// `left` holds the cell of the current row that sits left of the one being computed.
		mut left := bi
		for ai in 1 .. width + 1 {
			// costs[ai - 1] still holds the previous row here, i.e. the diagonal cell.
			diagonal := costs[ai - 1]
			mut cell := diagonal // match
			if b_byte != a[ai - 1] {
				// insertion, substitution, deletion
				cell = min(diagonal + 1, left + 1, costs[ai] + 1)
			}
			// The diagonal value is no longer needed, so its slot takes the current row's value.
			costs[ai - 1] = left
			left = cell
		}
		costs[width] = left
	}
	return costs[width]
}
// levenshtein_distance_percentage uses the Levenshtein Distance algorithm to
// calculate how similar two strings are as a percentage (higher is closer).
// The distance is scaled by the length of the longer string.
// Two empty strings are treated as identical and yield 100.0.
pub fn levenshtein_distance_percentage(a string, b string) f32 {
	longest := if a.len >= b.len { a.len } else { b.len }
	if longest == 0 {
		// Both strings are empty: without this guard the result would be 0/0 (NaN).
		return 100.0
	}
	d := levenshtein_distance(a, b)
	return (1.00 - f32(d) / f32(longest)) * 100.00
}
// dice_coefficient implements the Sørensen–Dice coefficient.
// It compares the 2-byte substrings (bigrams) of `s1` and `s2`, and returns
// a coefficient between 0.0 (not similar) and 1.0 (exact match).
pub fn dice_coefficient(s1 string, s2 string) f32 {
	if s1.len == 0 || s2.len == 0 {
		return 0.0
	}
	if s1 == s2 {
		return 1.0
	}
	// A string shorter than 2 bytes has no bigram to compare.
	if s1.len < 2 || s2.len < 2 {
		return 0.0
	}
	// The strings differ at this point; `s1` is the longer one only when strictly longer.
	s1_is_longer := s1.len > s2.len
	longer := if s1_is_longer { s1 } else { s2 }
	shorter := if s1_is_longer { s2 } else { s1 }
	// Count how often each bigram occurs in the longer string.
	mut bigram_counts := map[string]int{}
	for i in 0 .. longer.len - 1 {
		bigram_counts[longer[i..i + 2]]++
	}
	mut shared := 0
	for i in 0 .. shorter.len - 1 {
		bigram := shorter[i..i + 2]
		// A bigram that was never counted reads as 0 and is skipped.
		remaining := bigram_counts[bigram]
		if remaining <= 0 {
			continue
		}
		// Consume one occurrence, so a repeated bigram only matches as often as it was counted.
		bigram_counts[bigram] = remaining - 1
		shared++
	}
	return (2.0 * f32(shared)) / (f32(longer.len) + f32(shorter.len) - 2)
}
// hamming_distance uses the Hamming Distance algorithm to calculate
// the distance between two strings `a` and `b` (lower is closer).
// Bytes are compared position by position over the shared length, and every
// byte by which one string is longer than the other adds 1 to the distance.
@[direct_array_access]
pub fn hamming_distance(a string, b string) int {
	shared_len := min2(a.len, b.len)
	// Start from the length difference, then add one per mismatching position.
	mut distance := abs2(a.len, b.len)
	for pos in 0 .. shared_len {
		if a[pos] != b[pos] {
			distance++
		}
	}
	return distance
}
// hamming_similarity uses the Hamming Distance algorithm to rate how similar
// the two strings `a` and `b` are, relative to the length of the longer one.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
pub fn hamming_similarity(a string, b string) f32 {
	longest := max2(a.len, b.len)
	if longest != 0 {
		return 1.00 - f32(hamming_distance(a, b)) / f32(longest)
	}
	// Both are empty strings, which counts as an exact match.
	return 1.0
}
// jaro_similarity uses the Jaro similarity algorithm to rate how similar
// the two strings `a` and `b` are, comparing them byte by byte.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// Two empty strings yield 1.0; exactly one empty string yields 0.
@[direct_array_access]
pub fn jaro_similarity(a string, b string) f64 {
	a_len := a.len
	b_len := b.len
	if a_len == 0 && b_len == 0 {
		// Both are empty strings, should return 1.0
		return 1.0
	}
	if a_len == 0 || b_len == 0 {
		return 0
	}
	// Maximum distance up to which matching is allowed.
	// For two 1-byte strings the formula yields -1, which would make every search
	// window empty (so even 'a' vs 'a' would score 0); clamp it to 0 instead.
	match_distance := max2(0, max2(a_len, b_len) / 2 - 1)
	mut a_matches := []bool{len: a_len}
	mut b_matches := []bool{len: b_len}
	mut matches := 0
	mut transpositions := 0.0
	// Traverse through the first string
	for i in 0 .. a_len {
		// Only positions of `b` within `match_distance` of `i` are candidates.
		start := max2(0, i - match_distance)
		end := min2(b_len, i + match_distance + 1)
		for k in start .. end {
			// Skip positions of `b` that are already paired with another byte of `a`
			if b_matches[k] {
				continue
			}
			if a[i] != b[k] {
				continue
			}
			// First free equal byte in the window: pair them up
			a_matches[i] = true
			b_matches[k] = true
			matches++
			break
		}
	}
	// If there is no match
	if matches == 0 {
		return 0
	}
	mut k := 0
	// Walk the matched bytes of both strings in order, and count the pairs
	// that hold different bytes (each transposition is seen twice).
	for i in 0 .. a_len {
		if !a_matches[i] {
			continue
		}
		// Find the next matched character in second string.
		// Both strings have the same number of matched positions, so `k` stays in range.
		for !b_matches[k] {
			k++
		}
		if a[i] != b[k] {
			transpositions++
		}
		k++
	}
	transpositions /= 2
	m := f64(matches)
	return (m / f64(a_len) + m / f64(b_len) + (m - transpositions) / m) / 3
}
// jaro_winkler_similarity uses the Jaro Winkler algorithm to rate how similar
// the two strings `a` and `b` are.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
// similarities, making it especially effective for cases where slight misspellings
// or prefixes are common.
@[direct_array_access]
pub fn jaro_winkler_similarity(a string, b string) f64 {
	// Maximum of 4 characters are allowed in prefix
	lmax := min2(4, min2(a.len, b.len))
	// `l` is the length of the common prefix, so counting stops at the first
	// mismatch; equal bytes after a mismatch are not part of the prefix.
	mut l := 0
	for i in 0 .. lmax {
		if a[i] != b[i] {
			break
		}
		l++
	}
	js := jaro_similarity(a, b)
	// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
	p := 0.1
	return js + f64(l) * p * (1 - js)
}