Repository URL to install this package:
|
Version:
0.1.31-1 ▾
|
odigos-demo-inventory
/
opt
/
odigos-demo-inventory
/
site-packages
/
rapidfuzz
/
distance
/
Jaro_py.py
|
|---|
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
def _jaro_calculate_similarity(pattern_len, text_len, common_chars, transpositions):
transpositions //= 2
sim = 0.0
sim += common_chars / pattern_len
sim += common_chars / text_len
sim += (common_chars - transpositions) / common_chars
return sim / 3.0
def _jaro_length_filter(pattern_len, text_len, score_cutoff):
"""
filter matches below score_cutoff based on string lengths
"""
if not pattern_len or not text_len:
return False
sim = _jaro_calculate_similarity(pattern_len, text_len, min(pattern_len, text_len), 0)
return sim >= score_cutoff
def _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
"""
filter matches below score_cutoff based on string lengths and common characters
"""
if not common_chars:
return False
sim = _jaro_calculate_similarity(pattern_len, text_len, common_chars, 0)
return sim >= score_cutoff
def _jaro_bounds(s1, s2):
"""
find bounds and skip out of bound parts of the sequences
"""
pattern_len = len(s1)
text_len = len(s2)
# since jaro uses a sliding window some parts of T/P might never be in
# range an can be removed ahead of time
bound = 0
if text_len > pattern_len:
bound = text_len // 2 - 1
if text_len > pattern_len + bound:
s2 = s2[: pattern_len + bound]
else:
bound = pattern_len // 2 - 1
if pattern_len > text_len + bound:
s1 = s1[: text_len + bound]
return s1, s2, bound
def similarity(
s1,
s2,
*,
processor=None,
score_cutoff=None,
):
"""
Calculates the jaro similarity
Parameters
----------
s1 : Sequence[Hashable]
First string to compare.
s2 : Sequence[Hashable]
Second string to compare.
processor: callable, optional
Optional callable that is used to preprocess the strings before
comparing them. Default is None, which deactivates this behaviour.
score_cutoff : float, optional
Optional argument for a score threshold as a float between 0 and 1.0.
For ratio < score_cutoff 0 is returned instead. Default is None,
which deactivates this behaviour.
Returns
-------
similarity : float
similarity between s1 and s2 as a float between 0 and 1.0
"""
setupPandas()
if is_none(s1) or is_none(s2):
return 0.0
if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
if not s1 and not s2:
return 1.0
if score_cutoff is None:
score_cutoff = 0
s1, s2 = conv_sequences(s1, s2)
pattern_len = len(s1)
text_len = len(s2)
# short circuit if score_cutoff can not be reached
if not _jaro_length_filter(pattern_len, text_len, score_cutoff):
return 0
if pattern_len == 1 and text_len == 1:
return float(s1[0] == s2[0])
s1, s2, bound = _jaro_bounds(s1, s2)
s1_flags = [False] * pattern_len
s2_flags = [False] * text_len
# todo use bitparallel implementation
# looking only within search range, count & flag matched pairs
common_chars = 0
for i, s1_ch in enumerate(s1):
low = max(0, i - bound)
hi = min(i + bound, text_len - 1)
for j in range(low, hi + 1):
if not s2_flags[j] and s2[j] == s1_ch:
s1_flags[i] = s2_flags[j] = True
common_chars += 1
break
# short circuit if score_cutoff can not be reached
if not _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
return 0
# todo use bitparallel implementation
# count transpositions
k = trans_count = 0
for i, s1_f in enumerate(s1_flags):
if s1_f:
for j in range(k, text_len):
if s2_flags[j]:
k = j + 1
break
if s1[i] != s2[j]:
trans_count += 1
return _jaro_calculate_similarity(pattern_len, text_len, common_chars, trans_count)
def normalized_similarity(
s1,
s2,
*,
processor=None,
score_cutoff=None,
):
"""
Calculates the normalized jaro similarity
Parameters
----------
s1 : Sequence[Hashable]
First string to compare.
s2 : Sequence[Hashable]
Second string to compare.
processor: callable, optional
Optional callable that is used to preprocess the strings before
comparing them. Default is None, which deactivates this behaviour.
score_cutoff : float, optional
Optional argument for a score threshold as a float between 0 and 1.0.
For ratio < score_cutoff 0 is returned instead. Default is None,
which deactivates this behaviour.
Returns
-------
normalized similarity : float
normalized similarity between s1 and s2 as a float between 0 and 1.0
"""
return similarity(s1, s2, processor=processor, score_cutoff=score_cutoff)
def distance(
s1,
s2,
*,
processor=None,
score_cutoff=None,
):
"""
Calculates the jaro distance
Parameters
----------
s1 : Sequence[Hashable]
First string to compare.
s2 : Sequence[Hashable]
Second string to compare.
processor: callable, optional
Optional callable that is used to preprocess the strings before
comparing them. Default is None, which deactivates this behaviour.
score_cutoff : float, optional
Optional argument for a score threshold as a float between 0 and 1.0.
For ratio < score_cutoff 0 is returned instead. Default is None,
which deactivates this behaviour.
Returns
-------
distance : float
distance between s1 and s2 as a float between 1.0 and 0.0
"""
setupPandas()
if is_none(s1) or is_none(s2):
return 1.0
if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
cutoff_distance = None if (score_cutoff is None or score_cutoff > 1.0) else 1.0 - score_cutoff
sim = similarity(s1, s2, score_cutoff=cutoff_distance)
dist = 1.0 - sim
return dist if (score_cutoff is None or dist <= score_cutoff) else 1.0
def normalized_distance(
s1,
s2,
*,
processor=None,
score_cutoff=None,
):
"""
Calculates the normalized jaro distance
Parameters
----------
s1 : Sequence[Hashable]
First string to compare.
s2 : Sequence[Hashable]
Second string to compare.
processor: callable, optional
Optional callable that is used to preprocess the strings before
comparing them. Default is None, which deactivates this behaviour.
score_cutoff : float, optional
Optional argument for a score threshold as a float between 0 and 1.0.
For ratio < score_cutoff 0 is returned instead. Default is None,
which deactivates this behaviour.
Returns
-------
normalized distance : float
normalized distance between s1 and s2 as a float between 1.0 and 0.0
"""
return distance(s1, s2, processor=processor, score_cutoff=score_cutoff)