Repository URL to install this package:
|
Version:
0.1.31-1 ▾
|
odigos-demo-inventory
/
opt
/
odigos-demo-inventory
/
site-packages
/
rapidfuzz
/
distance
/
JaroWinkler_py.py
|
|---|
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance import Jaro_py as Jaro
def similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0 and 1.0. Default is 0.1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For similarity < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0.
        0.0 is returned when ``is_none`` reports either input as missing.

    Raises
    ------
    ValueError
        If prefix_weight is outside the range 0.0 - 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    if prefix_weight > 1.0 or prefix_weight < 0.0:
        msg = "prefix_weight has to be in the range 0.0 - 1.0"
        raise ValueError(msg)

    s1, s2 = conv_sequences(s1, s2)

    # Only the first four positions (at most) count towards the common prefix.
    max_prefix = min(len(s1), len(s2), 4)
    prefix = 0
    while prefix < max_prefix and s1[prefix] == s2[prefix]:
        prefix += 1
    prefix_sim = prefix * prefix_weight

    # Translate the requested cutoff into a cutoff for the plain Jaro score.
    # The prefix bonus is only applied for Jaro scores above 0.7, so the Jaro
    # cutoff never has to drop below 0.7 once score_cutoff itself exceeds 0.7.
    jaro_score_cutoff = score_cutoff
    if jaro_score_cutoff > 0.7:
        if prefix_sim >= 1.0:
            jaro_score_cutoff = 0.7
        else:
            # Solve jaro + prefix_sim * (1 - jaro) >= score_cutoff for jaro.
            jaro_score_cutoff = max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0))

    sim = Jaro.similarity(s1, s2, score_cutoff=jaro_score_cutoff)
    if sim > 0.7:
        # Winkler adjustment: boost the score proportionally to the prefix,
        # clamped so the result never exceeds 1.0.
        sim += prefix_sim * (1.0 - sim)
        sim = min(sim, 1.0)

    return sim if sim >= score_cutoff else 0.0
def normalized_similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler similarity

    This is a thin wrapper that forwards every argument unchanged to
    ``similarity`` and returns its result as-is.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight given to the common prefix of both strings. ``similarity``
        rejects values outside the range 0.0 - 1.0. Default is 0.1.
    processor: callable, optional
        Callable applied to both strings before they are compared.
        Default is None, which skips preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. When the similarity
        is below score_cutoff, 0 is returned instead. Default is None,
        which disables the threshold.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    score = similarity(
        s1,
        s2,
        prefix_weight=prefix_weight,
        processor=processor,
        score_cutoff=score_cutoff,
    )
    return score
def distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler distance

    The distance is computed as ``1.0 - similarity(s1, s2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0 and 1.0. Default is 0.1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For distance > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0.
        1.0 is returned when ``is_none`` reports either input as missing.

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # distance <= score_cutoff corresponds to similarity >= 1.0 - score_cutoff,
    # so hand the similarity computation the mirrored threshold. A cutoff
    # above 1.0 is passed on as "no cutoff".
    if score_cutoff is None or score_cutoff > 1.0:
        similarity_cutoff = None
    else:
        similarity_cutoff = 1.0 - score_cutoff

    # The processor has already been applied above, so it is not forwarded.
    sim = similarity(s1, s2, prefix_weight=prefix_weight, score_cutoff=similarity_cutoff)
    dist = 1.0 - sim
    return dist if (score_cutoff is None or dist <= score_cutoff) else 1.0
def normalized_distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler distance

    This is a thin wrapper that forwards every argument unchanged to
    ``distance`` and returns its result as-is.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight given to the common prefix of both strings. Values outside
        the range 0.0 - 1.0 are rejected by the underlying similarity
        computation. Default is 0.1.
    processor: callable, optional
        Callable applied to both strings before they are compared.
        Default is None, which skips preprocessing.
    score_cutoff : float, optional
        Distance threshold as a float between 0 and 1.0. When the distance
        is above score_cutoff, 1.0 is returned instead. Default is None,
        which disables the threshold.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    dist = distance(
        s1,
        s2,
        prefix_weight=prefix_weight,
        processor=processor,
        score_cutoff=score_cutoff,
    )
    return dist