Repository URL to install this package:
|
Version:
0.1.31-1 ▾
|
odigos-demo-inventory
/
opt
/
odigos-demo-inventory
/
site-packages
/
rapidfuzz
/
distance
/
DamerauLevenshtein_py.py
|
|---|
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
def _damerau_levenshtein_distance_zhao(s1, s2):
    """
    Compute the Damerau-Levenshtein distance between two sequences.

    Only rolling rows of ``len(s2) + 2`` cells are kept instead of a full
    matrix. The last slot of every row is a sentinel holding
    ``max(len(s1), len(s2)) + 1``; thanks to Python's negative indexing it is
    the value that is read when column ``-1`` is addressed.

    NOTE(review): the ``_zhao`` suffix presumably refers to the published
    algorithm this implements -- confirm against the upstream project.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First sequence. Its elements are used as dictionary keys.
    s2 : Sequence[Hashable]
        Second sequence. Its elements are looked up in the same dictionary.

    Returns
    -------
    distance : int
        Value of the final cell of the last computed row.
    """
    len1 = len(s1)
    len2 = len(s2)
    max_val = max(len1, len2) + 1

    # Maps an element of s1 to the (1-based) row in which it was last seen.
    last_row_id = {}
    last_row_id_get = last_row_id.get

    size = len2 + 2
    fr = [max_val] * size
    prev_row = [max_val] * size
    cur_row = list(range(size))
    cur_row[-1] = max_val  # sentinel reachable through index -1

    for i, ch1 in enumerate(s1, 1):
        # Recycle the buffers: the row finished in the previous round becomes
        # prev_row, while cur_row still holds the row from two rounds ago
        # until its cells are overwritten below.
        cur_row, prev_row = prev_row, cur_row
        last_col_id = -1
        last_i2l1 = cur_row[0]
        cur_row[0] = i
        t = max_val

        for j, ch2 in enumerate(s2, 1):
            # Compare the pair once and reuse the outcome for both the
            # substitution cost and the branch below.
            is_match = ch1 == ch2
            best = min(
                prev_row[j - 1] + (not is_match),  # diagonal
                cur_row[j - 1] + 1,  # left
                prev_row[j] + 1,  # up
            )

            if is_match:
                last_col_id = j  # last occurrence of s1_i
                fr[j] = prev_row[j - 2]  # save H_k-1,j-2
                t = last_i2l1  # save H_i-2,l-1
            else:
                k = last_row_id_get(ch2, -1)
                last_col = last_col_id

                if (j - last_col) == 1:
                    best = min(best, fr[j] + (i - k))
                elif (i - k) == 1:
                    best = min(best, t + (j - last_col))

            # Remember the value from two rows back before overwriting it.
            last_i2l1 = cur_row[j]
            cur_row[j] = best

        last_row_id[ch1] = i

    return cur_row[len2]
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Damerau-Levenshtein distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable applied to both inputs before they are compared.
        Default is None, which skips preprocessing.
    score_cutoff : int, optional
        Largest distance that is reported as-is. When the computed distance
        exceeds it, ``score_cutoff + 1`` is returned instead. Default is
        None, which disables the cutoff.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the Damerau-Levenshtein distance between two strings:

    >>> from rapidfuzz.distance import DamerauLevenshtein
    >>> DamerauLevenshtein.distance("CA", "ABC")
    2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    seq1, seq2 = conv_sequences(s1, s2)
    result = _damerau_levenshtein_distance_zhao(seq1, seq2)

    if score_cutoff is None or result <= score_cutoff:
        return result
    return score_cutoff + 1
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Damerau-Levenshtein similarity in the range [0, max].

    This is calculated as ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable applied to both inputs before they are compared.
        Default is None, which skips preprocessing.
    score_cutoff : int, optional
        Smallest similarity that is reported as-is. When the computed
        similarity is below it, 0 is returned instead. Default is None,
        which disables the cutoff.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    score = longest - distance(s1, s2)

    if score_cutoff is None or score >= score_cutoff:
        return score
    return 0
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Damerau-Levenshtein distance in the range [0, 1].

    This is calculated as ``distance / max(len1, len2)``. Two empty sequences
    yield 0.0.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0.
        1.0 is returned when either input is reported as missing by
        ``is_none``.
    """
    # NOTE(review): setupPandas() is invoked before is_none(); presumably it
    # lets is_none() recognise pandas missing-value markers -- confirm in
    # rapidfuzz._utils.
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2)

    # Guard against dividing by zero for two empty inputs, and keep the result
    # a float on every path so the return type does not depend on the input.
    norm_dist = dist / maximum if maximum else 0.0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Damerau-Levenshtein similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0.
        0.0 is returned when either input is reported as missing by
        ``is_none``.
    """
    # NOTE(review): setupPandas() is invoked before is_none(); presumably it
    # lets is_none() recognise pandas missing-value markers -- confirm in
    # rapidfuzz._utils.
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    norm_dist = normalized_distance(s1, s2)
    norm_sim = 1.0 - norm_dist

    # Return a float on the rejected path too, matching every other path.
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0