Repository URL to install this package:
Version:
1.5.4 ▾
|
# frozen_string_literal: true
require 'jaro_winkler/adjusting_table'
# Pure-Ruby Jaro and Jaro-Winkler string similarity.
#
# Both public methods return a Float where 0.0 means "nothing in common"
# and 1.0 means "identical". Strings are compared codepoint by codepoint.
module JaroWinkler
  # Base class for the errors defined by this module.
  class Error < RuntimeError; end

  # Raised when the :weight option is greater than 0.25.
  class InvalidWeightError < Error; end

  # Default bonus applied per matching prefix character.
  DEFAULT_WEIGHT = 0.1
  # Default Jaro score below which no prefix bonus is applied.
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: { adj_table: false, ignore_case: false },
    jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
  }.freeze

  class << self
    # Jaro-Winkler similarity of two strings.
    #
    # Options (all optional):
    #   :weight      - prefix bonus per character (default DEFAULT_WEIGHT)
    #   :threshold   - minimum Jaro score required to get the prefix bonus
    #                  (default DEFAULT_THRESHOLD)
    #   :ignore_case - treat ASCII a-z as A-Z (default false)
    #   :adj_table   - also give partial credit to character pairs listed in
    #                  DEFAULT_ADJ_TABLE (default false)
    #
    # Raises TypeError unless both arguments are Strings and
    # InvalidWeightError when :weight is greater than 0.25.
    def distance(str1, str2, options = {})
      validate!(str1, str2)
      # String#codepoints already returns a fresh Array for each call.
      _distance str1.codepoints, str2.codepoints, options
    end

    # Jaro similarity of two strings (no prefix bonus).
    #
    # Accepts the :ignore_case and :adj_table options described on #distance.
    # Raises TypeError unless both arguments are Strings.
    def jaro_distance(str1, str2, options = {})
      validate!(str1, str2)
      _jaro_distance str1.codepoints, str2.codepoints, options
    end

    private

    # Jaro-Winkler score for two codepoint arrays.
    #
    # The arrays may be modified in place by _jaro_distance when
    # :ignore_case is set; callers must pass arrays they own.
    def _distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro_winkler].merge options
      # At most 4 prefix characters are counted, so a weight of 0.25 is the
      # largest value for which prefix * weight stays within 1.
      if options[:weight] > 0.25
        raise InvalidWeightError,
              "weight must not be greater than 0.25 (got #{options[:weight].inspect})"
      end

      jaro = _jaro_distance(codes1, codes2, options)
      return jaro if jaro < options[:threshold]

      # When :ignore_case is set the arrays were upper-cased in place by
      # _jaro_distance, so the prefix comparison below is case-insensitive too.
      max_prefix = [codes1.length, codes2.length, 4].min
      prefix = 0
      prefix += 1 while prefix < max_prefix && codes1[prefix] == codes2[prefix]
      jaro + prefix * options[:weight] * (1 - jaro)
    end

    # Jaro score for two codepoint arrays.
    #
    # Returns 0.0 when either array is empty or when no characters match.
    def _jaro_distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro].merge options
      # From here on codes1 is the shorter (or equally long) sequence.
      codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
      len1 = codes1.length
      len2 = codes2.length
      return 0.0 if len1 == 0 || len2 == 0

      if options[:ignore_case]
        # Upper-case ASCII a-z (97..122) only. This is done in place on
        # purpose: _distance reuses the same arrays for its prefix check.
        codes1.map! { |c| c >= 97 && c <= 122 ? c - 32 : c }
        codes2.map! { |c| c >= 97 && c <= 122 ? c - 32 : c }
      end

      window = len2 / 2 - 1
      window = 0 if window < 0

      # Integers used as bit sets: bit n is 1 once position n has been matched.
      flags1 = 0
      flags2 = 0

      # Count matching characters within the sliding window.
      match_count = 0
      last2 = len2 - 1
      codes1.each_with_index do |code, i|
        left = i >= window ? i - window : 0
        right = i + window <= last2 ? i + window : last2
        j = left
        while j <= right
          if flags2[j] == 0 && code == codes2[j]
            flags1 |= (1 << i)
            flags2 |= (1 << j)
            match_count += 1
            break
          end
          j += 1
        end
      end
      return 0.0 if match_count == 0

      # Count transpositions: walk the matched characters of both sequences
      # in order and count the positions where they differ.
      transposition_count = 0
      k = 0
      codes1.each_with_index do |code, i|
        next if flags1[i] == 0

        j = k
        j += 1 while j < len2 && flags2[j] == 0
        k = j + 1
        transposition_count += 1 if code != codes2[j]
      end

      # Count similarities among the characters that did not match.
      similar_count = 0
      if options[:adj_table] && len1 > match_count
        codes1.each_with_index do |code1, i|
          next if flags1[i] == 1

          char1 = code1.chr(Encoding::UTF_8)
          codes2.each_with_index do |code2, j|
            next if flags2[j] == 1

            if DEFAULT_ADJ_TABLE[char1][code2.chr(Encoding::UTF_8)]
              similar_count += 3
              break
            end
          end
        end
      end

      m = match_count.to_f
      # Integer division: an odd transposition count is rounded down.
      t = transposition_count / 2
      m = similar_count / 10.0 + m if options[:adj_table]
      (m / len1 + m / len2 + (m - t) / m) / 3
    end

    # Raises TypeError unless both arguments are Strings.
    def validate!(str1, str2)
      return if str1.is_a?(String) && str2.is_a?(String)

      raise TypeError,
            "expected two String arguments (got #{str1.class} and #{str2.class})"
    end
  end
end