Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
jaro_winkler / lib / jaro_winkler / jaro_winkler_pure.rb
Size: Mime:
# frozen_string_literal: true

require 'jaro_winkler/adjusting_table'
# Pure-Ruby implementation of the Jaro and Jaro-Winkler string similarity
# measures. Both public entry points return a Float where larger values mean
# the two strings are more alike.
module JaroWinkler
  class Error < RuntimeError; end
  # Raised by JaroWinkler.distance when the :weight option is greater than 0.25.
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: { adj_table: false, ignore_case: false },
    jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }
  }.freeze

  class << self
    # Jaro-Winkler similarity of two strings.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @param options [Hash] overrides for DEFAULT_OPTIONS[:jaro_winkler]
    #   (:weight, :threshold) and DEFAULT_OPTIONS[:jaro]
    #   (:adj_table, :ignore_case)
    # @return [Float]
    # @raise [TypeError] if either argument is not a String
    # @raise [InvalidWeightError] if options[:weight] is greater than 0.25
    def distance(str1, str2, options = {})
      validate!(str1, str2)
      _distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    # Plain Jaro similarity of two strings (no common-prefix boost).
    #
    # @param str1 [String]
    # @param str2 [String]
    # @param options [Hash] overrides for DEFAULT_OPTIONS[:jaro]
    #   (:adj_table, :ignore_case)
    # @return [Float]
    # @raise [TypeError] if either argument is not a String
    def jaro_distance(str1, str2, options = {})
      validate!(str1, str2)
      _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options
    end

    private

    # Jaro-Winkler on codepoint arrays: the Jaro score, boosted by the length
    # of the common prefix (at most 4 codepoints) when the Jaro score reaches
    # options[:threshold].
    def _distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro_winkler].merge options
      if options[:weight] > 0.25
        raise InvalidWeightError,
              "weight must not be greater than 0.25, got #{options[:weight]}"
      end

      # NOTE: with ignore_case, _jaro_distance upcases both arrays in place,
      # so the prefix comparison below sees the case-folded codepoints.
      jaro_distance = _jaro_distance(codes1, codes2, options)
      return jaro_distance if jaro_distance < options[:threshold]

      # The prefix comparison is symmetric, so there is no need to order the
      # operands by length; only the shorter length bounds the scan.
      max_prefix = [codes1.length, codes2.length, 4].min
      prefix = 0
      prefix += 1 while prefix < max_prefix && codes1[prefix] == codes2[prefix]
      jaro_distance + prefix * options[:weight] * (1 - jaro_distance)
    end

    # Jaro similarity on codepoint arrays. Returns 0.0 when either array is
    # empty or when no codepoints match. With options[:ignore_case] both
    # arrays are modified in place (ASCII a-z only is upcased).
    def _jaro_distance(codes1, codes2, options = {})
      options = DEFAULT_OPTIONS[:jaro].merge options

      # From here on codes1 is the shorter (or equally long) sequence.
      codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
      len1 = codes1.length
      len2 = codes2.length
      return 0.0 if len1 == 0 || len2 == 0

      if options[:ignore_case]
        # 97..122 is ASCII 'a'..'z'; subtracting 32 yields 'A'..'Z'.
        codes1.map! { |c| c.between?(97, 122) ? c - 32 : c }
        codes2.map! { |c| c.between?(97, 122) ? c - 32 : c }
      end

      # Two codepoints may only match if their positions differ by at most
      # `window`.
      window = [len2 / 2 - 1, 0].max
      matched1 = Array.new(len1, false)
      matched2 = Array.new(len2, false)

      # Count matching codepoints: each position of codes1 claims the first
      # still-unclaimed equal codepoint of codes2 inside its window.
      match_count = 0
      len1.times do |i|
        left = [i - window, 0].max
        right = [i + window, len2 - 1].min
        (left..right).each do |j|
          next if matched2[j] || codes1[i] != codes2[j]

          matched1[i] = true
          matched2[j] = true
          match_count += 1
          break
        end
      end

      return 0.0 if match_count == 0

      # Count transpositions: walk the matched codepoints of both sequences
      # in order and count the pairs that differ.
      transposition_count = 0
      k = 0
      len1.times do |i|
        next unless matched1[i]

        j = k
        j += 1 while j < len2 && !matched2[j]
        k = j + 1 if j < len2
        transposition_count += 1 if codes1[i] != codes2[j]
      end

      # Count similarities among the non-matched codepoints, using the
      # adjusting table. A codepoint of codes2 is not marked as used here, so
      # it may be counted as similar to several codepoints of codes1.
      similar_count = 0
      if options[:adj_table] && len1 > match_count
        len1.times do |i|
          next if matched1[i]

          # Loop-invariant for the inner scan, so look it up once. codes2
          # always has an unmatched position here (len2 >= len1 > match_count),
          # so the lookup is never skipped.
          similar_to = DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)]
          len2.times do |j|
            next if matched2[j]

            if similar_to[codes2[j].chr(Encoding::UTF_8)]
              similar_count += 3
              break
            end
          end
        end
      end

      m = match_count.to_f
      # Integer division on purpose: half the mismatched pairs, rounded down.
      t = transposition_count / 2
      m = similar_count / 10.0 + m if options[:adj_table]
      (m / len1 + m / len2 + (m - t) / m) / 3
    end

    # Raises TypeError unless both arguments are Strings.
    def validate!(str1, str2)
      return if str1.is_a?(String) && str2.is_a?(String)

      raise TypeError,
            "expected two Strings, got #{str1.class} and #{str2.class}"
    end
  end
end