Gemfury

duality-group / gensim python

Repository URL to install this package:
Details
gensim / similarities / nmslib.py
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
# Copyright (C) 2019 Masahiro Kazama <kazama.masa@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module integrates `NMSLIB <https://github.com/nmslib/nmslib>`_ fast similarity
search with Gensim's :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec`,
:class:`~gensim.models.fasttext.FastText` and :class:`~gensim.models.keyedvectors.KeyedVectors`
vector embeddings.

.. Important::
    To use this module, you must have the external ``nmslib`` library installed.
    To install it, run ``pip install nmslib``.

To use the integration, instantiate a :class:`~gensim.similarities.nmslib.NmslibIndexer` class
and pass the instance as the `indexer` parameter to your model's `model.most_similar()` method.

Example usage
-------------

.. sourcecode:: pycon

    >>> from gensim.similarities.nmslib import NmslibIndexer
    >>> from gensim.models import Word2Vec
    >>>
    >>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
    >>> model = Word2Vec(sentences, min_count=1, epochs=10, seed=2)
    >>>
    >>> indexer = NmslibIndexer(model)
    >>> model.wv.most_similar("cat", topn=2, indexer=indexer)
    [('cat', 1.0), ('meow', 0.16398882865905762)]

Load and save example
---------------------

.. sourcecode:: pycon

    >>> from gensim.similarities.nmslib import NmslibIndexer
    >>> from gensim.models import Word2Vec
    >>> from tempfile import mkstemp
    >>>
    >>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
    >>> model = Word2Vec(sentences, min_count=1, seed=2, epochs=10)
    >>>
    >>> indexer = NmslibIndexer(model)
    >>> _, temp_fn = mkstemp()
    >>> indexer.save(temp_fn)
    >>>
    >>> new_indexer = NmslibIndexer.load(temp_fn)
    >>> model.wv.most_similar("cat", topn=2, indexer=new_indexer)
    [('cat', 1.0), ('meow', 0.5595494508743286)]

What is NMSLIB
--------------

Non-Metric Space Library (NMSLIB) is an efficient cross-platform similarity search library and a toolkit
for evaluation of similarity search methods. The core-library does not have any third-party dependencies.
More information about NMSLIB: `github repository <https://github.com/nmslib/nmslib>`_.

Why use NMSLIB?
---------------

Gensim's native :py:class:`~gensim.similarities.Similarity` for finding the `k` nearest neighbors to a vector
uses brute force and has linear complexity, albeit with extremely low constant factors.

The retrieved results are exact, which is overkill in many applications:
approximate results retrieved in sub-linear time may be enough.

NMSLIB can find approximate nearest neighbors much faster, similar to Spotify's Annoy library.
Compared to :py:class:`~gensim.similarities.annoy.AnnoyIndexer`, NMSLIB has more parameters to
control the build and query time and accuracy. NMSLIB often achieves faster and more accurate
nearest neighbors search than Annoy.

"""

# Avoid import collisions on py2: this module has the same name as the actual NMSLIB library.
from __future__ import absolute_import
import pickle as _pickle

from smart_open import open
try:
    import nmslib
except ImportError:
    raise ImportError("NMSLIB not installed. To use the NMSLIB indexer, please run `pip install nmslib`.")

from gensim import utils
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors


class NmslibIndexer:
    """This class allows to use `NMSLIB <https://github.com/nmslib/nmslib>`_ as indexer for `most_similar` method
    from :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec`,
    :class:`~gensim.models.fasttext.FastText` and :class:`~gensim.models.keyedvectors.KeyedVectors` classes.

    The index is always created with NMSLIB's ``hnsw`` method in the ``cosinesimil`` space.

    """

    def __init__(self, model, index_params=None, query_time_params=None):
        """
        Parameters
        ----------
        model : Word2Vec, Doc2Vec, FastText, KeyedVectors or None
            Model, that will be used as source for index.
            `None` is reserved for internal use by :meth:`load`, which fills in the index and labels itself.
        index_params : dict, optional
            Indexing parameters passed through to NMSLIB:
            https://github.com/nmslib/nmslib/blob/master/manual/methods.md#graph-based-search-methods-sw-graph-and-hnsw

            If not specified, defaults to `{'M': 100, 'indexThreadQty': 1, 'efConstruction': 100, 'post': 0}`.
        query_time_params : dict, optional
            query_time_params for NMSLIB indexer.
            If not specified, defaults to `{'efSearch': 100}`.

        Raises
        ------
        ValueError
            If `model` is neither `None` nor an instance of one of the supported model classes.

        """
        if index_params is None:
            index_params = {'M': 100, 'indexThreadQty': 1, 'efConstruction': 100, 'post': 0}
        if query_time_params is None:
            query_time_params = {'efSearch': 100}

        self.index = None
        self.labels = None
        self.model = model
        self.index_params = index_params
        self.query_time_params = query_time_params

        #
        # In the main use case, the user will pass us a non-None model, and we use that model
        # to initialize the index and labels.  In a separate (completely internal) use case, the
        # NmslibIndexer.load function handles the index and label initialization separately,
        # so it passes us None as the model.
        #
        # Compare against None explicitly: a supported model that happens to evaluate as falsy
        # must still be indexed (or rejected loudly) instead of being silently skipped.
        #
        if model is not None:
            # NOTE(review): Doc2Vec is tested before Word2Vec on purpose; presumably Doc2Vec is a
            # Word2Vec subclass, so swapping the order would index word vectors instead of doc vectors.
            if isinstance(self.model, Doc2Vec):
                self._build_from_doc2vec()
            elif isinstance(self.model, (Word2Vec, FastText)):
                self._build_from_word2vec()
            elif isinstance(self.model, (KeyedVectors,)):
                self._build_from_keyedvectors()
            else:
                raise ValueError("model must be a Word2Vec, Doc2Vec, FastText or KeyedVectors instance")

    def save(self, fname, protocol=utils.PICKLE_PROTOCOL):
        """Save this NmslibIndexer instance to a file.

        Parameters
        ----------
        fname : str
            Path to the output file,
            will produce 2 files: `fname` - the NMSLIB index and `fname`.d - the pickled parameters and labels.
        protocol : int, optional
            Protocol for pickle.

        Notes
        -----
        This method saves **only** the index (**the model isn't preserved**).

        """
        fname_dict = fname + '.d'
        self.index.saveIndex(fname)
        d = {'index_params': self.index_params, 'query_time_params': self.query_time_params, 'labels': self.labels}
        with open(fname_dict, 'wb') as fout:
            _pickle.dump(d, fout, protocol=protocol)

    @classmethod
    def load(cls, fname):
        """Load a NmslibIndexer instance from a file.

        Parameters
        ----------
        fname : str
            Path previously used in `save()`.

        Returns
        -------
        NmslibIndexer
            Indexer with the stored index, labels and parameters; its `model` attribute is `None`.

        Warnings
        --------
        The parameters file `fname`.d is unpickled, which can execute arbitrary code.
        Only load files that come from a trusted source.

        """
        fname_dict = fname + '.d'
        # SECURITY: pickle.load runs arbitrary code embedded in the file; `fname` must be trusted.
        with open(fname_dict, 'rb') as f:
            d = _pickle.load(f)
        index_params = d['index_params']
        query_time_params = d['query_time_params']
        nmslib_instance = cls(model=None, index_params=index_params, query_time_params=query_time_params)
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.loadIndex(fname)
        # With model=None the constructor never touches the index, so the saved query-time
        # parameters would otherwise be stored on the instance but never applied to the search.
        nmslib.setQueryTimeParams(index, query_time_params)
        nmslib_instance.index = index
        nmslib_instance.labels = d['labels']
        return nmslib_instance

    def _build_from_word2vec(self):
        """Build an NMSLIB index using word vectors from a Word2Vec model."""
        self._build_from_model(self.model.wv.get_normed_vectors(), self.model.wv.index_to_key)

    def _build_from_doc2vec(self):
        """Build an NMSLIB index using document vectors from a Doc2Vec model."""
        docvecs = self.model.dv
        labels = docvecs.index_to_key
        self._build_from_model(docvecs.get_normed_vectors(), labels)

    def _build_from_keyedvectors(self):
        """Build an NMSLIB index using word vectors from a KeyedVectors model."""
        self._build_from_model(self.model.get_normed_vectors(), self.model.index_to_key)

    def _build_from_model(self, vectors, labels):
        """Create the NMSLIB index from `vectors` and remember the matching `labels`.

        Parameters
        ----------
        vectors : array-like
            Vectors to index, as returned by the model's `get_normed_vectors()`.
        labels : sequence
            Keys of the indexed items; position `i` names the vector at position `i`.

        """
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.addDataPointBatch(vectors)

        index.createIndex(self.index_params, print_progress=True)
        nmslib.setQueryTimeParams(index, self.query_time_params)

        self.index = index
        self.labels = labels

    def most_similar(self, vector, num_neighbors):
        """Find the approximate `num_neighbors` most similar items.

        Parameters
        ----------
        vector : numpy.array
            Vector for a word or document.
        num_neighbors : int
            How many most similar items to look for?

        Returns
        -------
        list of (str, float)
            List of most similar items in the format `[(item, cosine_similarity), ... ]`.

        """
        # The batch API expects a 2D input; query with a single-row batch and take its only result.
        ids, distances = self.index.knnQueryBatch(vector.reshape(1, -1), k=num_neighbors)[0]

        # NMSLIB returns cosine distance (not similarity), which is simply `dist = 1 - cossim`.
        # So, convert back to similarities here.
        return [(self.labels[id_], 1.0 - distance) for id_, distance in zip(ids, distances)]
duality-group / gensim python

Products

About

Resources

Contact Gemfury