Gemfury

duality-group / gensim python

Repository URL to install this package:
Details
gensim / test / test_phrases.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for the phrase detection module.
"""

import logging
import unittest

import numpy as np

from gensim.models.phrases import Phrases, FrozenPhrases, _PhrasesTransformation
from gensim.models.phrases import original_scorer
from gensim.test.utils import common_texts, temporary_file, datapath


class TestPhraseAnalysis(unittest.TestCase):

    class AnalysisTester(_PhrasesTransformation):

        def __init__(self, scores, threshold):
            super().__init__(connector_words={"a", "the", "with", "of"})
            self.scores = scores
            self.threshold = threshold

        def score_candidate(self, word_a, word_b, in_between):
            phrase = "_".join([word_a] + in_between + [word_b])
            score = self.scores.get(phrase, -1)
            if score > self.threshold:
                return phrase, score
            return None, None

    def test_simple_analysis(self):
        """Test transformation with no phrases."""
        sentence = ["simple", "sentence", "should", "pass"]
        result = self.AnalysisTester({}, threshold=1)[sentence]
        self.assertEqual(result, sentence)
        sentence = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"]
        result = self.AnalysisTester({}, threshold=1)[sentence]
        self.assertEqual(result, sentence)

    def test_analysis_bigrams(self):
        scores = {
            "simple_sentence": 2, "sentence_many": 2,
            "many_possible": 2, "possible_bigrams": 2,
        }
        sentence = ["simple", "sentence", "many", "possible", "bigrams"]
        result = self.AnalysisTester(scores, threshold=1)[sentence]
        self.assertEqual(result, ["simple_sentence", "many_possible", "bigrams"])

        sentence = ["some", "simple", "sentence", "many", "bigrams"]
        result = self.AnalysisTester(scores, threshold=1)[sentence]
        self.assertEqual(result, ["some", "simple_sentence", "many", "bigrams"])

        sentence = ["some", "unrelated", "simple", "words"]
        result = self.AnalysisTester(scores, threshold=1)[sentence]
        self.assertEqual(result, sentence)

    def test_analysis_connector_words(self):
        scores = {
            "simple_sentence": 2, "sentence_many": 2,
            "many_possible": 2, "possible_bigrams": 2,
        }
        sentence = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"]
        result = self.AnalysisTester(scores, threshold=1)[sentence]
        self.assertEqual(result, ["a", "simple_sentence", "many", "the", "possible_bigrams"])

        sentence = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"]
        result = self.AnalysisTester(scores, threshold=1)[sentence]
        self.assertEqual(
            result,
            ["simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"],
        )

    def test_analysis_connector_words_in_between(self):
        scores = {
            "simple_sentence": 2, "sentence_with_many": 2,
            "many_possible": 2, "many_of_the_possible": 2, "possible_bigrams": 2,
        }
        sentence = ["sentence", "with", "many", "possible", "bigrams"]
        result = self.AnalysisTester(scores, threshold=1)[sentence]
        self.assertEqual(result, ["sentence_with_many", "possible_bigrams"])

        sentence = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"]
        result = self.AnalysisTester(scores, threshold=1)[sentence]
        self.assertEqual(
            result, ["a", "simple_sentence", "with", "many_of_the_possible", "bigrams", "with"])


class PhrasesData:

    sentences = common_texts + [
        ['graph', 'minors', 'survey', 'human', 'interface'],
    ]
    connector_words = frozenset()

    bigram1 = u'response_time'
    bigram2 = u'graph_minors'
    bigram3 = u'human_interface'

    def gen_sentences(self):
        return ((w for w in sentence) for sentence in self.sentences)


class PhrasesCommon(PhrasesData):
    """Tests for both Phrases and FrozenPhrases classes."""

    def setUp(self):
        self.bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
        self.bigram_default = Phrases(self.sentences, connector_words=self.connector_words)

    def test_empty_phrasified_sentences_iterator(self):
        bigram_phrases = Phrases(self.sentences)
        bigram_phraser = FrozenPhrases(bigram_phrases)
        trigram_phrases = Phrases(bigram_phraser[self.sentences])
        trigram_phraser = FrozenPhrases(trigram_phrases)
        trigrams = trigram_phraser[bigram_phraser[self.sentences]]
        fst, snd = list(trigrams), list(trigrams)
        self.assertEqual(fst, snd)
        self.assertNotEqual(snd, [])

    def test_empty_inputs_on_bigram_construction(self):
        """Test that empty inputs don't throw errors and return the expected result."""
        # Empty list -> empty list
        self.assertEqual(list(self.bigram_default[[]]), [])
        # Empty iterator -> empty list
        self.assertEqual(list(self.bigram_default[iter(())]), [])
        # List of empty list -> list of empty list
        self.assertEqual(list(self.bigram_default[[[], []]]), [[], []])
        # Iterator of empty list -> list of empty list
        self.assertEqual(list(self.bigram_default[iter([[], []])]), [[], []])
        # Iterator of empty iterator -> list of empty list
        self.assertEqual(list(self.bigram_default[(iter(()) for i in range(2))]), [[], []])

    def test_sentence_generation(self):
        """Test basic bigram using a dummy corpus."""
        # test that we generate the same amount of sentences as the input
        self.assertEqual(
            len(self.sentences),
            len(list(self.bigram_default[self.sentences])),
        )

    def test_sentence_generation_with_generator(self):
        """Test basic bigram production when corpus is a generator."""
        self.assertEqual(
            len(list(self.gen_sentences())),
            len(list(self.bigram_default[self.gen_sentences()])),
        )

    def test_bigram_construction(self):
        """Test Phrases bigram construction."""
        # with this setting we should get response_time and graph_minors
        bigram1_seen = False
        bigram2_seen = False
        for sentence in self.bigram[self.sentences]:
            if not bigram1_seen and self.bigram1 in sentence:
                bigram1_seen = True
            if not bigram2_seen and self.bigram2 in sentence:
                bigram2_seen = True
            if bigram1_seen and bigram2_seen:
                break

        self.assertTrue(bigram1_seen and bigram2_seen)

        # check the same thing, this time using single doc transformation
        # last sentence should contain both graph_minors and human_interface
        self.assertTrue(self.bigram1 in self.bigram[self.sentences[1]])
        self.assertTrue(self.bigram1 in self.bigram[self.sentences[4]])
        self.assertTrue(self.bigram2 in self.bigram[self.sentences[-2]])
        self.assertTrue(self.bigram2 in self.bigram[self.sentences[-1]])
        self.assertTrue(self.bigram3 in self.bigram[self.sentences[-1]])

    def test_bigram_construction_from_generator(self):
        """Test Phrases bigram construction building when corpus is a generator."""
        bigram1_seen = False
        bigram2_seen = False

        for s in self.bigram[self.gen_sentences()]:
            if not bigram1_seen and self.bigram1 in s:
                bigram1_seen = True
            if not bigram2_seen and self.bigram2 in s:
                bigram2_seen = True
            if bigram1_seen and bigram2_seen:
                break
        self.assertTrue(bigram1_seen and bigram2_seen)

    def test_bigram_construction_from_array(self):
        """Test Phrases bigram construction building when corpus is a numpy array."""
        bigram1_seen = False
        bigram2_seen = False

        for s in self.bigram[np.array(self.sentences, dtype=object)]:
            if not bigram1_seen and self.bigram1 in s:
                bigram1_seen = True
            if not bigram2_seen and self.bigram2 in s:
                bigram2_seen = True
            if bigram1_seen and bigram2_seen:
                break
        self.assertTrue(bigram1_seen and bigram2_seen)


# scorer for testCustomScorer
# function is outside of the scope of the test because for picklability of custom scorer
# Phrases tests for picklability
# all scores will be 1
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    return 1


class TestPhrasesModel(PhrasesCommon, unittest.TestCase):

    def test_export_phrases(self):
        """Test Phrases bigram and trigram export phrases."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
        trigram = Phrases(bigram[self.sentences], min_count=1, threshold=1, delimiter=' ')
        seen_bigrams = set(bigram.export_phrases().keys())
        seen_trigrams = set(trigram.export_phrases().keys())

        assert seen_bigrams == set([
            'human interface',
            'response time',
            'graph minors',
            'minors survey',
        ])

        assert seen_trigrams == set([
            'human interface',
            'graph minors survey',
        ])

    def test_find_phrases(self):
        """Test Phrases bigram find phrases."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
        seen_bigrams = set(bigram.find_phrases(self.sentences).keys())

        assert seen_bigrams == set([
            'response time',
            'graph minors',
            'human interface',
        ])

    def test_multiple_bigrams_single_entry(self):
        """Test a single entry produces multiple bigrams."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_bigrams = set(bigram.find_phrases(test_sentences).keys())

        assert seen_bigrams == {'graph minors', 'human interface'}

    def test_scoring_default(self):
        """Test the default scoring, from the mikolov word2vec paper."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())

        assert seen_scores == {
            5.167,  # score for graph minors
            3.444  # score for human interface
        }

    def test__getitem__(self):
        """Test Phrases[sentences] with a single sentence."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        phrased_sentence = next(bigram[test_sentences].__iter__())

        assert phrased_sentence == ['graph_minors', 'survey', 'human_interface']

    def test_scoring_npmi(self):
        """Test normalized pointwise mutual information scoring."""
        bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())

        assert seen_scores == {
            .882,  # score for graph minors
            .714  # score for human interface
        }

    def test_custom_scorer(self):
        """Test using a custom scoring function."""
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = list(bigram.find_phrases(test_sentences).values())

        assert all(score == 1 for score in seen_scores)
        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'

    def test_bad_parameters(self):
        """Test the phrases module with bad parameters."""
        # should fail with something less or equal than 0
        self.assertRaises(ValueError, Phrases, self.sentences, min_count=0)

        # threshold should be positive
        self.assertRaises(ValueError, Phrases, self.sentences, threshold=-1)

    def test_pruning(self):
        """Test that max_vocab_size parameter is respected."""
        bigram = Phrases(self.sentences, max_vocab_size=5)
        self.assertTrue(len(bigram.vocab) <= 5)
# endclass TestPhrasesModel


class TestPhrasesPersistence(PhrasesData, unittest.TestCase):
    def test_save_load_custom_scorer(self):
        """Test saving and loading a Phrases object with a custom scorer."""
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        with temporary_file("test.pkl") as fpath:
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = list(bigram_loaded.find_phrases(test_sentences).values())

        assert all(score == 1 for score in seen_scores)
        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'

    def test_save_load(self):
        """Test saving and loading a Phrases object."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        with temporary_file("test.pkl") as fpath:
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())
        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])

    def test_save_load_with_connector_words(self):
        """Test saving and loading a Phrases object."""
        connector_words = frozenset({'of'})
        bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=connector_words)
        with temporary_file("test.pkl") as fpath:
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

        assert bigram_loaded.connector_words == connector_words

    def test_save_load_string_scoring(self):
        """Test backwards compatibility with a previous version of Phrases with custom scoring."""
        bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])

    def test_save_load_no_scoring(self):
        """Test backwards compatibility with old versions of Phrases with no scoring parameter."""
        bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl"))
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])

    def test_save_load_no_common_terms(self):
        """Ensure backwards compatibility with old versions of Phrases, before connector_words."""
        bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
        self.assertEqual(bigram_loaded.connector_words, frozenset())
        # can make a phraser, cf #1751
        phraser = FrozenPhrases(bigram_loaded)  # does not raise
        phraser[["human", "interface", "survey"]]  # does not raise


class TestFrozenPhrasesPersistence(PhrasesData, unittest.TestCase):

    def test_save_load_custom_scorer(self):
        """Test saving and loading a FrozenPhrases object with a custom scorer."""

        with temporary_file("test.pkl") as fpath:
            bigram = FrozenPhrases(
                Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = FrozenPhrases.load(fpath)
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)

    def test_save_load(self):
        """Test saving and loading a FrozenPhrases object."""
        with temporary_file("test.pkl") as fpath:
            bigram = FrozenPhrases(Phrases(self.sentences, min_count=1, threshold=1))
            bigram.save(fpath)
            bigram_loaded = FrozenPhrases.load(fpath)
            self.assertEqual(
                bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
                ['graph_minors', 'survey', 'human_interface', 'system'])

    def test_save_load_with_connector_words(self):
        """Test saving and loading a FrozenPhrases object."""
        connector_words = frozenset({'of'})
        with temporary_file("test.pkl") as fpath:
            bigram = FrozenPhrases(Phrases(self.sentences, min_count=1, threshold=1, connector_words=connector_words))
            bigram.save(fpath)
            bigram_loaded = FrozenPhrases.load(fpath)
            self.assertEqual(bigram_loaded.connector_words, connector_words)

    def test_save_load_string_scoring(self):
        """Test saving and loading a FrozenPhrases object with a string scoring parameter.
        This should ensure backwards compatibility with the previous version of FrozenPhrases"""
        bigram_loaded = FrozenPhrases.load(datapath("phraser-scoring-str.pkl"))
        # we do not much with scoring, just verify its the one expected
        self.assertEqual(bigram_loaded.scoring, original_scorer)

    def test_save_load_no_scoring(self):
        """Test saving and loading a FrozenPhrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of FrozenPhrases"""
        bigram_loaded = FrozenPhrases.load(datapath("phraser-no-scoring.pkl"))
        # we do not much with scoring, just verify its the one expected
        self.assertEqual(bigram_loaded.scoring, original_scorer)

    def test_save_load_no_common_terms(self):
        """Ensure backwards compatibility with old versions of FrozenPhrases, before connector_words."""
        bigram_loaded = FrozenPhrases.load(datapath("phraser-no-common-terms.pkl"))
        self.assertEqual(bigram_loaded.connector_words, frozenset())


class TestFrozenPhrasesModel(PhrasesCommon, unittest.TestCase):
    """Test FrozenPhrases models."""

    def setUp(self):
        """Set up FrozenPhrases models for the tests."""
        bigram_phrases = Phrases(
            self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
        self.bigram = FrozenPhrases(bigram_phrases)

        bigram_default_phrases = Phrases(self.sentences, connector_words=self.connector_words)
        self.bigram_default = FrozenPhrases(bigram_default_phrases)


class CommonTermsPhrasesData:
    """This mixin permits to reuse tests with the connector_words option."""

    sentences = [
        ['human', 'interface', 'with', 'computer'],
        ['survey', 'of', 'user', 'computer', 'system', 'lack', 'of', 'interest'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'and', 'human', 'system', 'eps'],
        ['user', 'lack', 'of', 'interest'],
        ['trees'],
        ['graph', 'of', 'trees'],
        ['data', 'and', 'graph', 'of', 'trees'],
        ['data', 'and', 'graph', 'survey'],
        ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']  # test bigrams within same sentence
    ]
    connector_words = ['of', 'and', 'for']

    bigram1 = u'lack_of_interest'
    bigram2 = u'data_and_graph'
    bigram3 = u'human_interface'
    expression1 = u'lack of interest'
    expression2 = u'data and graph'
    expression3 = u'human interface'

    def gen_sentences(self):
        return ((w for w in sentence) for sentence in self.sentences)


class TestPhrasesModelCommonTerms(CommonTermsPhrasesData, TestPhrasesModel):
    """Test Phrases models with connector words."""

    def test_multiple_bigrams_single_entry(self):
        """Test a single entry produces multiple bigrams."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ')
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_bigrams = set(bigram.find_phrases(test_sentences).keys())

        assert seen_bigrams == set([
            'data and graph',
            'human interface',
        ])

    def test_find_phrases(self):
        """Test Phrases bigram export phrases."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ')
        seen_bigrams = set(bigram.find_phrases(self.sentences).keys())

        assert seen_bigrams == set([
            'human interface',
            'graph of trees',
            'data and graph',
            'lack of interest',
        ])

    def test_export_phrases(self):
        """Test Phrases bigram export phrases."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
        seen_bigrams = set(bigram.export_phrases().keys())
        assert seen_bigrams == set([
            'and graph',
            'data and',
            'graph of',
            'graph survey',
            'human interface',
            'lack of',
            'of interest',
            'of trees',
        ])

    def test_scoring_default(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())

        min_count = float(bigram.min_count)
        len_vocab = float(len(bigram.vocab))
        graph = float(bigram.vocab["graph"])
        data = float(bigram.vocab["data"])
        data_and_graph = float(bigram.vocab["data_and_graph"])
        human = float(bigram.vocab["human"])
        interface = float(bigram.vocab["interface"])
        human_interface = float(bigram.vocab["human_interface"])

        assert seen_scores == set([
            # score for data and graph
            round((data_and_graph - min_count) / data / graph * len_vocab, 3),
            # score for human interface
            round((human_interface - min_count) / human / interface * len_vocab, 3),
        ])

    def test_scoring_npmi(self):
        """Test normalized pointwise mutual information scoring."""
        bigram = Phrases(
            self.sentences, min_count=1, threshold=.5,
            scoring='npmi', connector_words=self.connector_words,
        )
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())

        assert seen_scores == set([
            .74,  # score for data and graph
            .894  # score for human interface
        ])

    def test_custom_scorer(self):
        """Test using a custom scoring function."""
        bigram = Phrases(
            self.sentences, min_count=1, threshold=.001,
            scoring=dumb_scorer, connector_words=self.connector_words,
        )
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_scores = list(bigram.find_phrases(test_sentences).values())

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 2  # 'data and graph' 'survey for human'

    def test__getitem__(self):
        """Test Phrases[sentences] with a single sentence."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        phrased_sentence = next(bigram[test_sentences].__iter__())

        assert phrased_sentence == ['data_and_graph', 'survey', 'for', 'human_interface']


class TestFrozenPhrasesModelCompatibility(unittest.TestCase):

    def test_compatibility(self):
        phrases = Phrases.load(datapath("phrases-3.6.0.model"))
        phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
        test_sentences = ['trees', 'graph', 'minors']

        self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
        self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
duality-group / gensim python

Products

About

Resources

Contact Gemfury