Repository URL to install this package:
|
Version:
4.2.0 ▾
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Automated tests for the phrase detection module.
"""
import logging
import unittest
import numpy as np
from gensim.models.phrases import Phrases, FrozenPhrases, _PhrasesTransformation
from gensim.models.phrases import original_scorer
from gensim.test.utils import common_texts, temporary_file, datapath
class TestPhraseAnalysis(unittest.TestCase):
class AnalysisTester(_PhrasesTransformation):
def __init__(self, scores, threshold):
super().__init__(connector_words={"a", "the", "with", "of"})
self.scores = scores
self.threshold = threshold
def score_candidate(self, word_a, word_b, in_between):
phrase = "_".join([word_a] + in_between + [word_b])
score = self.scores.get(phrase, -1)
if score > self.threshold:
return phrase, score
return None, None
def test_simple_analysis(self):
"""Test transformation with no phrases."""
sentence = ["simple", "sentence", "should", "pass"]
result = self.AnalysisTester({}, threshold=1)[sentence]
self.assertEqual(result, sentence)
sentence = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"]
result = self.AnalysisTester({}, threshold=1)[sentence]
self.assertEqual(result, sentence)
def test_analysis_bigrams(self):
scores = {
"simple_sentence": 2, "sentence_many": 2,
"many_possible": 2, "possible_bigrams": 2,
}
sentence = ["simple", "sentence", "many", "possible", "bigrams"]
result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["simple_sentence", "many_possible", "bigrams"])
sentence = ["some", "simple", "sentence", "many", "bigrams"]
result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["some", "simple_sentence", "many", "bigrams"])
sentence = ["some", "unrelated", "simple", "words"]
result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, sentence)
def test_analysis_connector_words(self):
scores = {
"simple_sentence": 2, "sentence_many": 2,
"many_possible": 2, "possible_bigrams": 2,
}
sentence = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"]
result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["a", "simple_sentence", "many", "the", "possible_bigrams"])
sentence = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"]
result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(
result,
["simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"],
)
def test_analysis_connector_words_in_between(self):
scores = {
"simple_sentence": 2, "sentence_with_many": 2,
"many_possible": 2, "many_of_the_possible": 2, "possible_bigrams": 2,
}
sentence = ["sentence", "with", "many", "possible", "bigrams"]
result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(result, ["sentence_with_many", "possible_bigrams"])
sentence = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"]
result = self.AnalysisTester(scores, threshold=1)[sentence]
self.assertEqual(
result, ["a", "simple_sentence", "with", "many_of_the_possible", "bigrams", "with"])
class PhrasesData:
sentences = common_texts + [
['graph', 'minors', 'survey', 'human', 'interface'],
]
connector_words = frozenset()
bigram1 = u'response_time'
bigram2 = u'graph_minors'
bigram3 = u'human_interface'
def gen_sentences(self):
return ((w for w in sentence) for sentence in self.sentences)
class PhrasesCommon(PhrasesData):
"""Tests for both Phrases and FrozenPhrases classes."""
def setUp(self):
self.bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
self.bigram_default = Phrases(self.sentences, connector_words=self.connector_words)
def test_empty_phrasified_sentences_iterator(self):
bigram_phrases = Phrases(self.sentences)
bigram_phraser = FrozenPhrases(bigram_phrases)
trigram_phrases = Phrases(bigram_phraser[self.sentences])
trigram_phraser = FrozenPhrases(trigram_phrases)
trigrams = trigram_phraser[bigram_phraser[self.sentences]]
fst, snd = list(trigrams), list(trigrams)
self.assertEqual(fst, snd)
self.assertNotEqual(snd, [])
def test_empty_inputs_on_bigram_construction(self):
"""Test that empty inputs don't throw errors and return the expected result."""
# Empty list -> empty list
self.assertEqual(list(self.bigram_default[[]]), [])
# Empty iterator -> empty list
self.assertEqual(list(self.bigram_default[iter(())]), [])
# List of empty list -> list of empty list
self.assertEqual(list(self.bigram_default[[[], []]]), [[], []])
# Iterator of empty list -> list of empty list
self.assertEqual(list(self.bigram_default[iter([[], []])]), [[], []])
# Iterator of empty iterator -> list of empty list
self.assertEqual(list(self.bigram_default[(iter(()) for i in range(2))]), [[], []])
def test_sentence_generation(self):
"""Test basic bigram using a dummy corpus."""
# test that we generate the same amount of sentences as the input
self.assertEqual(
len(self.sentences),
len(list(self.bigram_default[self.sentences])),
)
def test_sentence_generation_with_generator(self):
"""Test basic bigram production when corpus is a generator."""
self.assertEqual(
len(list(self.gen_sentences())),
len(list(self.bigram_default[self.gen_sentences()])),
)
def test_bigram_construction(self):
"""Test Phrases bigram construction."""
# with this setting we should get response_time and graph_minors
bigram1_seen = False
bigram2_seen = False
for sentence in self.bigram[self.sentences]:
if not bigram1_seen and self.bigram1 in sentence:
bigram1_seen = True
if not bigram2_seen and self.bigram2 in sentence:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
self.assertTrue(bigram1_seen and bigram2_seen)
# check the same thing, this time using single doc transformation
# last sentence should contain both graph_minors and human_interface
self.assertTrue(self.bigram1 in self.bigram[self.sentences[1]])
self.assertTrue(self.bigram1 in self.bigram[self.sentences[4]])
self.assertTrue(self.bigram2 in self.bigram[self.sentences[-2]])
self.assertTrue(self.bigram2 in self.bigram[self.sentences[-1]])
self.assertTrue(self.bigram3 in self.bigram[self.sentences[-1]])
def test_bigram_construction_from_generator(self):
"""Test Phrases bigram construction building when corpus is a generator."""
bigram1_seen = False
bigram2_seen = False
for s in self.bigram[self.gen_sentences()]:
if not bigram1_seen and self.bigram1 in s:
bigram1_seen = True
if not bigram2_seen and self.bigram2 in s:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
self.assertTrue(bigram1_seen and bigram2_seen)
def test_bigram_construction_from_array(self):
"""Test Phrases bigram construction building when corpus is a numpy array."""
bigram1_seen = False
bigram2_seen = False
for s in self.bigram[np.array(self.sentences, dtype=object)]:
if not bigram1_seen and self.bigram1 in s:
bigram1_seen = True
if not bigram2_seen and self.bigram2 in s:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
self.assertTrue(bigram1_seen and bigram2_seen)
# scorer for testCustomScorer
# function is outside of the scope of the test because for picklability of custom scorer
# Phrases tests for picklability
# all scores will be 1
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
return 1
class TestPhrasesModel(PhrasesCommon, unittest.TestCase):
def test_export_phrases(self):
"""Test Phrases bigram and trigram export phrases."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
trigram = Phrases(bigram[self.sentences], min_count=1, threshold=1, delimiter=' ')
seen_bigrams = set(bigram.export_phrases().keys())
seen_trigrams = set(trigram.export_phrases().keys())
assert seen_bigrams == set([
'human interface',
'response time',
'graph minors',
'minors survey',
])
assert seen_trigrams == set([
'human interface',
'graph minors survey',
])
def test_find_phrases(self):
"""Test Phrases bigram find phrases."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
seen_bigrams = set(bigram.find_phrases(self.sentences).keys())
assert seen_bigrams == set([
'response time',
'graph minors',
'human interface',
])
def test_multiple_bigrams_single_entry(self):
"""Test a single entry produces multiple bigrams."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
seen_bigrams = set(bigram.find_phrases(test_sentences).keys())
assert seen_bigrams == {'graph minors', 'human interface'}
def test_scoring_default(self):
"""Test the default scoring, from the mikolov word2vec paper."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())
assert seen_scores == {
5.167, # score for graph minors
3.444 # score for human interface
}
def test__getitem__(self):
"""Test Phrases[sentences] with a single sentence."""
bigram = Phrases(self.sentences, min_count=1, threshold=1)
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
phrased_sentence = next(bigram[test_sentences].__iter__())
assert phrased_sentence == ['graph_minors', 'survey', 'human_interface']
def test_scoring_npmi(self):
"""Test normalized pointwise mutual information scoring."""
bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())
assert seen_scores == {
.882, # score for graph minors
.714 # score for human interface
}
def test_custom_scorer(self):
"""Test using a custom scoring function."""
bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
seen_scores = list(bigram.find_phrases(test_sentences).values())
assert all(score == 1 for score in seen_scores)
assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system'
def test_bad_parameters(self):
"""Test the phrases module with bad parameters."""
# should fail with something less or equal than 0
self.assertRaises(ValueError, Phrases, self.sentences, min_count=0)
# threshold should be positive
self.assertRaises(ValueError, Phrases, self.sentences, threshold=-1)
def test_pruning(self):
"""Test that max_vocab_size parameter is respected."""
bigram = Phrases(self.sentences, max_vocab_size=5)
self.assertTrue(len(bigram.vocab) <= 5)
# endclass TestPhrasesModel
class TestPhrasesPersistence(PhrasesData, unittest.TestCase):
def test_save_load_custom_scorer(self):
"""Test saving and loading a Phrases object with a custom scorer."""
bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
with temporary_file("test.pkl") as fpath:
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
seen_scores = list(bigram_loaded.find_phrases(test_sentences).values())
assert all(score == 1 for score in seen_scores)
assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system'
def test_save_load(self):
"""Test saving and loading a Phrases object."""
bigram = Phrases(self.sentences, min_count=1, threshold=1)
with temporary_file("test.pkl") as fpath:
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())
assert seen_scores == set([
5.167, # score for graph minors
3.444 # score for human interface
])
def test_save_load_with_connector_words(self):
"""Test saving and loading a Phrases object."""
connector_words = frozenset({'of'})
bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=connector_words)
with temporary_file("test.pkl") as fpath:
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
assert bigram_loaded.connector_words == connector_words
def test_save_load_string_scoring(self):
"""Test backwards compatibility with a previous version of Phrases with custom scoring."""
bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())
assert seen_scores == set([
5.167, # score for graph minors
3.444 # score for human interface
])
def test_save_load_no_scoring(self):
"""Test backwards compatibility with old versions of Phrases with no scoring parameter."""
bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl"))
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())
assert seen_scores == set([
5.167, # score for graph minors
3.444 # score for human interface
])
def test_save_load_no_common_terms(self):
"""Ensure backwards compatibility with old versions of Phrases, before connector_words."""
bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
self.assertEqual(bigram_loaded.connector_words, frozenset())
# can make a phraser, cf #1751
phraser = FrozenPhrases(bigram_loaded) # does not raise
phraser[["human", "interface", "survey"]] # does not raise
class TestFrozenPhrasesPersistence(PhrasesData, unittest.TestCase):
def test_save_load_custom_scorer(self):
"""Test saving and loading a FrozenPhrases object with a custom scorer."""
with temporary_file("test.pkl") as fpath:
bigram = FrozenPhrases(
Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
bigram.save(fpath)
bigram_loaded = FrozenPhrases.load(fpath)
self.assertEqual(bigram_loaded.scoring, dumb_scorer)
def test_save_load(self):
"""Test saving and loading a FrozenPhrases object."""
with temporary_file("test.pkl") as fpath:
bigram = FrozenPhrases(Phrases(self.sentences, min_count=1, threshold=1))
bigram.save(fpath)
bigram_loaded = FrozenPhrases.load(fpath)
self.assertEqual(
bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
['graph_minors', 'survey', 'human_interface', 'system'])
def test_save_load_with_connector_words(self):
"""Test saving and loading a FrozenPhrases object."""
connector_words = frozenset({'of'})
with temporary_file("test.pkl") as fpath:
bigram = FrozenPhrases(Phrases(self.sentences, min_count=1, threshold=1, connector_words=connector_words))
bigram.save(fpath)
bigram_loaded = FrozenPhrases.load(fpath)
self.assertEqual(bigram_loaded.connector_words, connector_words)
def test_save_load_string_scoring(self):
"""Test saving and loading a FrozenPhrases object with a string scoring parameter.
This should ensure backwards compatibility with the previous version of FrozenPhrases"""
bigram_loaded = FrozenPhrases.load(datapath("phraser-scoring-str.pkl"))
# we do not much with scoring, just verify its the one expected
self.assertEqual(bigram_loaded.scoring, original_scorer)
def test_save_load_no_scoring(self):
"""Test saving and loading a FrozenPhrases object with no scoring parameter.
This should ensure backwards compatibility with old versions of FrozenPhrases"""
bigram_loaded = FrozenPhrases.load(datapath("phraser-no-scoring.pkl"))
# we do not much with scoring, just verify its the one expected
self.assertEqual(bigram_loaded.scoring, original_scorer)
def test_save_load_no_common_terms(self):
"""Ensure backwards compatibility with old versions of FrozenPhrases, before connector_words."""
bigram_loaded = FrozenPhrases.load(datapath("phraser-no-common-terms.pkl"))
self.assertEqual(bigram_loaded.connector_words, frozenset())
class TestFrozenPhrasesModel(PhrasesCommon, unittest.TestCase):
"""Test FrozenPhrases models."""
def setUp(self):
"""Set up FrozenPhrases models for the tests."""
bigram_phrases = Phrases(
self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
self.bigram = FrozenPhrases(bigram_phrases)
bigram_default_phrases = Phrases(self.sentences, connector_words=self.connector_words)
self.bigram_default = FrozenPhrases(bigram_default_phrases)
class CommonTermsPhrasesData:
"""This mixin permits to reuse tests with the connector_words option."""
sentences = [
['human', 'interface', 'with', 'computer'],
['survey', 'of', 'user', 'computer', 'system', 'lack', 'of', 'interest'],
['eps', 'user', 'interface', 'system'],
['system', 'and', 'human', 'system', 'eps'],
['user', 'lack', 'of', 'interest'],
['trees'],
['graph', 'of', 'trees'],
['data', 'and', 'graph', 'of', 'trees'],
['data', 'and', 'graph', 'survey'],
['data', 'and', 'graph', 'survey', 'for', 'human', 'interface'] # test bigrams within same sentence
]
connector_words = ['of', 'and', 'for']
bigram1 = u'lack_of_interest'
bigram2 = u'data_and_graph'
bigram3 = u'human_interface'
expression1 = u'lack of interest'
expression2 = u'data and graph'
expression3 = u'human interface'
def gen_sentences(self):
return ((w for w in sentence) for sentence in self.sentences)
class TestPhrasesModelCommonTerms(CommonTermsPhrasesData, TestPhrasesModel):
"""Test Phrases models with connector words."""
def test_multiple_bigrams_single_entry(self):
"""Test a single entry produces multiple bigrams."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ')
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
seen_bigrams = set(bigram.find_phrases(test_sentences).keys())
assert seen_bigrams == set([
'data and graph',
'human interface',
])
def test_find_phrases(self):
"""Test Phrases bigram export phrases."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ')
seen_bigrams = set(bigram.find_phrases(self.sentences).keys())
assert seen_bigrams == set([
'human interface',
'graph of trees',
'data and graph',
'lack of interest',
])
def test_export_phrases(self):
"""Test Phrases bigram export phrases."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
seen_bigrams = set(bigram.export_phrases().keys())
assert seen_bigrams == set([
'and graph',
'data and',
'graph of',
'graph survey',
'human interface',
'lack of',
'of interest',
'of trees',
])
def test_scoring_default(self):
""" test the default scoring, from the mikolov word2vec paper """
bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())
min_count = float(bigram.min_count)
len_vocab = float(len(bigram.vocab))
graph = float(bigram.vocab["graph"])
data = float(bigram.vocab["data"])
data_and_graph = float(bigram.vocab["data_and_graph"])
human = float(bigram.vocab["human"])
interface = float(bigram.vocab["interface"])
human_interface = float(bigram.vocab["human_interface"])
assert seen_scores == set([
# score for data and graph
round((data_and_graph - min_count) / data / graph * len_vocab, 3),
# score for human interface
round((human_interface - min_count) / human / interface * len_vocab, 3),
])
def test_scoring_npmi(self):
"""Test normalized pointwise mutual information scoring."""
bigram = Phrases(
self.sentences, min_count=1, threshold=.5,
scoring='npmi', connector_words=self.connector_words,
)
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())
assert seen_scores == set([
.74, # score for data and graph
.894 # score for human interface
])
def test_custom_scorer(self):
"""Test using a custom scoring function."""
bigram = Phrases(
self.sentences, min_count=1, threshold=.001,
scoring=dumb_scorer, connector_words=self.connector_words,
)
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
seen_scores = list(bigram.find_phrases(test_sentences).values())
assert all(seen_scores) # all scores 1
assert len(seen_scores) == 2 # 'data and graph' 'survey for human'
def test__getitem__(self):
"""Test Phrases[sentences] with a single sentence."""
bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
phrased_sentence = next(bigram[test_sentences].__iter__())
assert phrased_sentence == ['data_and_graph', 'survey', 'for', 'human_interface']
class TestFrozenPhrasesModelCompatibility(unittest.TestCase):
def test_compatibility(self):
phrases = Phrases.load(datapath("phrases-3.6.0.model"))
phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
test_sentences = ['trees', 'graph', 'minors']
self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()