Repository URL to install this package:
|
Version:
4.2.0 ▾
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Base Indexed Corpus class."""
import logging
import numpy
from gensim import interfaces, utils
logger = logging.getLogger(__name__)
class IndexedCorpus(interfaces.CorpusABC):
"""Indexed corpus is a mechanism for random-accessing corpora.
While the standard corpus interface in gensim allows iterating over corpus,
we'll show it with :class:`~gensim.corpora.mmcorpus.MmCorpus`.
.. sourcecode:: pycon
>>> from gensim.corpora import MmCorpus
>>> from gensim.test.utils import datapath
>>>
>>> corpus = MmCorpus(datapath('testcorpus.mm'))
>>> for doc in corpus:
... pass
:class:`~gensim.corpora.indexedcorpus.IndexedCorpus` allows accessing the documents with index
in :math:`{O}(1)` look-up time.
.. sourcecode:: pycon
>>> document_index = 3
>>> doc = corpus[document_index]
Notes
-----
This functionality is achieved by storing an extra file (by default named the same as the `fname.index`)
that stores the byte offset of the beginning of each document.
"""
def __init__(self, fname, index_fname=None):
"""
Parameters
----------
fname : str
Path to corpus.
index_fname : str, optional
Path to index, if not provided - used `fname.index`.
"""
try:
if index_fname is None:
index_fname = utils.smart_extension(fname, '.index')
self.index = utils.unpickle(index_fname)
# change self.index into a numpy.ndarray to support fancy indexing
self.index = numpy.asarray(self.index)
logger.info("loaded corpus index from %s", index_fname)
except Exception:
self.index = None
self.length = None
@classmethod
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
progress_cnt=None, labels=None, metadata=False):
"""Serialize corpus with offset metadata, allows to use direct indexes after loading.
Parameters
----------
fname : str
Path to output file.
corpus : iterable of iterable of (int, float)
Corpus in BoW format.
id2word : dict of (str, str), optional
Mapping id -> word.
index_fname : str, optional
Where to save resulting index, if None - store index to `fname`.index.
progress_cnt : int, optional
Number of documents after which progress info is printed.
labels : bool, optional
If True - ignore first column (class labels).
metadata : bool, optional
If True - ensure that serialize will write out article titles to a pickle file.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.corpora import MmCorpus
>>> from gensim.test.utils import get_tmpfile
>>>
>>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
>>> output_fname = get_tmpfile("test.mm")
>>>
>>> MmCorpus.serialize(output_fname, corpus)
>>> mm = MmCorpus(output_fname) # `mm` document stream now has random access
>>> print(mm[1]) # retrieve document no. 42, etc.
[(1, 0.1)]
"""
if getattr(corpus, 'fname', None) == fname:
raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)
if index_fname is None:
index_fname = utils.smart_extension(fname, '.index')
kwargs = {'metadata': metadata}
if progress_cnt is not None:
kwargs['progress_cnt'] = progress_cnt
if labels is not None:
kwargs['labels'] = labels
offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)
if offsets is None:
raise NotImplementedError(
"Called serialize on class %s which doesn't support indexing!" % serializer.__name__
)
# store offsets persistently, using pickle
# we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
# the offsets that are actually stored on disk - we're not storing self.index in any case, the
# load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
# backwards compatibility
logger.info("saving %s index to %s", serializer.__name__, index_fname)
utils.pickle(offsets, index_fname)
def __len__(self):
"""Get the index length.
Notes
-----
If the corpus is not indexed, also count corpus length and cache this value.
Returns
-------
int
Length of index.
"""
if self.index is not None:
return len(self.index)
if self.length is None:
logger.info("caching corpus length")
self.length = sum(1 for _ in self)
return self.length
def __getitem__(self, docno):
"""Get document by `docno` index.
Parameters
----------
docno : {int, iterable of int}
Document number or iterable of numbers (like a list of str).
Returns
-------
list of (int, float)
If `docno` is int - return document in BoW format.
:class:`~gensim.utils.SlicedCorpus`
If `docno` is iterable of int - return several documents in BoW format
wrapped to :class:`~gensim.utils.SlicedCorpus`.
Raises
------
RuntimeError
If index isn't exist.
"""
if self.index is None:
raise RuntimeError("Cannot call corpus[docid] without an index")
if isinstance(docno, (slice, list, numpy.ndarray)):
return utils.SlicedCorpus(self, docno)
elif isinstance(docno, (int, numpy.integer,)):
return self.docbyoffset(self.index[docno])
# TODO: no `docbyoffset` method, should be defined in this class
else:
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')