Repository URL to install this package:
|
Version:
4.2.0 ▾
|
# Copyright (C) 2018 Radim Rehurek <radimrehurek@seznam.cz>
# cython: embedsignature=True
"""Reader for corpus in the Matrix Market format."""
import logging
cimport cython
from libc.stdio cimport sscanf
from gensim import utils
logger = logging.getLogger(__name__)
cdef class MmReader():
    """Matrix market file reader (fast Cython version), used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`.

    Wrap a term-document matrix on disk (in matrix-market format), and present it
    as an object which supports iteration over the rows (~documents).

    Attributes
    ----------
    num_docs : int
        Number of documents in the market matrix file.
    num_terms : int
        Number of terms.
    num_nnz : int
        Number of non-zero terms.

    Notes
    -----
    Note that the file is read into memory one document at a time, not the whole matrix at once
    (unlike e.g. `scipy.io.mmread` and other implementations).
    This allows us to process corpora which are larger than the available RAM.

    """
    cdef public input
    cdef public bint transposed
    cdef public long long num_docs, num_terms, num_nnz

    def __init__(self, input, transposed=True):
        """Read the Matrix Market header and the matrix dimensions from `input`.

        Parameters
        ----------
        input : {str, file-like object}
            Path to the input file in MM format or a file-like object that supports `seek()`
            (e.g. smart_open objects).
        transposed : bool, optional
            Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`?

        Raises
        ------
        ValueError
            If the first line is not a `%%MatrixMarket matrix coordinate real general` banner.

        """
        logger.info("initializing cython corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.open_file(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                        (self.input, header)
                    )
            except StopIteration:
                # completely empty input: keep the zero dimensions assigned below
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for line in lines:
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    # first non-comment line holds the three matrix dimensions
                    self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )

    def __len__(self):
        """Get the corpus size: total number of documents."""
        return self.num_docs

    def __str__(self):
        """Get a short human-readable summary of the corpus dimensions."""
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def skip_headers(self, input_file):
        """Skip file headers that appear before the first document.

        Consumes every leading line that starts with ``%`` and also the first line
        that does not (the dimensions line), leaving `input_file` positioned at the
        first matrix entry.

        Parameters
        ----------
        input_file : iterable of bytes
            Iterable taken from file in MM format; lines are compared against ``b'%'``.

        """
        for line in input_file:
            if line.startswith(b'%'):
                continue
            break

    def __iter__(self):
        """Iterate through all documents in the corpus.

        Notes
        -----
        Note that the total number of vectors returned is always equal to the number of rows specified
        in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly
        stored in the Matrix Market file.

        Yields
        ------
        (int, list of (int, number))
            Document id and document in sparse bag-of-words format.

        Raises
        ------
        ValueError
            If a line cannot be parsed as `int int float`.

        """
        cdef long long docid, termid, previd
        cdef double val = 0

        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:
                if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3):
                    raise ValueError("unable to parse line: {}".format(line))

                if not self.transposed:
                    termid, docid = docid, termid

                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid -= 1
                termid -= 1

                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is prevId)
                    if previd >= 0:
                        yield previd, document  # noqa:F821

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in range(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in range(previd + 1, self.num_docs):
            yield previd, []

    def docbyoffset(self, offset):
        """Get the document at file offset `offset` (in bytes).

        Reading starts at `offset` and stops as soon as the document id changes
        or the input is exhausted.

        Parameters
        ----------
        offset : int
            File offset, in bytes, of the desired document.
            The special value -1 denotes an empty document.

        Returns
        -------
        list of (int, float)
            Document in sparse bag-of-words format.

        Raises
        ------
        ValueError
            If a line cannot be parsed as `int int float`.

        """
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        cdef long long docid, termid, previd
        cdef double val

        if offset == -1:
            return []

        if isinstance(self.input, str):
            fin, close_fin = utils.open(self.input, 'rb'), True
        else:
            # caller-supplied file object: it stays open, the caller owns it
            fin, close_fin = self.input, False

        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            previd, document = -1, []
            for line in fin:
                if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3):
                    raise ValueError("unable to parse line: {}".format(line))

                if not self.transposed:
                    termid, docid = docid, termid

                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid -= 1
                termid -= 1

                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    if previd >= 0:
                        # reached the first entry of the next document: done
                        break
                    previd = docid

                document.append((termid, val,))  # add another field to the current document
        finally:
            # close the handle we opened ourselves even when parsing fails,
            # so a malformed file does not leak an open file descriptor
            if close_fin:
                fin.close()

        return document