Gemfury

replicahq / pomegranate python

Repository URL to install this package:
Details
pomegranate / hmm / dense_hmm.py
# dense_hmm.py
# Author: Jacob Schreiber <jmschreiber91@gmail.com>

import math
import numpy
import torch

from .._utils import _cast_as_tensor
from .._utils import _cast_as_parameter
from .._utils import _update_parameter
from .._utils import _check_parameter

from ..distributions._distribution import Distribution

from ._base import _BaseHMM
from ._base import _check_inputs


NEGINF = float("-inf")
inf = float("inf")


class DenseHMM(_BaseHMM):
	"""A hidden Markov model with a dense transition matrix.

	A hidden Markov model is an extension of a mixture model to sequences by
	including a transition matrix between the elements of the mixture. Each of
	the algorithms for a hidden Markov model are essentially just a revision
	of those algorithms to incorporate this transition matrix.

	This object is a wrapper for a hidden Markov model with a dense transition
	matrix.

	This object is a wrapper for both implementations, which can be specified
	using the `kind` parameter. Choosing the right implementation will not
	effect the accuracy of the results but will change the speed at which they
	are calculated. 	

	Separately, there are two ways to instantiate the hidden Markov model. The
	first is by passing in a set of distributions, a dense transition matrix, 
	and optionally start/end probabilities. The second is to initialize the
	object without these and then to add edges using the `add_edge` method
	and to add distributions using the `add_distributions` method. Importantly, the way that
	you choose to initialize the hidden Markov model is independent of the
	implementation that you end up choosing. If you pass in a dense transition
	matrix, this will be converted to a sparse matrix with all the zeros
	dropped if you choose `kind='sparse'`.


	Parameters
	----------
	distributions: tuple or list
		A set of distribution objects. These objects do not need to be
		initialized, i.e., can be "Normal()". 

	edges: numpy.ndarray, torch.Tensor, or None. shape=(k,k), optional
		A dense transition matrix of probabilities for how each node or
		distribution passed in connects to each other one. This can contain
		many zeroes, and when paired with `kind='sparse'`, will drop those
		elements from the matrix.

	starts: list, numpy.ndarray, torch.Tensor, or None. shape=(k,), optional
		The probability of starting at each node. If not provided, assumes
		these probabilities are uniform.

	ends: list, numpy.ndarray, torch.Tensor, or None. shape=(k,), optional
		The probability of ending at each node. If not provided, assumes
		these probabilities are uniform.

	inertia: float, [0, 1], optional
		Indicates the proportion of the update to apply to the parameters
		during training. When the inertia is 0.0, the update is applied in
		its entirety and the previous parameters are ignored. When the
		inertia is 1.0, the update is entirely ignored and the previous
		parameters are kept, equivalently to if the parameters were frozen.

	frozen: bool, optional
		Whether all the parameters associated with this distribution are frozen.
		If you want to freeze individual pameters, or individual values in those
		parameters, you must modify the `frozen` attribute of the tensor or
		parameter directly. Default is False.

	check_data: bool, optional
		Whether to check properties of the data and potentially recast it to
		torch.tensors. This does not prevent checking of parameters but can
		slightly speed up computation when you know that your inputs are valid.
		Setting this to False is also necessary for compiling. Default is True.
	"""

	def __init__(self, distributions=None, edges=None, starts=None, ends=None, 
		init='random', max_iter=1000, tol=0.1, sample_length=None, 
		return_sample_paths=False, inertia=0.0, frozen=False, check_data=True,
		random_state=None, verbose=False):
		super().__init__(distributions=distributions, starts=starts, ends=ends, 
			init=init, max_iter=max_iter, tol=tol, sample_length=sample_length, 
			return_sample_paths=return_sample_paths, inertia=inertia,
			frozen=frozen, check_data=check_data, random_state=random_state, 
			verbose=verbose)
		
		self.name = "DenseHMM"
		n = len(distributions) if distributions is not None else 0

		if edges is not None:
			self.edges = _cast_as_parameter(torch.log(_check_parameter(
				_cast_as_tensor(edges), "edges", ndim=2, shape=(n, n), 
				min_value=0., max_value=1.)))

		self._initialized = (self.distributions is not None and
			self.starts is not None and self.ends is not None and
			self.edges is not None and 
			all(d._initialized for d in self.distributions))

		if self._initialized:
			self.distributions = torch.nn.ModuleList(
				self.distributions)

		self._reset_cache()

	def _reset_cache(self):
		"""Reset the internally stored statistics.

		This method is meant to only be called internally. It resets the
		stored statistics used to update the model parameters as well as
		recalculates the cached values meant to speed up log probability
		calculations.
		"""

		if self._initialized == False:
			return

		for node in self.distributions:
			node._reset_cache()

		self.register_buffer("_xw_sum", torch.zeros(self.n_distributions, 
			self.n_distributions, dtype=self.dtype, requires_grad=False, 
			device=self.device))

		self.register_buffer("_xw_starts_sum", torch.zeros(self.n_distributions, 
			dtype=self.dtype, requires_grad=False, device=self.device))

		self.register_buffer("_xw_ends_sum", torch.zeros(self.n_distributions, 
			dtype=self.dtype, requires_grad=False, device=self.device))

	def _initialize(self, X=None, sample_weight=None):
		"""Initialize the probability distribution.

		This method is meant to only be called internally. It initializes the
		parameters of the distribution and stores its dimensionality. For more
		complex methods, this function will do more.


		Parameters
		----------
		X: list, numpy.ndarray, torch.Tensor, shape=(-1, len, self.d), optional
			The data to use to initialize the model. Default is None.

		sample_weight: list, tuple, numpy.ndarray, torch.Tensor, optional
			A set of weights for the examples. This can be either of shape
			(-1, len) or a vector of shape (-1,). If None, defaults to ones.
			Default is None.
		"""

		super()._initialize(X, sample_weight=sample_weight)

		n = self.n_distributions
		if self.edges == None:
			self.edges = _cast_as_parameter(torch.log(torch.ones(n, n, 
				dtype=self.dtype, device=self.device) / n))

		self.distributions = torch.nn.ModuleList(self.distributions)

	def add_edge(self, start, end, prob):
		"""Add an edge to the model.

		This method will fill in an entry in the dense transition matrix
		at row indexed by the start distribution and the column indexed
		by the end distribution. The value that will be included is the
		log of the probability value provided. Note that this will override
		values that already exist, and that this will initialize a new
		dense transition matrix if none has been passed in so far.


		Parameters
		----------
		start: torch.distributions.distribution
			The distribution that the edge starts at.

		end: torch.distributions.distribution
			The distribution that the edge ends at.

		prob: float, (0.0, 1.0]
			The probability of that edge.
		"""

		if self.distributions is None:
			raise ValueError("Must add distributions before edges.")

		n = self.n_distributions

		if start == self.start:
			if self.starts is None:
				self.starts = torch.empty(n, dtype=self.dtype, 
					device=self.device) - inf

			idx = self.distributions.index(end)
			self.starts[idx] = math.log(prob)

		elif end == self.end:
			if self.ends is None:
				self.ends = torch.empty(n, dtype=self.dtype, 
					device=self.device) - inf

			idx = self.distributions.index(start)
			self.ends[idx] = math.log(prob)

		else:
			if self.edges is None:
				self.edges = torch.empty((n, n), dtype=self.dtype, 
					device=self.device) - inf

			idx1 = self.distributions.index(start)
			idx2 = self.distributions.index(end)

			self.edges[idx1, idx2] = math.log(prob)

	def sample(self, n):
		"""Sample from the probability distribution.

		This method will return `n` samples generated from the underlying
		probability distribution. Because a HMM describes variable length
		sequences, a list will be returned where each element is one of
		the generated sequences.


		Parameters
		----------
		n: int
			The number of samples to generate.
		

		Returns
		-------
		X: list of torch.tensor, shape=(n,)
			A list of randomly generated samples, where each sample of
			size (length, self.d).
		"""

		if self.sample_length is None and self.ends is None:
			raise ValueError("Must specify a length or have explicit "
				+ "end probabilities.")

		if self.ends is None:
			ends = torch.zeros(self.n_distributions, dtype=self.edges.dtype, 
				device=self.edges.device) + float("-inf")
		else:
			ends = self.ends

		distributions, emissions = [], []

		edge_probs = torch.hstack([self.edges, ends.unsqueeze(1)])
		edge_probs = torch.exp(edge_probs).numpy()

		starts = torch.exp(self.starts).numpy()

		for _ in range(n):
			node_i = self.random_state.choice(self.n_distributions, p=starts)
			emission_i = self.distributions[node_i].sample(n=1)
			distributions_, emissions_ = [node_i], [emission_i]

			for i in range(1, self.sample_length or int(1e8)):
				node_i = self.random_state.choice(self.n_distributions+1, 
					p=edge_probs[node_i])

				if node_i == self.n_distributions:
					break

				emission_i = self.distributions[node_i].sample(n=1)

				distributions_.append(node_i)
				emissions_.append(emission_i)

			distributions.append(distributions_)
			emissions.append(torch.vstack(emissions_))

		if self.return_sample_paths == True:
			return emissions, distributions
		return emissions

	def viterbi(self, X=None, emissions=None, priors=None):
		"""Run the Viterbi algorithm on some data.

		Runs the Viterbi algortihm on a batch of sequences. The Viterbi 
		algorithm is a dynamic programming algorithm that begins at the start
		state and calculates the single best path through the model involving
		alignments of symbol i to node j. This is in contrast to the forward
		function, which involves calculating the sum of all paths, not just
		the single best path. Because we have to keep track of the best path,
		the Viterbi algorithm is slightly more conceptually challenging and
		involves keeping track of a traceback matrix.


		Parameters
		----------
		X: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, d)
			A set of examples to evaluate. Does not need to be passed in if
			emissions are. 

		emissions: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, n_dists)
			Precalculated emission log probabilities. These are the
			probabilities of each observation under each probability 
			distribution. When running some algorithms it is more efficient
			to precalculate these and pass them into each call.

		priors: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, self.k)
			Prior probabilities of assigning each symbol to each node. If not
			provided, do not include in the calculations (conceptually
			equivalent to a uniform probability, but without scaling the
			probabilities). This can be used to assign labels to observatons
			by setting one of the probabilities for an observation to 1.0.
			Note that this can be used to assign hard labels, but does not
			have the same semantics for soft labels, in that it only
			influences the initial estimate of an observation being generated
			by a component, not gives a target. Default is None.


		Returns
		-------
		path: torch.Tensor, shape=(-1, -1)
			The state assignment for each observation in each sequence.
		""" 

		emissions = _check_inputs(self, X, emissions, priors)
		n, l = emissions.shape[:2]

		v = torch.clone(emissions.permute(1, 0, 2)).contiguous()
		v[0] += self.starts
		
		traceback = torch.zeros_like(v, dtype=torch.int32)
		traceback[0] = torch.arange(v.shape[-1])

		for i in range(1, l):
			z = v[i-1].unsqueeze(-1) + self.edges.unsqueeze(0) + v[i][:, None]
			v[i], traceback[i] = torch.max(z, dim=-2)

		ends = self.ends + v[-1]
		best_end_logps, best_end_idxs = torch.max(ends, dim=-1)

		paths = [best_end_idxs]
		for i in range(1, l):
			paths.append(traceback[l-i, torch.arange(n), paths[-1]])

		paths = torch.flip(torch.stack(paths).T, dims=(-1,))
		return paths


	def forward(self, X=None, emissions=None, priors=None):
		"""Run the forward algorithm on some data.

		Runs the forward algorithm on a batch of sequences. This is not to be
		confused with a "forward pass" when talking about neural networks. The
		forward algorithm is a dynamic programming algorithm that begins at the
		start state and returns the probability, over all paths through the
		model, that result in the alignment of symbol i to node j.

		
		Parameters
		----------
		X: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, d)
			A set of examples to evaluate. Does not need to be passed in if
			emissions are. 

		emissions: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, n_dists)
			Precalculated emission log probabilities. These are the
			probabilities of each observation under each probability 
			distribution. When running some algorithms it is more efficient
			to precalculate these and pass them into each call.

		priors: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, self.k)
			Prior probabilities of assigning each symbol to each node. If not
			provided, do not include in the calculations (conceptually
			equivalent to a uniform probability, but without scaling the
			probabilities). This can be used to assign labels to observatons
			by setting one of the probabilities for an observation to 1.0.
			Note that this can be used to assign hard labels, but does not
			have the same semantics for soft labels, in that it only
			influences the initial estimate of an observation being generated
			by a component, not gives a target. Default is None.


		Returns
		-------
		f: torch.Tensor, shape=(-1, -1, self.n_distributions)
			The log probabilities calculated by the forward algorithm.
		"""
		
		emissions = _check_inputs(self, X, emissions, priors)
		l = emissions.shape[1]

		t_max = self.edges.max()
		t = torch.exp(self.edges - t_max)
		f = torch.clone(emissions.permute(1, 0, 2)).contiguous()
		f[0] += self.starts
		f[1:] += t_max

		for i in range(1, l):
			p_max = torch.max(f[i-1], dim=1, keepdims=True).values
			p = torch.exp(f[i-1] - p_max)
			f[i] += torch.log(torch.matmul(p, t)) + p_max

		f = f.permute(1, 0, 2)
		return f

	def backward(self, X=None, emissions=None, priors=None):
		"""Run the backward algorithm on some data.

		Runs the backward algorithm on a batch of sequences. This is not to be
		confused with a "backward pass" when talking about neural networks. The
		backward algorithm is a dynamic programming algorithm that begins at end
		of the sequence and returns the probability, over all paths through the
		model, that result in the alignment of symbol i to node j, working
		backwards.

		
		Parameters
		----------
		X: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, d)
			A set of examples to evaluate. Does not need to be passed in if
			emissions are. 

		emissions: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, n_distributions)
			Precalculated emission log probabilities. These are the
			probabilities of each observation under each probability 
			distribution. When running some algorithms it is more efficient
			to precalculate these and pass them into each call.

		priors: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, self.k)
			Prior probabilities of assigning each symbol to each node. If not
			provided, do not include in the calculations (conceptually
			equivalent to a uniform probability, but without scaling the
			probabilities). This can be used to assign labels to observatons
			by setting one of the probabilities for an observation to 1.0.
			Note that this can be used to assign hard labels, but does not
			have the same semantics for soft labels, in that it only
			influences the initial estimate of an observation being generated
			by a component, not gives a target. Default is None.


		Returns
		-------
		b: torch.Tensor, shape=(-1, length, self.n_distributions)
			The log probabilities calculated by the backward algorithm.
		"""

		emissions = _check_inputs(self, X, emissions, priors)
		n, l, _ = emissions.shape

		b = torch.zeros(l, n, self.n_distributions, dtype=self.dtype, 
			device=self.device) + float("-inf")
		b[-1] = self.ends

		t_max = self.edges.max()
		t = torch.exp(self.edges.T - t_max)

		for i in range(l-2, -1, -1):
			p = b[i+1] + emissions[:, i+1]
			p_max = torch.max(p, dim=1, keepdims=True).values
			p = torch.exp(p - p_max)

			b[i] = torch.log(torch.matmul(p, t)) + t_max + p_max

		b = b.permute(1, 0, 2)
		return b

	def forward_backward(self, X=None, emissions=None, priors=None):
		"""Run the forward-backward algorithm on some data.

		Runs the forward-backward algorithm on a batch of sequences. This
		algorithm combines the best of the forward and the backward algorithm.
		It combines the probability of starting at the beginning of the sequence
		and working your way to each observation with the probability of
		starting at the end of the sequence and working your way backward to it.

		A number of statistics can be calculated using this information. These
		statistics are powerful inference tools but are also used during the
		Baum-Welch training process. 

		
		Parameters
		----------
		X: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, d)
			A set of examples to evaluate. Does not need to be passed in if
			emissions are. 

		emissions: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, n_distributions)
			Precalculated emission log probabilities. These are the
			probabilities of each observation under each probability 
			distribution. When running some algorithms it is more efficient
			to precalculate these and pass them into each call.

		priors: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, self.k)
			Prior probabilities of assigning each symbol to each node. If not
			provided, do not include in the calculations (conceptually
			equivalent to a uniform probability, but without scaling the
			probabilities). This can be used to assign labels to observatons
			by setting one of the probabilities for an observation to 1.0.
			Note that this can be used to assign hard labels, but does not
			have the same semantics for soft labels, in that it only
			influences the initial estimate of an observation being generated
			by a component, not gives a target. Default is None.


		Returns
		-------
		transitions: torch.Tensor, shape=(-1, n, n)
			The expected number of transitions across each edge that occur
			for each example. The returned transitions follow the structure
			of the transition matrix and so will be dense or sparse as
			appropriate.

		responsibility: torch.Tensor, shape=(-1, -1, n)
			The posterior probabilities of each observation belonging to each
			state given that one starts at the beginning of the sequence,
			aligns observations across all paths to get to the current
			observation, and then proceeds to align all remaining observations
			until the end of the sequence.

		starts: torch.Tensor, shape=(-1, n)
			The probabilities of starting at each node given the 
			forward-backward algorithm.

		ends: torch.Tensor, shape=(-1, n)
			The probabilities of ending at each node given the forward-backward
			algorithm.

		logp: torch.Tensor, shape=(-1,)
			The log probabilities of each sequence given the model.
		"""

		emissions = _check_inputs(self, X, emissions, priors)
		n, l, _ = emissions.shape

		f = self.forward(emissions=emissions)
		b = self.backward(emissions=emissions)

		logp = torch.logsumexp(f[:, -1] + self.ends, dim=1)

		f_ = f[:, :-1].unsqueeze(-1)
		b_ = (b[:, 1:] + emissions[:, 1:]).unsqueeze(-2)

		t = f_ + b_ + self.edges.unsqueeze(0).unsqueeze(0)
		t = t.reshape(n, l-1, -1)
		t = torch.exp(torch.logsumexp(t, dim=1).T - logp).T
		t = t.reshape(n, int(t.shape[1] ** 0.5), -1)

		starts = self.starts + emissions[:, 0] + b[:, 0]
		starts = torch.exp(starts.T - torch.logsumexp(starts, dim=-1)).T

		ends = self.ends + f[:, -1]
		ends = torch.exp(ends.T - torch.logsumexp(ends, dim=-1)).T

		r = f + b
		r = r - torch.logsumexp(r, dim=2).reshape(n, -1, 1)
		return t, r, starts, ends, logp

	def summarize(self, X, sample_weight=None, emissions=None, priors=None):
		"""Extract the sufficient statistics from a batch of data.

		This method calculates the sufficient statistics from optionally
		weighted data and adds them to the stored cache. The examples must be
		given in a 2D format. Sample weights can either be provided as one
		value per example or as a 2D matrix of weights for each feature in
		each example.


		Parameters
		----------
		X: torch.Tensor, shape=(-1, -1, self.d)
			A set of examples to summarize.

		y: torch.Tensor, shape=(-1, -1), optional 
			A set of labels with the same number of examples and length as the
			observations that indicate which node in the model that each
			observation should be assigned to. Passing this in means that the
			model uses labeled training instead of Baum-Welch. Default is None.

		sample_weight: torch.Tensor, optional
			A set of weights for the examples. This can be either of shape
			(-1, self.d) or a vector of shape (-1,). Default is ones.

		emissions: torch.Tensor, shape=(-1, -1, self.n_distributions)
			Precalculated emission log probabilities. These are the
			probabilities of each observation under each probability 
			distribution. When running some algorithms it is more efficient
			to precalculate these and pass them into each call.	

		priors: list, numpy.ndarray, torch.Tensor, shape=(-1, -1, self.k)
			Prior probabilities of assigning each symbol to each node. If not
			provided, do not include in the calculations (conceptually
			equivalent to a uniform probability, but without scaling the
			probabilities). This can be used to assign labels to observatons
			by setting one of the probabilities for an observation to 1.0.
			Note that this can be used to assign hard labels, but does not
			have the same semantics for soft labels, in that it only
			influences the initial estimate of an observation being generated
			by a component, not gives a target. Default is None.
		"""

		X, emissions, sample_weight = super().summarize(X, 
			sample_weight=sample_weight, emissions=emissions, priors=priors)

		t, r, starts, ends, logps = self.forward_backward(emissions=emissions)

		X = X.reshape(-1, X.shape[-1])
		r = torch.exp(r) * sample_weight.unsqueeze(-1)
		for i, node in enumerate(self.distributions):
			w = r[:, :, i].reshape(-1, 1)
			node.summarize(X, sample_weight=w)

		if self.frozen == False:
			self._xw_starts_sum += torch.sum(starts * sample_weight, dim=0)
			self._xw_ends_sum += torch.sum(ends * sample_weight, dim=0)
			self._xw_sum += torch.sum(t * sample_weight.unsqueeze(-1), dim=0) 

		return logps

	def from_summaries(self):
		"""Update the model parameters given the extracted statistics.

		This method uses calculated statistics from calls to the `summarize`
		method to update the distribution parameters. Hyperparameters for the
		update are passed in at initialization time.

		Note: Internally, a call to `fit` is just a successive call to the
		`summarize` method followed by the `from_summaries` method.
		"""

		for node in self.distributions:
			node.from_summaries()

		if self.frozen:
			return

		node_out_count = torch.sum(self._xw_sum, dim=1, keepdims=True)
		node_out_count += self._xw_ends_sum.unsqueeze(1)

		ends = torch.log(self._xw_ends_sum / node_out_count[:,0])
		starts = torch.log(self._xw_starts_sum / self._xw_starts_sum.sum())
		edges = torch.log(self._xw_sum / node_out_count)

		_update_parameter(self.ends, ends, inertia=self.inertia)
		_update_parameter(self.starts, starts, inertia=self.inertia)
		_update_parameter(self.edges, edges, inertia=self.inertia)
		self._reset_cache()
replicahq / pomegranate python

Products

About

Resources

Contact Gemfury