From a804d8bcc28e00f47b128d46d7be53a1825f749c Mon Sep 17 00:00:00 2001 From: Sebastian <72598707+sebastianbot6969@users.noreply.github.com> Date: Sun, 29 Dec 2024 11:53:19 +0100 Subject: [PATCH] giovanni recommendations (#70) * made most of giovannis corrections for the implementation chapter, not done though * rewrote the things giovanni wanted in preliminaries * added some text on how we do matrix multiplication * made runges comments --- report/src/bib/main.bib | 32 +++ .../figures/vector_add_example_reduced.tex | 30 +-- report/src/sections/02-preliminaries.tex | 214 +++++++++--------- report/src/sections/03-implementation.tex | 177 +++++++-------- 4 files changed, 231 insertions(+), 222 deletions(-) diff --git a/report/src/bib/main.bib b/report/src/bib/main.bib index 061071e..402f60d 100644 --- a/report/src/bib/main.bib +++ b/report/src/bib/main.bib @@ -211,4 +211,36 @@ @inproceedings{benyacoub2015initial pages={359--368}, year={2015}, organization={Springer} +} + +@article{bryant1986graph, + title={Graph-based algorithms for boolean function manipulation}, + author={Bryant, Randal E}, + journal={Computers, IEEE Transactions on}, + volume={100}, + number={8}, + pages={677--691}, + year={1986}, + publisher={IEEE} +} + +@article{bahar1997algebric, + title={Algebric decision diagrams and their applications}, + author={Bahar, R Iris and Frohm, Erica A and Gaona, Charles M and Hachtel, Gary D and Macii, Enrico and Pardo, Abelardo and Somenzi, Fabio}, + journal={Formal methods in system design}, + volume={10}, + pages={171--206}, + year={1997}, + publisher={Springer} +} + +@article{rabiner1989tutorial, + title={A tutorial on hidden Markov models and selected applications in speech recognition}, + author={Rabiner, Lawrence R}, + journal={Proceedings of the IEEE}, + volume={77}, + number={2}, + pages={257--286}, + year={1989}, + publisher={Ieee} } \ No newline at end of file diff --git a/report/src/figures/vector_add_example_reduced.tex b/report/src/figures/vector_add_example_reduced.tex index 8b63ecf..d7c4e3f 100644 --- a/report/src/figures/vector_add_example_reduced.tex +++ b/report/src/figures/vector_add_example_reduced.tex @@ -1,25 +1,17 @@ \begin{tikzpicture}[node distance=1cm and 0.5cm] \node[c] (x) {$x_1$}; - \node[c] (a) [below left=of x] {$y_1$}; - \node[c] (b) [below left=of a] {$x_2$}; - \node[c] (c) [below left=of b] {$y_2$}; - \node[c] (d) [below right=of b] {$y_2$}; + \node[c] (a) [below left=of x] {$x_2$}; + \node[c] (b) [below right=of x] {$x_2$}; - \node[t] (final-1) [below left=of c] {1}; - \node[t] (final-2) [below right=of c] {2}; - \node[t] (final-3) [right=of final-2] {3}; - \node[t] (final-4) [right=of final-3] {4}; - \node[t] (final-5) [right=of final-4] {0}; + \node[t] (final-1) [below left=of a] {1}; + \node[t] (final-2) [right=of final-1] {2}; + \node[t] (final-4) [below right=of b] {4}; + \node[t] (final-3) [left=of final-4] {3}; \draw[zeroarrow] (x) -- (a); - \draw[onearrow] (x) -- (final-5); - \draw[zeroarrow] (a) -- (b); - \draw[zeroarrow] (b) -- (c); - \draw[onearrow] (b) -- (d); - \draw[zeroarrow] (c) -- (final-1); - \draw[onearrow] (c) -- (final-2); - \draw[zeroarrow] (d) -- (final-3); - \draw[onearrow] (d) -- (final-4); - \draw[onearrow] (a) -- (final-5); - + \draw[zeroarrow] (a) -- (final-1); + \draw[onearrow] (a) -- (final-2); + \draw[zeroarrow] (x) -- (b); + \draw[onearrow] (b) -- (final-3); + \draw[zeroarrow] (b) -- (final-4); \end{tikzpicture} \ No newline at end of file diff --git a/report/src/sections/02-preliminaries.tex b/report/src/sections/02-preliminaries.tex index 4b3b617..dc7752d 100644 --- a/report/src/sections/02-preliminaries.tex +++ b/report/src/sections/02-preliminaries.tex @@ -33,13 +33,13 @@ \subsection{Hidden Markov Models}\label{subsec:hmm} \subsection{Matrix Representation of HMMs}\label{subsec:matrix-representation} \glspl{hmm} can be represented using various matrices. -The emission function $\mathscr{l}$ can be represented as a matrix $\omega$ where $\omega_{s, l} = \mathscr{l}(s)(l)$. -The matrix $\omega$ has the size $|S| \times |\mathcal{L}|$. -The sum of each row in the matrix $\omega$ is equal to one, reflecting the total probability of emitting all labels from a given state. +The emission function $\mathscr{l}$ can be represented as a matrix $\pmb{\omega}$ where $\pmb{\omega}_{s, l} = \mathscr{l}(s)(l)$. +The matrix $\pmb{\omega}$ has the size $|S| \times |\mathcal{L}|$. +The sum of each row in the matrix $\pmb{\omega}$ is equal to one, reflecting the total probability of emitting all labels from a given state. \[ - \omega = \begin{bmatrix} + \pmb{\omega} = \begin{bmatrix} \mathscr{l}(s_1)(l_1) & \cdots & \mathscr{l}(s_1)(l_{|\mathcal{L}|}) \\ \vdots & \ddots & \vdots \\ \mathscr{l}(s_{|S|})(l_1) & \cdots & \mathscr{l}(s_{|S|})(l_{|\mathcal{L}|}) @@ -47,24 +47,24 @@ \subsection{Matrix Representation of HMMs}\label{subsec:matrix-representation} \] -The transition function $\tau$ can be represented as a stochastic matrix $P$ where $P_{s, s'} = \tau(s)(s')$. -The matrix $P$ has the size $|S| \times |S|$. -The sum of each row in $P$ is equal to one, reflecting the total probability of transitioning from a given state to all other states. +The transition function $\tau$ can be represented as a stochastic matrix $\pmb{P}$ where $\pmb{P}_{s, s'} = \tau(s)(s')$. +The matrix $\pmb{P}$ has the size $|S| \times |S|$. +The sum of each row in $\pmb{P}$ is equal to one, reflecting the total probability of transitioning from a given state to all other states. \[ - P = \begin{bmatrix} + \pmb{P} = \begin{bmatrix} \tau(s_1)(s_1) & \cdots & \tau(s_1)(s_{|S|}) \\ \vdots & \ddots & \vdots \\ \tau(s_{|S|})(s_1) & \cdots & \tau(s_{|S|})(s_{|S|}) \end{bmatrix} \] -The initial distribution $\pi$ can be represented as a vector $\pi$ where $\pi_s = \pi(s)$. -The vector $\pi$ has the size $|S|$. -The sum of all elements in $\pi$ is equal to one, reflecting the fact that $\pi \in D(s)$. +The initial distribution $\pi$ can be represented as a vector $\pmb{\pi}$ where $\pmb{\pi}_s = \pi(s)$. +The vector $\pmb{\pi}$ has the size $|S|$. +The sum of all elements in $\pmb{\pi}$ is equal to one, reflecting the fact that $\pi \in D(s)$. \[ - \pi = \begin{bmatrix} + \pmb{\pi} = \begin{bmatrix} \pi(s_1) \\ \vdots \\ \pi(s_{|S|}) @@ -102,24 +102,23 @@ \subsection{The Baum-Welch Algorithm}\label{subsec:baum-welch} \begin{enumerate} \item \textbf{Initialization:} Begin with initial estimates for the \gls{hmm} parameters $(\pi, P, \omega)$. - \item \textbf{Expectation Step (E-step):} - Compute the forward probabilities $\alpha_s(t)$ and backward probabilities $\beta_s(t)$, which represent: - \begin{itemize} - \item The probability of observing the sequence up to time $t$, given that the \gls{hmm} is in state $s$ at time $t$ ($\alpha_s(t)$). - \item The probability of observing the sequence from time $t+1$ to the end, given that the \gls{hmm} is in state $s$ at time $t$ ($\beta_s(t)$). - \end{itemize} + \item \textbf{Expectation Step (E-step):} Compute the expected counts of the latent variables, i.e., the hidden states, based on the observation sequence and the current model parameters. + That is we compute the probabilities of observing the sequence up to time $t$, given that the \gls{hmm} is in state $s$ at time $t$ and the probabilities of observing the sequence from time $t+1$ to the end, given that the \gls{hmm} is in state $s$ at time $t$. \item \textbf{Maximization Step (M-step):} Update the \gls{hmm} parameters $(\pi, P, \omega)$ to maximize the likelihood of the observed data based on the expected counts computed in the E-step. \item \textbf{Iteration:} Repeat the E-step and M-step until convergence, i.e., when the change in likelihood between iterations falls below a predefined threshold. \end{enumerate} -The Baum-Welch algorithm refines the \gls{hmm} parameters by iteratively maximizing the likelihood of the observations. + +The Baum Welch algorithm seeks to estimate the parameters $\tau$, $\pi$, and $\omega$ of a \gls{hmm} model $\mathcal{M}$ so that it maximises the likelihood function $l(\mathcal{M} | O)$. +That is, the probability that the \gls{hmm} $\mathcal{M}$ has independently generated each observation sequence $O_1, \ldots, O_N$. + Starting with an initial hypothesis $\textbf{x}_0 = (\pi, P, \omega)$, the algorithm produces a sequence of parameter estimates $\textbf{x}_1, \textbf{x}_2, \ldots$, where each new estimate improves upon the previous one. The process terminates when the improvement in likelihood is sufficiently small, satisfying the convergence criterion: \[ - ||l(o,x_n)|| < \epsilon + ||l(x_n, )|| < \epsilon \] -Where $l(o, x_n)$ is the likelihood of the observations under the parameters $x_n$, and $\epsilon > 0$ is a small threshold. +Where $l(x_n, o)$ is the likelihood of all the observations under the parameters $x_n$, and $\epsilon > 0$ is a small threshold. %where $l(\textbf{x}_n)$ is the likelihood of the observations under the parameters $\textbf{x}_n$, and $\epsilon > 0$ is a small threshold. The steps of the Baum-Welch algorithm are detailed in the following sections, including the initialization of the \gls{hmm} parameters, the forward-backward algorithm, and the update algorithm, see \autoref{subsec:initialization},~\ref{subsec:forward-backwards_algorithm}, and~\ref{subsec:update-algorithm} respectively. @@ -129,43 +128,28 @@ \subsection{Initialization of HMM Parameters}\label{subsec:initialization} Since the algorithm is designed to converge iteratively toward a locally optimal parameter set, the initial estimates do not need to be exact. However, reasonable initialization can accelerate convergence and improve numerical stability~\cite{benyacoub2015initial}. -As we are working with \glspl{hmm} we do not know which labels are emitted by which states, and we do not know the transition probabilities between states,therefore we cannot initialize the parameters based on some prior knowledge. -A common approach to initialize these parameters is as follows: - -\subsubsection{Initialization state probability distribution} -The initial state probabilities $\pi$ represent the likelihood of starting in each hidden state $s \in S$. -To estimate initial state probabilities, we can use one of the following strategies: - -\begin{itemize} - % \item Examine the observed data: Estimate the initial probabilities based on the observed data. For example, if certain states are more likely to occur at the beginning of the sequence, assign higher probabilities to these states. - % $\pi_s = \frac{R_s}{\sum_{s' \in S} R_{s'}}$ where $R_s$ is the number of times state $s$ occurs at the beginning of the sequence. - \item Random initialization: Assign random probabilities to each state $s \in S$, such that $\sum_{s \in S} \pi_s = 1$. - \item Uniform initialization: Set $\pi_s = \frac{1}{|S|}$ for all $s \in S$. -\end{itemize} - -\textit{Initialization transition probability distribution}: - -The transition matrix $P$ represents the probability of transitioning from one state to another. -To estimate transition probabilities, we can use one of the following strategies: -\begin{itemize} - % \item Examine the observed data: Estimate the transition probabilities based on the observed data. For example, if certain states are more likely to transition to others, assign higher probabilities to these transitions. - % $P_{s \rightarrow s'} = \frac{C_{s \rightarrow s'}}{\sum_{s'' \in S} C_{s \rightarrow s''}}$ where $C_{s \rightarrow s'}$ is the number of transitions from state $s$ to state $s'$. - \item Random initialization: Assign random probabilities to each transition $P_{s \rightarrow s'}$, such that $\sum_{s' \in S} P_{s \rightarrow s'} = 1$. - \item Uniform initialization: Set $P_{s \rightarrow s'} = \frac{1}{|S|}$ for all $s, s' \in S$. -\end{itemize} - -\textit{Initialization emission probability distribution}: - -The emission matrix $\omega$ represents the probability of emitting each observation label from each hidden state. -To estimate emission probabilities, we can use one of the following strategies: -\begin{itemize} - % \item Examine the observed data: Estimate the emission probabilities based on the observed data. For example, if certain states are more likely to emit certain labels, assign higher probabilities to these emissions. - % $\omega_{s, l} = \frac{C_{s, l}}{\sum_{l' \in \mathcal{L}} C_{s, l'}}$ where $C_{s, l}$ is the number of times state $s$ emits label $l$. - \item Random initialization: Assign random probabilities to each emission $\omega_{s, l}$, such that $\sum_{l \in \mathcal{L}} \omega_{s, l} = 1$ for all $s \in S$. - \item Uniform initialization: Set $\omega_{s, l} = \frac{1}{|\mathcal{L}|}$ for all $s \in S, l \in \mathcal{L}$. -\end{itemize} - -We initialize the parameters using uniform initialization for the emission matrix $\omega$, the transition matrix $P$, and the initial state distribution $\pi$. + +As we work with models that we have no prior knowledge of, meaning we dont know the number of observations, or what states that generated the observations, we cannot initialize th model parameters based on some domain knowledge. +Therefore we need to initialize the parameters based on some strategy. +A common approach to initialize these parameters is as one of the following strategies: + +\begin{enumerate} + \item Random initialization: + \begin{itemize} + \item Assign random probabilities to for the initial state distribution $\pi_s$, such that $\sum_{s \in S} \pi_s = 1$. + \item Assign random probabilities to each transition $P_{s, s'}$, such that $\sum_{s' \in S} P_{s, s'} = 1$. + \item Assign random probabilities to each emission $\omega_{s, l}$, such that $\sum_{l \in \mathcal{L}} \omega_{s, l} = 1$ for all $s \in S$. + \end{itemize} + \item Uniform initialization: + \begin{itemize} + \item Set $\pi_s = \frac{1}{|S|}$ for all $s \in S$. + \item Set $P_{s, s'} = \frac{1}{|S|}$ for all $s, s' \in S$. + \item Set $\omega_{s, l} = \frac{1}{|\mathcal{L}|}$ for all $s \in S, l \in \mathcal{L}$. + \end{itemize} +\end{enumerate} + +We initialize the parameters using random initialization, as it provides a diverse set of initial values that can help avoid local optima. +The uniform initialization is the worst choice as it provides a very limited set of initial values that can lead to local optima, and it is not recommended for practical use. These initialization strategies provide a starting point for the Baum-Welch algorithm to iteratively refine the model parameters based on the observed data. @@ -175,18 +159,6 @@ \subsection{Forward-Backward Algorithm}\label{subsec:forward-backwards_algorithm The forward variable $\alpha_s(t)$ represents the likelihood of observing the partial sequence $o_0, o_1, \dots, o_t$ and being in state $s$ at time $t$, given the model $\mathcal{M}$. The backward variable $\beta_s(t)$ represents the likelihood of observing the partial sequence $o_{t+1}, o_{t+2}, \dots, o_{|\mathbf{o}|-1}$ given state $s$ at time $t$ and the model $\mathcal{M}$. - -% \begin{equation} -% \alpha_s(t) = l(o_0, o_1, \dots, o_t, S_{t} = s \mid \mathcal{M}) -% \label{eq:alpha-recursive} -% \end{equation} - - -% \begin{equation} -% \beta_s(t) = l(o_{t+1}, o_{t+2}, \dots, o_{|\mathbf{o}|-1} \mid S_{t} = s, \mathcal{M}) -% \label{eq:beta-recursive} -% \end{equation} - The forward variable $\alpha_s(t)$ and backward variable $\beta_s(t)$ can be computed recursively as follows: \begin{equation} @@ -225,7 +197,7 @@ \subsection{Update Algorithm}\label{subsec:update-algorithm} \subsubsection{Intermediate Variables} We need to calculate the intermediate variables $\gamma_s(t)$ and $\xi_{ss'}(t)$. -$\gamma_s(t)$ represent the expected number of times the model is in state $s$ at time $t$ and $\xi_{ss'}(t)$ represent the expected number of transitions from state $s$ to state $s'$ at time $t$. +$\gamma_s(t)$ represent the expected number of times the model is in state $s$ at time $t$ given that the sequence $O$ was observed and $\xi_{ss'}(t)$ represent the expected number of transitions from state $s$ to state $s'$ at time $t$ given that the sequence $O$ was observed. For a given \gls{hmm} $\mathcal{M}$, the intermediate variables, $\gamma_s(t)$ and $\xi_{ss'}(t)$, are computed for each observation sequence $o_0, o_1, \dots, o_{|\mathbf{o}|-1} = \mathbf{o} \in \mathcal{O}$. These variables are computed as follows: @@ -302,32 +274,32 @@ \subsection{Matrix Operations}\label{subsec:matrix-operations} %Baum-Welch with matrix operations The Baum-Welch algorithm can be implemented using matrix operations to efficiently compute the forward and backward variables, intermediate variables, and parameter updates. -Given a \gls{hmm} $\mathcal{M}$ with parameters $\omega$, $P$, and $\pi$, and an observation sequence $\mathbf{o}$, the forward and backward variables $\alpha_t$ and $\beta_t$ can be computed using matrix operations as follows: +Given a \gls{hmm} $\mathcal{M}$ with parameters $\pmb{\omega}$, $\pmb{P}$, and $\pmb{\pi}$, and an observation sequence $\mathbf{o}$, the forward and backward variables $\pmb{\alpha}_t$ and $\pmb{\beta}_t$ can be computed using matrix operations as follows: \begin{equation} \label{eq:alpha} - \alpha_t = + \pmb{\alpha}_t = \begin{cases} - \omega_0 \; \circ \; \pi & \text{if } t = 0 \\ - \omega_t \; \circ \; \left( {P}^\top \alpha_{t - 1} \right) & \text{if } 0 < t \leq |\mathbf{o}|-1 \\ + \pmb{\omega}_0 \; \circ \; \pmb{\pi} & \text{if } t = 0 \\ + \pmb{\omega}_t \; \circ \; \left( \pmb{P}^\top \pmb{\alpha}_{t - 1} \right) & \text{if } 0 < t \leq |\mathbf{o}|-1 \\ \end{cases} \end{equation} \begin{equation} \label{eq:beta} - {\beta}_t = + \pmb{\beta}_t = \begin{cases} \mathbbm{1} & \text{if } t = |\mathbf{o}|-1 \\ - {P} \; ({\beta}_{t + 1} \; \circ \; {\omega}_{t + 1}) & \text{if } 0 \leq t < |\mathbf{o}|-1 \\ + \pmb{P} \; (\pmb{\beta}_{t + 1} \; \circ \; \pmb{\omega}_{t + 1}) & \text{if } 0 \leq t < |\mathbf{o}|-1 \\ \end{cases} \end{equation} -Here $\circ$ represents the Hadamard (point-wise) matrix multiplication, ${P}^\top$ denotes the transpose of the matrix ${P}$, and $\mathbbm{1}$ is a column vector of ones. -The resulting vectors ${\alpha}_t$ and ${\beta}_t$ for each time step $t$ are then related to $\alpha_s(t)$ and $\beta_s(t)$ for some $s$ by: +Here $\circ$ represents the Hadamard (point-wise) matrix multiplication, $\pmb{P}^\top$ denotes the transpose of the matrix $\pmb{P}$, and $\mathbbm{1}$ is a column vector of ones, and $\pmb{\omega}_t$ is the coloumn vector that represents the label we are observing at time $t$ of matrix $\pmb{\omega}$. +The resulting vectors $\pmb{\alpha}_t$ and $\pmb{\beta}_t$ for each time step $t$ are then related to $\alpha_s(t)$ and $\beta_s(t)$ for some $s$ by: \begin{align} -{\alpha} +\pmb{\alpha} _t = \begin{bmatrix} \alpha_{s_0}(t) \\ \vdots \\ @@ -340,85 +312,105 @@ \subsection{Matrix Operations}\label{subsec:matrix-operations} \end{bmatrix} \end{align} +We introduce $\pmb{\alpha}_{t i}$ and $\pmb{\beta}_{t i}$ as the combined matrix for $\alpha$ and $\beta$ respectively, $\pmb{\alpha}_t$ and $\pmb{\beta}_t$. +\begin{equation} + \pmb{\alpha}_{t i} = + \begin{bmatrix} + \alpha_{s_0t_{0}} & \cdots & \alpha_{s_{|S|-1}t_{|\textbf{o}|-1}} \\ + \vdots & \ddots & \vdots \\ + \alpha_{s_0t_{0}} & \cdots & \alpha_{s_{|S|-1}t_{|\textbf{o}|-1}} + \end{bmatrix} + \text{ and } + \pmb{\beta}_{t i} = + \begin{bmatrix} + \beta_{s_0t_{0}} & \cdots & \beta_{s_{|S|-1}t_{|\textbf{o}|-1}} \\ + \vdots & \ddots & \vdots \\ + \beta_{s_0t_{0}} & \cdots & \beta_{s_{|S|-1}t_{|\textbf{o}|-1}} + \end{bmatrix} +\end{equation} $\gamma$ and $\xi$ can be expressed in terms of matrix operations as follows: \begin{equation} -{\gamma} - _t = (\sum_{i=1}^{|\mathbf{o}|-1} (\alpha_{t i} \;\beta_{t i}))^{-1} \cdot \alpha_t \; \circ \; \beta_t +\pmb{\gamma} + _t = (\sum_{i=1}^{|\mathbf{o}|-1} (\pmb{\alpha}_{t i} \;\pmb{\beta}_{t i}))^{-1} \cdot \pmb{\alpha}_t \; \circ \; \pmb{\beta}_t \label{eq:gamma-matrix} \end{equation} \begin{equation} -{\xi} - _t = ((\sum_{i=1}^{|\mathbf{o}|-1} (\alpha_{t i} \; \beta_{t i}))^{-1} \cdot \; {P}) \; \circ \;(\alpha_t \otimes (\beta_{t+1} \; \circ \; {\omega}_{t+1})) +\pmb{\xi} + _t = ((\sum_{i=1}^{|\mathbf{o}|-1} (\pmb{\alpha}_{t i} \; \pmb{\beta}_{t i}))^{-1} \cdot \; \pmb{P}) \; \circ \;(\pmb{\alpha}_t \otimes (\pmb{\beta}_{t+1} \; \circ \; \pmb{\omega}_{t+1})) \label{eq:xi-matrix} \end{equation} Here $\otimes$ represents the Kronecker (block) matrix multiplication, $\cdot$ denotes the scalar product and $^{-1}$ denotes the elementwise inverse of a matrix. -We can simplify $\sum_{i=1}^{|\mathbf{o}|-1} (\alpha_{t i} \beta_{t i})$ as, the sum does not depend on $t$: +We can simplify $\sum_{i=1}^{|\mathbf{o}|-1} (\pmb{\alpha}_{t i} \pmb{\beta}_{t i})$ as, the sum does not depend on $t$: \begin{align} - \sum_{i=1}^{|\mathbf{o}|-1} (\alpha_{t i} \; \beta_{t i}) &= \sum_{i=1}^{|\mathbf{o}|-1} \alpha_{|\mathbf{o}|-1 i} \\ - &= \mathbbm{1}^T \; \alpha_{|\mathbf{o}|-1} + \sum_{i=1}^{|\mathbf{o}|-1} (\pmb{\alpha}_{t i} \; \pmb{\beta}_{t i}) &= \sum_{i=1}^{|\mathbf{o}|-1} \pmb{\alpha}_{|\mathbf{o}|-1 i} \\ + &= \mathbbm{1}^T \; \pmb{\alpha}_{|\mathbf{o}|-1} \end{align} -Here $\mathbbm{1}^T$ is a row vector of ones, and $\alpha_{|\mathbf{o}|-1}$ is the last column of the matrix ${\alpha_t}_{T \;\in 0\dots|\mathbf{o}|-1}$. +Here $\mathbbm{1}^T$ is a row vector of ones, and $\pmb{\alpha}_{|\mathbf{o}|-1}$ is the last column of the matrix $\pmb{\alpha}_{ti}$. So we get: \begin{equation} -{\gamma} - _t = (\mathbbm{1}^T \; \alpha_{|\mathbf{o}|-1})^{-1} \cdot \alpha_t \; \circ \; \beta_t +\pmb{\gamma} + _t = (\mathbbm{1}^T \; \pmb{\alpha}_{|\mathbf{o}|-1})^{-1} \cdot \pmb{\alpha}_t \; \circ \; \pmb{\beta}_t \label{eq:gamma-matrix-ones} \end{equation} \begin{equation} -{\xi} - _t = ((\mathbbm{1}^T \; \alpha_{|\mathbf{o}|-1})^{-1} \cdot \; {P}) \; \circ \;(\alpha_t \otimes (\beta_{t+1} \; \circ \; {\omega}_{t+1})) +\pmb{\xi} + _t = ((\mathbbm{1}^T \; \pmb{\alpha}_{|\mathbf{o}|-1})^{-1} \cdot \; \pmb{P}) \; \circ \;(\pmb{\alpha}_t \otimes (\pmb{\beta}_{t+1} \; \circ \; \pmb{\omega}_{t+1})) \label{eq:xi-matrix-ones} \end{equation} -The resulting vectors ${\gamma}_t$ and ${\xi}_t$ for each time step $t$ are then related to $\gamma_s(t)$ and $\xi_{ss'}(t)$ for some $s, s'$ by: +The resulting vectors $\pmb{\gamma}_t$ and $\pmb{\xi}_t$ for each time step $t$ are then related to $\gamma_s(t)$ and $\xi_{ss'}(t)$ for some $s, s'$ by: \begin{align} -{\gamma} - _t = \begin{bmatrix} - \gamma_{s_0}(t) \\ - \vdots \\ - \gamma_{s_{|S|-1}}(t) \\ +\pmb{\gamma}_t = + \begin{bmatrix} + \gamma_{s_0}(t) \\ + \vdots \\ + \gamma_{s_{|S|-1}}(t) \\ \end{bmatrix}, \; - {\xi}_t = \begin{bmatrix} - \xi_{s_0 s_0}(t) & \cdots & \xi_{s_0 s_{|S|-1}}(t) \\ - \vdots & \ddots & \vdots \\ - \xi_{s_{|S|-1}s_0}(t) & \cdots & \xi_{s_{|S|-1}s_{|S|-1}}(t) \\ +\pmb{\xi}_t = + \begin{bmatrix} + \xi_{s_0 s_0}(t) & \cdots & \xi_{s_0 s_{|S|-1}}(t) \\ + \vdots & \ddots & \vdots \\ + \xi_{s_{|S|-1}s_0}(t) & \cdots & \xi_{s_{|S|-1}s_{|S|-1}}(t) \\ \end{bmatrix} \end{align} We can update the parameters with matrix operations as follows: \begin{equation} -{P} - = (\mathbbm{1} \oslash \gamma) \bullet \xi +\pmb{P} + = (\mathbbm{1} \oslash \pmb{\gamma}) \bullet \pmb{\xi} \label{eq:transition-probabilities-update} \end{equation} \begin{equation} -{\omega} - _s(o) = (\mathbbm{1} \oslash \gamma) \bullet (\sum_{t=1}^{|\mathbf{o}|-1} \gamma_t \otimes \mathbbm{1}_{yt}^{|\mathbf{o}|-1}) +\pmb{\omega} + _s(o) = (\mathbbm{1} \oslash \pmb{\gamma}) \bullet (\sum_{t=1}^{|\mathbf{o}|-1} \pmb{\gamma}_t \otimes \mathbbm{1}_{yt}^{|\mathbf{o}|-1}) \label{eq:omega-update} \end{equation} \begin{equation} -{\pi} - = {\gamma}_1 +\pmb{\pi} + = \pmb{\gamma}_1 \label{eq:initial-probabilities-update} \end{equation} Where $\oslash$ denotes Hadamard division (elementwise division) product and $\bullet$ denotes the Katri-Rao product (column-wise Kronecker product). -In the formulas above, $\mathbbm{1}$ denotes a column vector of ones, $\mathbbm{1}_{yt}$ denotes a row vector of ones, $\gamma$ and $\xi$ are the sum of the respective vectors over all time steps $t$: +In the formulas above, $\mathbbm{1}$ denotes a column vector of ones, $\mathbbm{1}_{yt}$ denotes a column vector that is $|\mathcal{L}|$ long, with all elements set to zero except for the element at index where $o_t = l$ which is set to one. + +$\gamma$ and $\xi$ are the sum of the respective vectors over all time steps $t$: \begin{align} - \gamma = \sum_{t=1}^{|\mathbf{o}|-1} \gamma_t, \; - \xi = \sum_{t=1}^{|\mathbf{o}|-1} \xi_t + \pmb{\gamma} = \sum_{t=1}^{|\mathbf{o}|-1} \pmb{\gamma}_t + \text{ and } + \pmb{\xi} = \sum_{t=1}^{|\mathbf{o}|-1} \pmb{\xi}_t \end{align} diff --git a/report/src/sections/03-implementation.tex b/report/src/sections/03-implementation.tex index d661380..759eb86 100644 --- a/report/src/sections/03-implementation.tex +++ b/report/src/sections/03-implementation.tex @@ -1,16 +1,51 @@ \section{Implementation}\label{sec:implementation} In this section, we will discuss the implementation of the project. -We will start by discussing the tools used in the implementation, followed by the transition from matrices to \glspl{add}. +We will start by discussing \glspl{add} and their advantages and disadvantages. +How we transition from vectors and matrices to \glspl{add} will be discussed. +We will then discuss the CUDD library and the Storm model checker. Finally, we will discuss the implementation of the matrix operations using \glspl{add}. + +\subsection{Algebraic Decision Diagrams}\label{subsec:algebraic-decision-diagrams} +\acrfullpl{add} generalize \glspl{bdd}~\cite{bahar1997algebric}. +Unlike \glspl{bdd} which are limited binary values (\{0,1\})~\cite{bryant1986graph}, \glspl{add} extend this framework by allowing terminal nodes to store arbitrary values, such as integers or real numbers. +This flexibility makes them well-suited for applications like probabilistic model checking, optimization, and symbolic matrix operations. + +In an \gls{add}, each path from the root to a terminal node represents a unique variable assignment, with terminal nodes storing the associated function values. +Their hierarchical structure supports compact, symbolic representations and efficient manipulation of large functions. + +Advantages of \glspl{add}: +\begin{itemize} +\item Compact Representation: \glspl{add} exploit redundancies in data, providing a compact representation for many functions. Shared subgraphs reduce memory usage, especially when storing large, structured data, such as matrices. +\item Canonical Form: For a fixed variable ordering, \glspl{add} are canonical. This property simplifies equivalence checking and ensures consistency in symbolic manipulations. +\item Efficient Operations: Operations such as addition and multiplication can be performed directly on \glspl{add}, often avoiding explicit enumeration of the underlying data. +\end{itemize} + + +Disadvantages of \glspl{add}: +\begin{itemize} +\item Variable Ordering Sensitivity: The size and efficiency of \glspl{add} depend heavily on the chosen variable ordering. A suboptimal ordering can lead to exponentially larger diagrams, reducing performance. +\item Overhead in Construction: Constructing an \gls{add} involves significant overhead compared to simpler data structures like arrays or hash tables, particularly for small datasets, where the benefits of compactness may not be realized. +\item Complexity for Certain Functions: While \glspl{add} work well for structured data, highly unstructured or random data can result in large, inefficient diagrams. In such cases, other data structures may be more suitable. +\end{itemize} + +\glspl{add} can symbolically encode row and column indices as binary variables, enabling efficient storage and manipulation of matrix data. +This approach is particularly advantageous for sparse or structured matrices, where patterns and redundancies can be exploited. + +In this work, \glspl{add} provide a foundation for symbolic matrix operations like addition, multiplication, and Kronecker product. Their compact representation and efficiency make them a powerful tool for handling complex, parameterized computations. + \subsection{Transition to ADDs}\label{subsec:transition-to-adds} The first step in the implementation is to transition from vectors and matrices to \glspl{add}. This conversion leverages the compact and efficient representation of \glspl{add} to perform operations symbolically. -To convert a vector into an \gls{add}, the vector must first be interpreted as a square matrix. -This step ensures compatibility with the \gls{add} representation, which organizes data hierarchically. -When a matrix is represented as an \gls{add}, the matrix also has to be square, as the \gls{add} representation requires a square matrix, if the matrix is not square, it has to be padded with zeros to make it square. +To convert a vector into an \gls{add}, we encode the indices as binary variables and the values as terminal nodes. +It is important to note that all the variables in the \gls{add} has to be consistent, that is the number of variables in the \gls{add} has to be the same for all the nodes in the \gls{add}. + +When a matrix is represented as an \gls{add}, the matrix has to be square, as the \gls{add} representation requires a square matrix, if the matrix is not square, it has to be padded with zeros to make it square. +This is because in our implementation, we use the CUDD library, which requires the matrices to be square to be represented as \glspl{add}. +\subsubsection{Vector to ADD} +We will illustrate the conversion from a vector to an \gls{add} using an example. Consider the following vector: \[ @@ -20,23 +55,13 @@ \subsection{Transition to ADDs}\label{subsec:transition-to-adds} \end{bmatrix} \] -This vector corresponds to a matrix of size $4 \times 4$. +In an \gls{add}, each layer corresponds to one binary variable (or bit) in the encoding of an index. +For a vector of size $n$, where $n = 2^k$, the binary representation of the column indices requires $k$ bits each. +In the case of the vector $V$, the vector has 4 elements, so it requires $4 = 2^2$ bits to represent the indices. -\[ - \begin{bmatrix} - 1 & 2 & 3 & 4 \\ - 0 & 0 & 0 & 0 \\ - 0 & 0 & 0 & 0 \\ - 0 & 0 & 0 & 0 \\ - \end{bmatrix} -\] -In an ADD, each layer corresponds to one binary variable (or bit) in the encoding of an index. -For a matrix of size $n \times n$, where $n = 2^k$, the binary representation of the row and column indices requires $k$ bits each. -By interleaving these bits (e.g., alternating between row and column bits), we construct a balanced and regular structure that preserves the matrix's two-dimensional nature. -In the case of the vector V, the vector has 4 elements, so it requires $4 = 2^2$ bits to represent the indices. - -The binary representation of the vector entries is shown in \autoref{tab:vector}, the rest of the matrix indices is filled with zeros. +We can create the binary representation of the indices by using the binary representation of the numbers from 0 to $k$. +The binary representation of the vector $V$ entries is shown in \autoref{tab:vector}. \begin{table} \centering \caption{Binary encoding of a vector V of size 4} @@ -45,37 +70,28 @@ \subsection{Transition to ADDs}\label{subsec:transition-to-adds} \toprule Vector Index & Value & Binary Encoding \\ \midrule - 1 & 1 & 0000 \\ - 2 & 2 & 0001 \\ - 3 & 3 & 0010 \\ - 4 & 4 & 0011 \\ + 0 & 1 & 00 \\ + 1 & 2 & 01 \\ + 2 & 3 & 10 \\ + 3 & 4 & 11 \\ \bottomrule \end{tabular} \end{table} -The \gls{add} representation of this vector is shown in \autoref{fig:add}. The binary encodings determine the structure of the decision diagram, where each entry in the vector is stored as a terminal node. The paths to these nodes are dictated by the binary representation of their indices. +The root node represents the first bit of the binary encoding, with subsequent layers corresponding to the remaining bits. -\begin{figure*} - \centering - \input{figures/vector_add_example} - \caption{Vector V represented as an ADD} - \label{fig:add} -\end{figure*} - -The conversion of a matrix to an \gls{add} is similar to that of a vector, but with an additional layer of nodes to represent the rows. -The \gls{add} can however be reduced as shown in \autoref{fig:add_reduced}. -This reduction is done by removing the duplicated terminal nodes, removing the redundant nodes and merging the nodes with the same children. -The techniques for reducing \glspl{add} is the standard reduction techniques used for \glspl{bdd}. -The reduction of the \gls{add} is done to reduce the size of the \gls{add} and to make the operations on the \gls{add} more efficient. -CUDD has built-in functions for reducing the \gls{add}, that follows the standard reduction techniques. - +The \gls{add} representation of the vector $V$ is shown in \autoref{fig:add_vector}. +For matrices represented as \glspl{add}, the structure is similar to vectors, but with two sets of binary variables for row and column indices. +The terminal nodes store the matrix entries, with paths determined by the binary representation of the row and column indices. +The \gls{add} variables are interleaved, with the row and column indices alternating in the path from the root to the terminal nodes. +This is the way the CUDD library represents matrices as \glspl{add}. \begin{figure} \centering \input{figures/vector_add_example_reduced} - \caption{Reduced ADD of matrix V} - \label{fig:add_reduced} + \caption{\gls{add} representation of a vector V of size 4} + \label{fig:add_vector} \end{figure} \subsection{CUDD}\label{subsec:cudd} @@ -87,36 +103,36 @@ \subsection{CUDD}\label{subsec:cudd} The CUDD library is implemented in C, ensuring high-performance execution, but it also ensures it can be used in C++ programs. \subsection{Storm}\label{subsec:storm} -Storm is a versatile, open-source probabilistic model checking tool designed to verify the correctness and properties of stochastic models~\cite{hensel2021probabilistic}. It supports a wide range of probabilistic models, including \glspl{hmm}, \glspl{mc} and \glspl{mdp}. Storm allows users to analyze models efficiently by computing various quantitative properties, such as probabilities, expected rewards, or long-run averages. -A key feature of Storm is its ability to represent models symbolically, leveraging data structures like \glspl{bdd} and \glspl{add}. These symbolic representations compactly encode the model's state space and transition structure, enabling efficient manipulation and analysis even for large-scale systems. Storm achieves this by interfacing with the CUDD library, mentioned earlier. +Storm is a versatile, open-source probabilistic model checking tool designed to verify the correctness and properties of stochastic models~\cite{hensel2021probabilistic}. +It supports a wide range of probabilistic models, including \glspl{mc} and \glspl{mdp}. -In our implementation, Storm serves as a parser for loading the input models. Specifically, we utilize Storm to convert the model into its \gls{add} representation. This \gls{add} representation provides a compact and hierarchical encoding of the underlying matrices, which can then be used to perform symbolic matrix operations using the CUDD library. +It does not include \glspl{hmm} in its supported models, but a \gls{hmm} can be enoded as an \gls{mc}~\cite{rabiner1989tutorial}. +Storm allows users to analyze models efficiently by computing various quantitative properties, such as probabilities, expected rewards, or long-run averages. -The reason for using Storm lies in it is open-source, which makes it easy to integrate into our project. Storm is designed to handle large and complex models efficiently for model checking. -Therefore the next step in Storm is to calculate the parameters of interest, such as transition probabilities, rewards, or other metrics derived from the model. By performing these computations symbolically within the ADD framework, we achieve a scalable and efficient approach to analyzing stochastic models. +The main purpose of this project is to provide data-driven modeling tools for formal verification. +The Storm model checker is widely recognized as a state-of-the-art verification tool for probabilistic models, known for its efficiency and scalability. +For this reason, in this project we have chosen integrate our symbolic implementation of the Baum Welch algorithm within Stom. +Storm uses the CUDD library for the symbolic representation of models, which makes it a natural choice for our implementation. +This translate into working directly with the symbolic representation of the models provided by the parsers available in Storm that, in principle could be direcly used (off the shelf) by Storm for verification purposes. \subsection{Matrix operations using ADDs}\label{subsec:matrix-operations-using-adds} The matrix operations are implemented using \glspl{add}. The matrix operations implemented are matrix transpose, matrix addition, matrix multiplication, Hadamard product, Hadamard division, Kronecker product and Khatri-Rao product. +The matrices: \[ A = \begin{bmatrix} 1 & 2 \\ 3 & 4 \\ \end{bmatrix} -\] - -and - -\[ +\text{ and } B = \begin{bmatrix} 5 & 6 \\ 7 & 8 \\ \end{bmatrix} \] - are used as examples in the following subsections. In CUDD the \textit{DdManager} is used to manage the \glspl{add} and the operations on them. @@ -167,11 +183,11 @@ \subsubsection{Matrix multiplication} \end{bmatrix} \] -%\textit{Cudd\_addMatrixMultiply(DdManager dd, DdNode A, DdNode B, DdNode **z, int nz)} We use the function \textit{Cudd\_addMatrixMultiply(DdManager dd, DdNode A, DdNode B, DdNode **z, int nz)} in CUDD to multiply two \glspl{add}. The function takes two \glspl{add} \textit{A} and \textit{B} to be multiplied as input and returns a pointer to the resulting \gls{add}. \textit{z} is the set of variables that are dependent on the columns in \textit{A} and the rows in \textit{B}. \textit{A} is assumed to have the same number of columns as \textit{B} has rows, namely \textit{nz}. +In the implementation of the matrix multiplication, we needed to rename the variables in the \glspl{add} to do the matrix multiplication, namely \textit{A}s columns variables and \textit{B}s row variables, as CUDD requires the variables in the \glspl{add} to have the same name. \subsubsection{Hadamard product}\label{subsubsec:hadamard-product} The Hadamard product is implemented by pairwise multiplication of corresponding terminal nodes in the two \glspl{add}. For each matching row-column index pair $(i, j)$: @@ -181,7 +197,6 @@ \subsubsection{Hadamard product}\label{subsubsec:hadamard-product} \end{enumerate} The structure of the indices remains unchanged. The Hadamard product of matrices $A$ and $B$ is: - \[ A \circ B = \begin{bmatrix} 5 & 12 \\ @@ -193,7 +208,7 @@ \subsubsection{Hadamard product}\label{subsubsec:hadamard-product} \subsubsection{Hadamard division} The Hadamard division is implemented as Hadamard product, but with division instead of multiplication. See~\autoref{subsubsec:hadamard-product} for more details. -The Hadamard division of matrices $A$ and $B$ is +The Hadamard division of matrices $A$ and $B$ is: \[ A \oslash B = \begin{bmatrix} @@ -202,18 +217,24 @@ \subsubsection{Hadamard division} \end{bmatrix} \] -The Hadamard division is implemented by \textit{Cudd\_addApply(DdManager * dd, Cudd\_addDivide(), DdNode * f, DdNode * g)}, where \textit{f} and \textit{g} are the two \glspl{add} to be divided and \textit{Cudd\_addDivide()} is the function that is used to divide the two \glspl{add} elementwise. +The Hadamard division is implemented in CUDD by \textit{Cudd\_addApply(DdManager * dd, Cudd\_addDivide(), DdNode * f, DdNode * g)}, where \textit{f} and \textit{g} are the two \glspl{add} to be divided and \textit{Cudd\_addDivide()} is the function that is used to divide the two \glspl{add} elementwise. \subsubsection{Kronecker product} -The Kronecker product is implemented by expanding the indices and terminal nodes of the two matrices symbolically, with the resulting \gls{add} having the dimensions of the sum of the dimensions of the two matrices. -The Kronecker product is a generalization of the outer product, where each element of the first matrix is multiplied by the second matrix as a whole. -For each entry $(i, j)$ in the first matrix with value $a$, the second matrix $B$ is multiplied by $a$, and the indices are adjusted: +The Kronecker product is implemented by symbolically expanding the indices and terminal nodes of the two matrices. +This operation results in a new \gls{add} with dimensions equal to the sum of the dimensions of the input matrices. +The Kronecker product generalizes the outer product, multiplying each element of the first matrix by the entire second matrix. + +For a specific entry in the first matrix $A$ at row $i$ and column $j$, with value $a_{ij}$, the second matrix $B$ is scaled by $a_{ij}$, and its indices are adjusted as follows: +and its indices are adjusted as follows: \begin{enumerate} - \item The row and column indices of $B$ are shifted based on $i$ and $j$ of $A$. - \item The resulting values are stored in a new \gls{add}. +\item Multiply the values of $B$ by $a_{ij}$. +\item The row indices of $B$ are shifted by $i \times \text{rows}(B)$ in the new \gls{add}. +\item The column indices of $B$ are shifted by $j \times \text{columns}(B)$ in the new \gls{add}. +\item The scaled values are stored in the new \gls{add}. \end{enumerate} -The Kronecker product of matrices $A$ and $B$ is +This operation is not natively supported in the CUDD library, so we implemented it explicitly as a custom function. +The Kronecker product of matrices $A$ and $B$ is: \[ A \otimes B = \begin{bmatrix} 5 & 6 & 10 & 12 \\ @@ -229,8 +250,8 @@ \subsubsection{Khatri-Rao product} \item The elements of row $i$ in the first matrix are multiplied element-wise with the entire row $i$ in the second matrix. \item The resulting row is constructed symbolically within the \gls{add}. \end{enumerate} -The Khatri-Rao product of matrices $A$ and $B$ is - +This operation is not natively supported in the CUDD library, so we implemented it explicitly as a custom function. +The Khatri-Rao product of matrices $A$ and $B$ is: \[ A \bullet B = \begin{bmatrix} 5 & 6 & 10 & 12 \\ @@ -243,37 +264,9 @@ \subsubsection{Khatri-Rao product} % Matrix scalar is implemented by multiplying each element in the matrix by a scalar value. % The scaling of matrix $A$ by a factor of 2 is % \[ -% 2 \times A = 2 \times \begin{bmatrix} -% 1 & 2 \\ -% 3 & 4 \\ -% \end{bmatrix} = \begin{bmatrix} +% 2 \times A = \begin{bmatrix} % 2 & 4 \\ % 6 & 8 \\ % \end{bmatrix} % \] -% The \gls{add} representation of the scaled matrix is shown in Figure \ref{fig:add_scaling}. -% \begin{figure} -% \centering -% \begin{tikzpicture}[ -% level 1/.style={sibling distance=20mm}, -% level 2/.style={sibling distance=10mm}, -% level 3/.style={sibling distance=10mm} -% ] -% \node[c] {$x_1$} -% child{ node[c] {$y_1$} edge from parent[zeroarrow] -% child{ node[t] {2} -% } -% child{ node [t] {4} edge from parent[onearrow] -% } -% } -% child{ node[c] {$y_1$} edge from parent[onearrow] -% child{ node[t] {6} edge from parent[zeroarrow] -% } -% child{ node[t] {8} edge from parent[onearrow]} -% } -% ; -% \end{tikzpicture} -% \caption{Scaling of matrix A by a factor of 2} -% \label{fig:add_scaling} -% \end{figure}