Update.
author François Fleuret <francois@fleuret.org>
Sat, 2 Mar 2024 00:04:42 +0000 (01:04 +0100)
committer François Fleuret <francois@fleuret.org>
Sat, 2 Mar 2024 00:04:42 +0000 (01:04 +0100)
elbo.tex

index 4c6cb24..563ec3c 100644 (file)
--- a/elbo.tex
+++ b/elbo.tex
@@ -148,4 +148,20 @@ $\theta$ and $\alpha$ to maximize
 it maximizes $\log \, p_\theta(x_n)$ and brings $q_\alpha(z \mid
 x_n)$ close to $p_\theta(z \mid x_n)$.
 
+\medskip
+
+A point that may be important in practice is the following decomposition:
+%
+\begin{align*}
+ & \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right]                      \\
+ & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n \mid Z) p_\theta(Z)}{q_\alpha(Z \mid x_n)} \right] \\
+ & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \, p_\theta(x_n \mid Z) \right]                                            \\
+ & \hspace*{7em} - \dkl(q_\alpha(z \mid x_n) \, \| \, p_\theta(z)).
+\end{align*}
+%
+This form is useful because for certain $p_\theta(z)$ and
+$q_\alpha(z \mid x_n)$, for instance if they are both Gaussian, the KL
+term can be computed exactly instead of being estimated through
+sampling, which removes one source of noise in the optimization
+process.
+
 \end{document}