Update.

[tex.git] / elbo.tex
diff --git a/elbo.tex b/elbo.tex

index 6875ddf..fe91565 100644 (file)
--- a/elbo.tex
+++ b/elbo.tex
@@ -71,20 +71,27 @@
  
  \begin{document}
  
-\vspace*{0ex}
+\setlength{\abovedisplayskip}{2ex}
+\setlength{\belowdisplayskip}{2ex}
+\setlength{\abovedisplayshortskip}{2ex}
+\setlength{\belowdisplayshortskip}{2ex}
+
+\vspace*{-4ex}
  
  \begin{center}
  {\Large The Evidence Lower Bound}
  
+\vspace*{1ex}
+
  Fran\c cois Fleuret
  
  \today
  
-\vspace*{1ex}
+\vspace*{-1ex}
  
  \end{center}
  
-Given a training set $x_1, \dots, x_N$ that follows an unknown
+Given i.i.d.\ training samples $x_1, \dots, x_N$ that follow an unknown
  distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it,
  maximizing
  %
@@ -92,7 +99,7 @@ maximizing
  \sum_n \log \, p_\theta(x_n).
  \]
  %
-If we do not have a analytical form of the marginal $p_\theta(x_n)$
+If we do not have an analytical form of the marginal $p_\theta(x_n)$
  but only the expression of $p_\theta(x_n,z)$, we can get an estimate
  of the marginal by sampling $z$ with any distribution $q$
  %
@@ -102,12 +109,14 @@ p_\theta(x_n) & = \int_z p_\theta(x_n,z) dz                   \\
                & = \expect_{Z \sim q(z)} \left[\frac{p_\theta(x_n,Z)}{q(Z)}\right].
  \end{align*}
  %
-So if we wanted to maximize $p_\theta(x_n)$ alone, we could sample a
+So if we sample a
  $Z$ with $q$ and maximize
  %
  \begin{equation*}
-\frac{p_\theta(x_n,Z)}{q(Z)}.\label{eq:estimator}
+\frac{p_\theta(x_n,Z)}{q(Z)},
  \end{equation*}
+%
+we do maximize $p_\theta(x_n)$ on average.
  
  But we want to maximize $\sum_n \log \, p_\theta(x_n)$. If we use the
  $\log$ of the previous expression, we can decompose its average value