Update.
authorFrançois Fleuret <francois@fleuret.org>
Sat, 24 Feb 2024 08:06:51 +0000 (09:06 +0100)
committerFrançois Fleuret <francois@fleuret.org>
Sat, 24 Feb 2024 08:06:51 +0000 (09:06 +0100)
elbo.tex [new file with mode: 0644]

diff --git a/elbo.tex b/elbo.tex
new file mode 100644 (file)
index 0000000..6875ddf
--- /dev/null
+++ b/elbo.tex
@@ -0,0 +1,140 @@
+%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*-
+
+%% Any copyright is dedicated to the Public Domain.
+%% https://creativecommons.org/publicdomain/zero/1.0/
+%% Written by Francois Fleuret <francois@fleuret.org>
+
+\documentclass[11pt,a4paper,oneside]{article}
+\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry}
+%\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry}
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{amsmath,amssymb,dsfont}
+\usepackage[pdftex]{graphicx}
+\usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue]{hyperref}
+\usepackage{tikz}
+\usetikzlibrary{arrows,arrows.meta,calc}
+\usetikzlibrary{patterns,backgrounds}
+\usetikzlibrary{positioning,fit}
+\usetikzlibrary{shapes.geometric,shapes.multipart}
+\usetikzlibrary{patterns.meta,decorations.pathreplacing,calligraphy}
+\usetikzlibrary{tikzmark}
+\usetikzlibrary{decorations.pathmorphing}
+\usepackage[round]{natbib}
+\usepackage[osf]{libertine}
+\usepackage{microtype}
+\usepackage{fancyvrb}
+
+\usepackage{mleftright}
+
+\newcommand{\setmuskip}[2]{#1=#2\relax}
+\setmuskip{\thinmuskip}{1.5mu} % by default it is equal to 3 mu
+\setmuskip{\medmuskip}{2mu} % by default it is equal to 4 mu
+\setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu
+
+\setlength{\parindent}{0cm}
+\setlength{\parskip}{1ex}
+%\renewcommand{\baselinestretch}{1.3}
+%\setlength{\tabcolsep}{0pt}
+%\renewcommand{\arraystretch}{1.0}
+
+\def\argmax{\operatornamewithlimits{argmax}}
+\def\argmin{\operatornamewithlimits{argmin}}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\def\given{\,\middle\vert\,}
+\def\proba{\operatorname{P}}
+\newcommand{\seq}{{S}}
+\newcommand{\expect}{\mathds{E}}
+\newcommand{\variance}{\mathds{V}}
+\newcommand{\empexpect}{\hat{\mathds{E}}}
+\newcommand{\mutinf}{\mathds{I}}
+\newcommand{\empmutinf}{\hat{\mathds{I}}}
+\newcommand{\entropy}{\mathds{H}}
+\newcommand{\empentropy}{\hat{\mathds{H}}}
+\newcommand{\ganG}{\mathbf{G}}
+\newcommand{\ganD}{\mathbf{D}}
+\newcommand{\ganF}{\mathbf{F}}
+
+\newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}}
+\newcommand{\djs}{\mathds{D}_{\mathsf{JS}}}
+
+\allowdisplaybreaks[2]
+
+\newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}}
+\newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}}
+
+\def\positionalencoding{\operatorname{pos-enc}}
+\def\concat{\operatorname{concat}}
+\def\crossentropy{\mathcal{L}_{\operatorname{ce}}}
+
+\begin{document}
+
+\vspace*{0ex}
+
+\begin{center}
+{\Large The Evidence Lower Bound}
+
+Fran\c cois Fleuret
+
+\today
+
+\vspace*{1ex}
+
+\end{center}
+
+Given a training set $x_1, \dots, x_N$ that follows an unknown
+distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it,
+maximizing
+%
+\[
+\sum_n \log \, p_\theta(x_n).
+\]
+%
+If we do not have an analytical form of the marginal $p_\theta(x_n)$
+but only the expression of $p_\theta(x_n,z)$, we can get an estimate
+of the marginal by sampling $z$ with any distribution $q$
+%
+\begin{align*}
+p_\theta(x_n) & = \int_z p_\theta(x_n,z) dz                   \\
+              & = \int_z \frac{p_\theta(x_n,z)}{q(z)} q(z) dz \\
+              & = \expect_{Z \sim q(z)} \left[\frac{p_\theta(x_n,Z)}{q(Z)}\right].
+\end{align*}
+%
+So if we wanted to maximize $p_\theta(x_n)$ alone, we could sample a
+$Z$ with $q$ and maximize
+%
+\begin{equation*}
+\frac{p_\theta(x_n,Z)}{q(Z)}.\label{eq:estimator}
+\end{equation*}
+
+But we want to maximize $\sum_n \log \, p_\theta(x_n)$. If we use the
+$\log$ of the previous expression, we can decompose its average value
+as
+\begin{align*}
+ & \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(x_n,Z)}{q(Z)} \right]                                \\
+ & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n) \, p_\theta(x_n)}{q(Z)} \right]        \\
+ & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n)}{q(Z)} \right] + \log \, p_\theta(x_n) \\
+ & = - \dkl(q(z) \, \| \, p_\theta(z \mid x_n)) + \log \, p_\theta(x_n).
+\end{align*}
+%
+Hence this does not maximize $\log \, p_\theta(x_n)$ on average, but a
+\emph{lower bound} of it, since the KL divergence is non-negative. And
+since this maximization pushes that KL term down, it also aligns
+$p_\theta(z \mid x_n)$ and $q(z)$, and we may get a worse
+$p_\theta(x_n)$ to bring $p_\theta(z \mid x_n)$ closer to $q(z)$.
+
+However, all this analysis is still valid if $q$ is a parameterized
+function $q_\alpha(z \mid x_n)$ of $x_n$. In that case, if we optimize
+$\theta$ and $\alpha$ to maximize
+%
+\[
+\expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right],
+\]
+%
+it both pushes $\log \, p_\theta(x_n)$ up and brings $q_\alpha(z \mid
+x_n)$ close to $p_\theta(z \mid x_n)$, which tightens the bound.
+
+
+\end{document}