From: François Fleuret
Date: Thu, 25 Apr 2024 07:01:36 +0000 (+0200)
Subject: Update.
X-Git-Url: https://www.fleuret.org/cgi-bin/gitweb/gitweb.cgi?p=tex.git;a=commitdiff_plain;h=HEAD;hp=4b8c58903baa9ff8c508bda798492e10dde9cb7f

Update.
---

diff --git a/dlscore.tex b/dlscore.tex
new file mode 100644
index 0000000..6fd06ac
--- /dev/null
+++ b/dlscore.tex
@@ -0,0 +1,165 @@
+%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*-
+
+\documentclass[11pt,a4paper,twocolumn,twoside]{article}
+\usepackage[a4paper,top=2cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry}
+\usepackage[utf8]{inputenc}
+\usepackage{cmbright}
+
+\begin{document}
+
+\noindent One point per item if you know precisely the meaning of the
+listed word(s).
+
+\section{Machine Learning}
+
+\begin{enumerate}
+
+  \item VC dimension
+  \item over-fitting, under-fitting
+  \item logistic regression
+  \item Q-value
+  \item kernel trick
+  \item boosting
+  \item PCA
+  \item feature design
+  \item linear regression
+  \item expectation-maximization, GMM
+  \item SVM
+  \item Bellman equation
+  \item decision tree
+  \item train/validation/test sets
+  \item naive Bayesian model
+  \item autoregressive model
+  \item bias-variance dilemma
+  \item policy gradient
+  \item random forest
+  \item k-NN
+  \item perceptron algorithm
+
+\end{enumerate}
+
+
+\section{Deep-Learning}
+
+\begin{enumerate}
+
+  \item Adam
+  \item softmax
+  \item residual connections
+  \item autograd
+  \item ReLU
+  \item dropout
+  \item CLIP
+  \item Xavier's initialization
+  \item vanishing gradient
+  \item LeNet
+  \item ViT
+  \item transposed convolution layer
+  \item checkpoint (during the forward pass)
+  \item minibatch
+  \item masked model
+  \item supervised / unsupervised
+  \item data augmentation
+  \item attention block
+  \item SGD
+  \item batchnorm
+  \item gradient clipping
+  \item tokenizer
+  \item VAE
+  \item weight decay
+  \item GELU
+  \item LSTM, GRU
+  \item GAN
+  \item ResNet
+  \item straight-through estimator
+  \item convolution layer
+  \item pre-training / fine-tuning
+  \item perplexity
+  \item logits
+  \item CLS token
+  \item forward pass
+  \item Transformer (original one), GPT
+  \item backward pass
+  \item autoencoder, denoising autoencoder
+  \item layer norm
+  \item GNN
+  \item learning rate schedule
+  \item diffusion model
+  \item cross-entropy
+  \item max pooling, average pooling
+  \item RNN
+  \item contrastive loss
+  \item positional encoding
+  \item causal model
+  \item attention layer
+  \item SSL
+  \item MSE
+  \item tensor
+
+\end{enumerate}
+
+\section{Math}
+
+\begin{enumerate}
+
+  \item Hessian
+  \item random variable
+  \item matrix
+  \item entropy, mutual information
+  \item dot product
+  \item mean, variance
+  \item L2 norm
+  \item chain rule (differentiation)
+  \item Fourier transform
+  \item continuity, Lipschitz continuity
+  \item chain rule (probability)
+  \item polynomial
+  \item Cantor's diagonal argument
+  \item Jacobian
+  \item linear operator
+  \item gradient
+  \item Bayes' theorem
+  \item vector
+  \item joint law, product law
+  \item Gaussian distribution
+  \item distribution
+  \item determinant, rank
+  \item eigen-decomposition, SVD
+  \item maximum likelihood
+  \item Central Limit Theorem
+
+\end{enumerate}
+
+\section{Computer Science}
+
+\begin{enumerate}
+
+  \item polymorphism
+  \item recursion
+  \item value passed by reference
+  \item binary search
+  \item quick sort
+  \item parallel scan
+  \item mutability
+  \item Turing machine
+  \item FP32
+  \item iterator
+  \item interpreter, compiler
+  \item anonymous function
+  \item set
+  \item binary heap
+  \item mutex
+  \item cache memory
+  \item scope of a variable or function
+  \item dynamic programming
+  \item hash table
+  \item big-O notation
+  \item Turing complete
+  \item class inheritance
+  \item closure
+  \item loop unrolling
+  \item complexity
+
+\end{enumerate}
+
+\end{document}
diff --git a/elbo.tex b/elbo.tex
index 4c6cb24..563ec3c 100644
--- a/elbo.tex
+++ b/elbo.tex
@@ -148,4 +148,20 @@
 $\theta$ and $\alpha$ to maximize it maximizes $\log \, p_\theta(x_n)$
 and brings $q_\alpha(z \mid x_n)$ close to $p_\theta(z \mid x_n)$.
 
+\medskip
+
+A point that may be important in practice is
+%
+\begin{align*}
+ & \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right] \\
+ & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n \mid Z) p_\theta(Z)}{q_\alpha(Z \mid x_n)} \right] \\
+ & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \, p_\theta(x_n \mid Z) \right] \\
+ & \hspace*{7em} - \dkl(q_\alpha(z \mid x_n) \, \| \, p_\theta(z)).
+\end{align*}
+%
+This form is useful because for certain choices of $p_\theta$ and
+$q_\alpha$, for instance when both are Gaussian, the KL term can be
+computed exactly instead of estimated through sampling, which removes
+one source of noise from the optimization process.
+
 \end{document}
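
A concrete instance of the closed-form KL invoked in the last paragraph of the elbo.tex addition, as a sketch under the usual VAE assumptions; the patch does not fix a particular parametrization, and the notation $\mu_n$, $\sigma_{n,d}$, $D$ is introduced here only for illustration. If $q_\alpha(z \mid x_n) = \mathcal{N}(\mu_n, \mathrm{diag}(\sigma_n^2))$ with $z \in \mathbb{R}^D$ and the prior is $p_\theta(z) = \mathcal{N}(0, I)$, then

\begin{align*}
\dkl(q_\alpha(z \mid x_n) \, \| \, p_\theta(z))
 = \frac{1}{2} \sum_{d=1}^{D} \left( \mu_{n,d}^2 + \sigma_{n,d}^2 - \log \sigma_{n,d}^2 - 1 \right),
\end{align*}

so this term and its gradient can be evaluated exactly, and sampling is needed only for the reconstruction term $\expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \, p_\theta(x_n \mid Z) \right]$.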