From: François Fleuret
Date: Thu, 25 Apr 2024 07:01:36 +0000 (+0200)
Subject: Update.
X-Git-Url: https://www.fleuret.org/cgi-bin/gitweb/gitweb.cgi?p=tex.git;a=commitdiff_plain;h=HEAD;hp=4b8c58903baa9ff8c508bda798492e10dde9cb7f

Update.
---

diff --git a/dlscore.tex b/dlscore.tex
new file mode 100644
index 0000000..6fd06ac
--- /dev/null
+++ b/dlscore.tex
@@ -0,0 +1,165 @@
+%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*-
+
+\documentclass[11pt,a4paper,twocolumn,twoside]{article}
+\usepackage[a4paper,top=2cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry}
+\usepackage[utf8]{inputenc}
+\usepackage{cmbright}
+
+\begin{document}
+
+\noindent One point per item if you know precisely the meaning of the
+listed word(s).
+
+\section{Machine Learning}
+
+\begin{enumerate}
+
+  \item VC dimension
+  \item over-fitting, under-fitting
+  \item logistic regression
+  \item Q-value
+  \item kernel trick
+  \item boosting
+  \item PCA
+  \item feature design
+  \item linear regression
+  \item expectation-maximization, GMM
+  \item SVM
+  \item Bellman equation
+  \item decision tree
+  \item train/validation/test sets
+  \item naive Bayesian model
+  \item autoregressive model
+  \item bias-variance dilemma
+  \item policy gradient
+  \item random forest
+  \item k-NN
+  \item perceptron algorithm
+
+\end{enumerate}
+
+
+\section{Deep-Learning}
+
+\begin{enumerate}
+
+  \item Adam
+  \item softmax
+  \item residual connections
+  \item autograd
+  \item ReLU
+  \item dropout
+  \item CLIP
+  \item Xavier's initialization
+  \item vanishing gradient
+  \item LeNet
+  \item ViT
+  \item transposed convolution layer
+  \item checkpoint (during the forward pass)
+  \item minibatch
+  \item masked model
+  \item supervised / unsupervised
+  \item data augmentation
+  \item attention block
+  \item SGD
+  \item batchnorm
+  \item gradient clipping
+  \item tokenizer
+  \item VAE
+  \item weight decay
+  \item GELU
+  \item LSTM, GRU
+  \item GAN
+  \item ResNet
+  \item straight-through estimator
+  \item convolution layer
+  \item pre-training / fine-tuning
+  \item perplexity
+  \item logits
+  \item CLS token
+  \item forward pass
+  \item Transformer (original one), GPT
+  \item backward pass
+  \item autoencoder, denoising autoencoder
+  \item layer norm
+  \item GNN
+  \item learning rate schedule
+  \item diffusion model
+  \item cross-entropy
+  \item max pooling, average pooling
+  \item RNN
+  \item contrastive loss
+  \item positional encoding
+  \item causal model
+  \item attention layer
+  \item SSL
+  \item MSE
+  \item tensor
+
+\end{enumerate}
+
+\section{Math}
+
+\begin{enumerate}
+
+  \item Hessian
+  \item random variable
+  \item matrix
+  \item entropy, mutual information
+  \item dot product
+  \item mean, variance
+  \item L2 norm
+  \item chain rule (differentiation)
+  \item Fourier transform
+  \item continuity, Lipschitz continuity
+  \item chain rule (probability)
+  \item polynomial
+  \item Cantor's diagonal argument
+  \item Jacobian
+  \item linear operator
+  \item gradient
+  \item Bayes' theorem
+  \item vector
+  \item joint law, product law
+  \item Gaussian distribution
+  \item distribution
+  \item determinant, rank
+  \item eigen-decomposition, SVD
+  \item maximum likelihood
+  \item Central Limit Theorem
+
+\end{enumerate}
+
+\section{Computer Science}
+
+\begin{enumerate}
+
+  \item polymorphism
+  \item recursion
+  \item value passed by reference
+  \item binary search
+  \item quick sort
+  \item parallel scan
+  \item mutability
+  \item Turing machine
+  \item FP32
+  \item iterator
+  \item interpreter, compiler
+  \item anonymous function
+  \item set
+  \item binary heap
+  \item mutex
+  \item cache memory
+  \item scope of a variable or function
+  \item dynamic programming
+  \item hash table
+  \item big-O notation
+  \item Turing complete
+  \item class inheritance
+  \item closure
+  \item loop unrolling
+  \item complexity
+
+\end{enumerate}
+
+\end{document}
diff --git a/elbo.tex b/elbo.tex
index 4c6cb24..563ec3c 100644
--- a/elbo.tex
+++ b/elbo.tex
@@ -148,4 +148,20 @@
 $\theta$ and $\alpha$ to maximize it maximizes $\log \, p_\theta(x_n)$
 and brings $q_\alpha(z \mid x_n)$ close to $p_\theta(z \mid x_n)$.
 
+\medskip
+
+A point that may be important in practice is
+%
+\begin{align*}
+ & \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right] \\
+ & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n \mid Z) p_\theta(Z)}{q_\alpha(Z \mid x_n)} \right] \\
+ & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \, p_\theta(x_n \mid Z) \right] \\
+ & \hspace*{7em} - \dkl(q_\alpha(z \mid x_n) \, \| \, p_\theta(z)).
+\end{align*}
+%
+This form is useful because for certain choices of $p_\theta$ and
+$q_\alpha$, for instance when both are Gaussian, the KL term can be
+computed exactly instead of estimated through sampling, which removes
+one source of noise from the optimization process.
+
 \end{document}
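
A concrete instance of the closed-form KL invoked in the last paragraph of the elbo.tex addition, as a sketch under the usual VAE assumptions; the patch does not fix a particular parametrization, and the notation $\mu_n$, $\sigma_{n,d}$, $D$ is introduced here only for illustration. If $q_\alpha(z \mid x_n) = \mathcal{N}(\mu_n, \mathrm{diag}(\sigma_n^2))$ with $z \in \mathbb{R}^D$ and the prior is $p_\theta(z) = \mathcal{N}(0, I)$, then

\begin{align*}
\dkl(q_\alpha(z \mid x_n) \, \| \, p_\theta(z))
 = \frac{1}{2} \sum_{d=1}^{D} \left( \mu_{n,d}^2 + \sigma_{n,d}^2 - \log \sigma_{n,d}^2 - 1 \right),
\end{align*}

so this term and its gradient can be evaluated exactly, and sampling is needed only for the reconstruction term $\expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \, p_\theta(x_n \mid Z) \right]$.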