From cf0fd332cb70bf1a2a793ce658770da5ca702db9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Fri, 19 Jan 2024 20:36:38 +0100 Subject: [PATCH 01/16] Update. --- inftheory.tex | 97 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/inftheory.tex b/inftheory.tex index 1933ff4..7c34fce 100644 --- a/inftheory.tex +++ b/inftheory.tex @@ -36,13 +36,42 @@ \def\argmax{\operatornamewithlimits{argmax}} \def\argmin{\operatornamewithlimits{argmin}} -\def\expect{\mathds{E}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\def\given{\,\middle\vert\,} +\def\proba{\operatorname{P}} +\newcommand{\seq}{{S}} +\newcommand{\expect}{\mathds{E}} +\newcommand{\variance}{\mathds{V}} +\newcommand{\empexpect}{\hat{\mathds{E}}} +\newcommand{\mutinf}{\mathds{I}} +\newcommand{\empmutinf}{\hat{\mathds{I}}} +\newcommand{\entropy}{\mathds{H}} +\newcommand{\empentropy}{\hat{\mathds{H}}} +\newcommand{\ganG}{\mathbf{G}} +\newcommand{\ganD}{\mathbf{D}} +\newcommand{\ganF}{\mathbf{F}} + +\newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}} +\newcommand{\djs}{\mathds{D}_{\mathsf{JS}}} + +\newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}} +\newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}} + +\def\positionalencoding{\operatorname{pos-enc}} +\def\concat{\operatorname{concat}} +\def\crossentropy{\LL_{\operatorname{ce}}} + \begin{document} +\vspace*{1ex} + +\begin{center} +{\Large Some bits of Information Theory} + \today +\end{center} Information Theory is awesome so here is a TL;DR about Shannon's entropy. @@ -66,8 +95,8 @@ To transmit that stream, for instance with bits over a communication line, you can design a coding that takes into account that the symbols are not all as probable, and decode on the other side. -For instance if $P('\!\!A')=1/2$, $P('\!\!B')=1/4$, and -$P('\!\!C')=1/4$ you would transmit ``0'' for a ``A'' and ``10'' for a +For instance if $\proba('\!\!A')=1/2$, $\proba('\!\!B')=1/4$, and +$\proba('\!\!C')=1/4$ you would transmit ``0'' for a ``A'' and ``10'' for a ``B'' and ``11'' for a ``C'', 1.5 bits on average. If the symbol is always the same, you transmit nothing, if they are @@ -79,7 +108,7 @@ to emit on average per symbol to transmit that stream. It has a simple analytical form: % \[ - H(p) = - \sum_k p(k) \log_2 p(k) + \entropy(p) = - \sum_k p(k) \log_2 p(k) \] % where by convention $0 \log_2 0 = 0$. @@ -92,30 +121,30 @@ Entropy bound only for some distributions. A more sophisticated scheme called "Arithmetic coding" does it always. From this perspective, many quantities have an intuitive -value. Consider for instance sending pairs of symbols (X, Y). +value. Consider for instance sending pairs of symbols $(X, Y)$. If these two symbols are independent, you cannot do better than sending one and the other separately, hence % \[ -H(X, H) = H(X) + H(Y). +\entropy(X, Y) = \entropy(X) + \entropy(Y). \] However, imagine that the second symbol is a function of the first -Y=f(X). You just have to send X since Y can be computed from it on the +Y=f(X). You just have to send $X$ since $Y$ can be computed from it on the other side. Hence in that case % \[ -H(X, Y) = H(X). +\entropy(X, Y) = \entropy(X). \] An associated quantity is the mutual information between two random variables, defined with % \[ -I(X;Y) = H(X) + H(Y) - H(X,Y), +\mutinf(X;Y) = \entropy(X) + \entropy(Y) - \entropy(X,Y), \] % that quantifies the amount of information shared by the two variables. 
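+
+As a quick sanity check of these formulas, the three-symbol source
+above, with probabilities $1/2$, $1/4$, and $1/4$, has entropy
+%
+\[
+\entropy(p) = \tfrac{1}{2} \log_2 2 + 2 \cdot \tfrac{1}{4} \log_2 4 = 1.5 \text{ bits},
+\]
+%
+which is exactly the average length of the code ``0'', ``10'', ``11''
+given above. And plugging the two extreme cases into the definition
+of the mutual information: if $X$ and $Y$ are independent then
+$\mutinf(X;Y) = 0$, and if $Y=f(X)$ then $\mutinf(X;Y) = \entropy(Y)$,
+since in that case $\entropy(X,Y) = \entropy(X)$.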
@@ -125,80 +154,80 @@ that quantifies the amount of information shared by the two variables. Conditional entropy is the average of the entropy of the conditional distribution: % \begin{align*} -&H(X \mid Y)\\ - &= \sum_y p(Y=y) H(X \mid Y=y)\\ - &= \sum_y P(Y=y) \sum_x P(X=x \mid Y=y) \log P(X=x \mid Y=y) + & \entropy(X \mid Y) \\ + & = \sum_y \proba(Y=y) \entropy(X \mid Y=y) \\ + & = \sum_y \proba(Y=y) \sum_x \proba(X=x \mid Y=y) \log \proba(X=x \mid Y=y) \end{align*} -Intuitively it is the [minimum average] number of bits required to describe X given that Y is known. +Intuitively it is the [minimum average] number of bits required to describe $X$ given that $Y$ is known. -So in particular, if X and Y are independent, getting the value of $Y$ +So in particular, if $X$ and $Y$ are independent, getting the value of $Y$ does not help at all, so you still have to send all the bits for $X$, hence % \[ - H(X \mid Y)=H(X) + \entropy(X \mid Y)=\entropy(X), \] - -if X is a deterministic function of Y then +% +and if $X$ is a deterministic function of $Y$ then % \[ - H(X \mid Y)=0. + \entropy(X \mid Y)=0. \] -And if you send the bits for Y and then the bits to describe X given -that Y, you have sent (X, Y). Hence we have the chain rule: +And if you send the bits for $Y$ and then the bits to describe $X$ given +that $Y$, you have sent $(X, Y)$. Hence we have the chain rule: % \[ -H(X, Y) = H(Y) + H(X \mid Y). +\entropy(X, Y) = \entropy(Y) + \entropy(X \mid Y). \] And then we get % \begin{align*} -I(X;Y) &= H(X) + H(Y) - H(X,Y)\\ - &= H(X) + H(Y) - (H(Y) + H(X \mid Y))\\ - &= H(X) - H(X \mid Y). +I(X;Y) &= \entropy(X) + \entropy(Y) - \entropy(X,Y)\\ + &= \entropy(X) + \entropy(Y) - (\entropy(Y) + \entropy(X \mid Y))\\ + &= \entropy(X) - \entropy(X \mid Y). \end{align*} \section{Kullback-Leibler divergence} Imagine that you encode your stream thinking it comes from -distribution $q$ while it comes from $p$. You would emit more bits than -the optimal $H(p)$, and that supplement is $D_{KL}(p||q)$ the -Kullback-Leibler divergence between $p$ and $q$. +distribution $q$ while it comes from $p$. You would emit more bits +than the optimal $\entropy(p)$, and that excess of bits is +$\dkl(p||q)$ the Kullback-Leibler divergence between $p$ and $q$. In particular if $p=q$ % \[ - D_{KL}(p\|q)=0, + \dkl(p\|q)=0, \] % and if there is a symbol $x$ with $q(x)=0$ and $p(x)>0$, you cannot encode it and % \[ - D_{KL}(p\|q)=+\infty. + \dkl(p\|q)=+\infty. \] Its formal expression is % \[ -D_{KL}(p\|q) = \sum_x p(x) \log\left(\frac{p(x)}{q(x)}\right) +\dkl(p\|q) = \sum_x p(x) \log\left(\frac{p(x)}{q(x)}\right) \] % that can be understood as a value called the cross-entropy between $p$ and $q$ % \[ -H(p,q) = -\sum_x p(x) \log q(x) +\entropy(p,q) = -\sum_x p(x) \log q(x) \] % minus the entropy of p \[ -H(p) = -\sum_x p(x) \log p(x). +\entropy(p) = -\sum_x p(x) \log p(x). \] -Notation horror: if $X$ and $Y$ are random variables $H(X, Y)$ is the +Notation horror: if $X$ and $Y$ are random variables $\entropy(X, Y)$ is the entropy of their joint law, and if $p$ and $q$ are distributions, -$H(p,q)$ is the cross-entropy between them. +$\entropy(p,q)$ is the cross-entropy between them. \end{document} -- 2.20.1 From 74cdd5e14b65ac1ff03725173eb941dc7a455edf Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Mon, 12 Feb 2024 21:25:59 +0100 Subject: [PATCH 02/16] Update. 
--- inftheory.tex | 48 +++++---- randvar.tex | 284 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 314 insertions(+), 18 deletions(-) create mode 100644 randvar.tex diff --git a/inftheory.tex b/inftheory.tex index 7c34fce..954fd06 100644 --- a/inftheory.tex +++ b/inftheory.tex @@ -4,8 +4,8 @@ %% https://creativecommons.org/publicdomain/zero/1.0/ %% Written by Francois Fleuret -\documentclass[10pt,a4paper,twoside]{article} -\usepackage[paperheight=18cm,paperwidth=10cm,top=5mm,bottom=20mm,right=5mm,left=5mm]{geometry} +\documentclass[11pt,a4paper,oneside]{article} +\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry} %\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} \usepackage[utf8]{inputenc} \usepackage{amsmath,amssymb,dsfont} @@ -20,6 +20,8 @@ \usetikzlibrary{tikzmark} \usetikzlibrary{decorations.pathmorphing} \usepackage[round]{natbib} +\usepackage[osf]{libertine} +\usepackage{microtype} \usepackage{mleftright} @@ -29,7 +31,7 @@ \setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu \setlength{\parindent}{0cm} -\setlength{\parskip}{12pt} +\setlength{\parskip}{1ex} %\renewcommand{\baselinestretch}{1.3} %\setlength{\tabcolsep}{0pt} %\renewcommand{\arraystretch}{1.0} @@ -65,12 +67,17 @@ \begin{document} -\vspace*{1ex} +\vspace*{-3ex} \begin{center} {\Large Some bits of Information Theory} -\today +Fran\c cois Fleuret + +January 19, 2024 + +\vspace*{1ex} + \end{center} Information Theory is awesome so here is a TL;DR about Shannon's entropy. @@ -79,9 +86,9 @@ The field is originally about quantifying the amount of ``information'' contained in a signal and how much can be transmitted under certain conditions. -What makes it awesome IMO is that it is very intuitive, and like -thermodynamics in Physics, it gives exact bounds about what is possible -or not. +What makes it awesome is that it is very intuitive, and like +thermodynamics in Physics, it gives exact bounds about what is +possible or not. \section{Shannon's Entropy} @@ -96,8 +103,8 @@ line, you can design a coding that takes into account that the symbols are not all as probable, and decode on the other side. For instance if $\proba('\!\!A')=1/2$, $\proba('\!\!B')=1/4$, and -$\proba('\!\!C')=1/4$ you would transmit ``0'' for a ``A'' and ``10'' for a -``B'' and ``11'' for a ``C'', 1.5 bits on average. +$\proba('\!\!C')=1/4$ you would transmit ``$0$'' for a ``A'' and ``$10$'' for a +``B'' and ``$11$'' for a ``C'', 1.5 bits on average. If the symbol is always the same, you transmit nothing, if they are equiprobable you need $\log_2$(nb symbols) etc. @@ -153,11 +160,16 @@ that quantifies the amount of information shared by the two variables. Conditional entropy is the average of the entropy of the conditional distribution: % -\begin{align*} - & \entropy(X \mid Y) \\ - & = \sum_y \proba(Y=y) \entropy(X \mid Y=y) \\ - & = \sum_y \proba(Y=y) \sum_x \proba(X=x \mid Y=y) \log \proba(X=x \mid Y=y) -\end{align*} +\begin{equation*} +\entropy(X \mid Y) = \sum_y \proba(Y=y) \entropy(X \mid Y=y) +\end{equation*} +% +with +% +\begin{eqnarray*} +\entropy(X \mid Y=y) \hspace*{13.5em} \\ + = \sum_x \proba(X=x \mid Y=y) \log \proba(X=x \mid Y=y) +\end{eqnarray*} Intuitively it is the [minimum average] number of bits required to describe $X$ given that $Y$ is known. @@ -175,13 +187,13 @@ and if $X$ is a deterministic function of $Y$ then \entropy(X \mid Y)=0. 
\] -And if you send the bits for $Y$ and then the bits to describe $X$ given -that $Y$, you have sent $(X, Y)$. Hence we have the chain rule: +And if you send the bits for $Y$ and then the bits to describe $X$ +given that $Y$, you have sent $(X, Y)$, hence the chain rule: % \[ \entropy(X, Y) = \entropy(Y) + \entropy(X \mid Y). \] - +% And then we get % \begin{align*} diff --git a/randvar.tex b/randvar.tex new file mode 100644 index 0000000..6d3aae5 --- /dev/null +++ b/randvar.tex @@ -0,0 +1,284 @@ +%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*- + +%% Any copyright is dedicated to the Public Domain. +%% https://creativecommons.org/publicdomain/zero/1.0/ +%% Written by Francois Fleuret + +\documentclass[11pt,a4paper,oneside]{article} +\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry} +%\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} +\usepackage[utf8]{inputenc} +\usepackage{amsmath,amssymb,dsfont} +\usepackage[pdftex]{graphicx} +\usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue]{hyperref} +\usepackage{tikz} +\usetikzlibrary{arrows,arrows.meta,calc} +\usetikzlibrary{patterns,backgrounds} +\usetikzlibrary{positioning,fit} +\usetikzlibrary{shapes.geometric,shapes.multipart} +\usetikzlibrary{patterns.meta,decorations.pathreplacing,calligraphy} +\usetikzlibrary{tikzmark} +\usetikzlibrary{decorations.pathmorphing} +\usepackage[round]{natbib} +\usepackage[osf]{libertine} +\usepackage{microtype} + +\usepackage{mleftright} + +\newcommand{\setmuskip}[2]{#1=#2\relax} +\setmuskip{\thinmuskip}{1.5mu} % by default it is equal to 3 mu +\setmuskip{\medmuskip}{2mu} % by default it is equal to 4 mu +\setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu + +\setlength{\parindent}{0cm} +\setlength{\parskip}{1ex} +%\renewcommand{\baselinestretch}{1.3} +%\setlength{\tabcolsep}{0pt} +%\renewcommand{\arraystretch}{1.0} + +\def\argmax{\operatornamewithlimits{argmax}} +\def\argmin{\operatornamewithlimits{argmin}} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\def\given{\,\middle\vert\,} +\def\proba{\operatorname{P}} +\newcommand{\seq}{{S}} +\newcommand{\expect}{\mathds{E}} +\newcommand{\variance}{\mathds{V}} +\newcommand{\empexpect}{\hat{\mathds{E}}} +\newcommand{\mutinf}{\mathds{I}} +\newcommand{\empmutinf}{\hat{\mathds{I}}} +\newcommand{\entropy}{\mathds{H}} +\newcommand{\empentropy}{\hat{\mathds{H}}} +\newcommand{\ganG}{\mathbf{G}} +\newcommand{\ganD}{\mathbf{D}} +\newcommand{\ganF}{\mathbf{F}} + +\newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}} +\newcommand{\djs}{\mathds{D}_{\mathsf{JS}}} + +\newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}} +\newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}} + +\def\positionalencoding{\operatorname{pos-enc}} +\def\concat{\operatorname{concat}} +\def\crossentropy{\LL_{\operatorname{ce}}} + +\begin{document} + +\vspace*{0ex} + +\begin{center} +{\Large On Random Variables} + +Fran\c cois Fleuret + +\today + +\vspace*{1ex} + +\end{center} + +\underline{Random variables} are central to any model of a random +process, but their mathematical definition is unclear to most. This is +an attempt at giving an intuitive understanding of their definition +and utility. + +\section{Modeling randomness} + +To formalize something ``random'', the natural strategy is to define a +distribution, that is, in the finite case, a list of values / +probabilities. 
For instance, the head / tail result of a coin flip
+would be
+%
+\[
+\{(H, 0.5), (T, 0.5)\}.
+\]
+
+This is perfectly fine, until you have several such objects. To model
+two coins $A$ and $B$, it seems intuitively okay: they have nothing to
+do with each other, they are ``independent'', so defining how they
+behave individually is sufficient.
+
+\section{Non-independent variables}
+
+The process to generate two random values can be such that they are
+related. Consider for instance that $A$ is the result of flipping a
+coin, and $B$ is \emph{the inverse value of $A$}.
+
+Both $A$ and $B$ are legitimate RVs, and both have the same
+distribution $\{(H, 0.5), (T, 0.5)\}$. So where is the information
+that they are related?
+
+With models of the respective distributions of $A$ and $B$, this is
+nowhere. This can be fixed in some way by specifying the distribution
+of the pair $(A, B)$, which here would be
+%
+\[
+\{(H/H, 0.0), (H/T, 0.5), (T/H, 0.5), (T/T, 0.0)\}.
+\]
+
+The distributions of $A$ and $B$ individually are called the
+\underline{marginal} distributions, and this is the \underline{joint}
+distribution.
+
+Note that the joint is a far richer object than the two marginals, and
+in general many different joints are consistent with given marginals.
+Here for instance, the marginals are the same as if $A$ and $B$ were
+two independent coins, even though they are not.
+
+Even though this could somehow work, the notion of a RV here is very
+unclear: it is not simply a distribution, and every time a new one is
+defined, it requires the specification of the joint with all the
+variables already defined.
+
+\section{Random Variables}
+
+The actual definition of a RV is a bit technical. Intuitively, in some
+way, it consists of defining first ``the source of all randomness'',
+and then every RV is a deterministic function of it.
+
+Formally, it relies first on the definition of a set $\Omega$ such
+that its subsets can be measured, with all the desirable properties,
+such as $\mu(\Omega)=1, \mu(\emptyset)=0$ and $A \cap B = \emptyset
+\Rightarrow \mu(A \cup B) = \mu(A) + \mu(B)$.
+
+There is a technical point: for some $\Omega$ it may be impossible to
+define such a measure on all its subsets due to tricky
+infinity-related pathologies. So the set $\Sigma$ of
+\underline{measurable} subsets is explicitly specified and called a
+$\sigma$-algebra. In any practical situation this technicality does
+not matter, since $\Sigma$ contains anything needed.
+
+The triplet $(\Omega, \Sigma, \mu)$ is a \underline{measured set}.
+
+Given such a measured set, a \underline{random variable} $X$ is a
+mapping from $\Omega$ into another set, and the
+\underline{probability} that $X$ takes the value $x$ is the measure of
+the subset of $\Omega$ where $X$ takes the value $x$:
+%
+\[
+\proba(X=x) = \mu(X^{-1}(x)).
+\]
+
+You can imagine $\Omega$ as the square $[0,1]^2$ in $\mathbb{R}^2$
+with the usual geometrical area for $\mu$.
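+
+As a concrete sketch of this definition, take $\Omega = [0,1]^2$ with
+$\omega = (\omega_1, \omega_2)$ and $\mu$ the area. One possible
+choice for the two independent coins of the first section is
+%
+\[
+A(\omega) =
+\begin{cases}
+H & \text{if } \omega_1 < 1/2\\
+T & \text{otherwise}
+\end{cases}
+\quad
+B(\omega) =
+\begin{cases}
+H & \text{if } \omega_2 < 1/2\\
+T & \text{otherwise,}
+\end{cases}
+\]
+%
+so that for instance $\proba(A = H) = \mu([0, 1/2) \times [0,1]) =
+1/2$ and $\proba(A = H, B = H) = \mu([0, 1/2) \times [0, 1/2)) = 1/4$.
+Taking instead $B(\omega) = T$ if $\omega_1 < 1/2$ and $H$ otherwise
+makes $B$ the inverse of $A$. The particular subsets chosen do not
+matter, only their measures, as the figures below suggest.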
+ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +For instance if the two coins $A$ and $B$ are flipped independently, we +could picture possible random variables with the proper distribution +as follows: + +\nopagebreak + +\begin{tikzpicture}[scale=0.8] +\draw[pattern=north east lines] (0,0) rectangle ++(0.5,0.5); +\draw (0,0) rectangle ++(1,0.5); +\node at (2.5,0.2) {$A=\text{head}/\text{tail}$}; + +\draw[fill=red!50] (4.5, 0) rectangle ++(0.5,0.5); +\draw (4.5,0) rectangle ++(1,0.5); +\node at (7.0,0.2) {$B=\text{head}/\text{tail}$}; +\end{tikzpicture} +% + +\nopagebreak + +\begin{tikzpicture}[scale=0.600] +\draw[fill=red!50,draw=none] (0, 0) rectangle (2, 4); +\draw[draw=none,pattern=north east lines] (0, 0) rectangle (4,2); +\draw (0,0) rectangle (4,4); + +%% \draw[draw=green,thick] (0,0) rectangle ++(2,2); +%% \draw[draw=green,thick] (0.1,2.1) rectangle ++(1.8257,1.8257); +%% \draw[draw=green,thick] (2.1,0.1) rectangle ++(0.8165,0.8165); + +\end{tikzpicture} +% +\hspace*{\stretch{1}} +% +\begin{tikzpicture}[scale=0.600] +\draw[fill=red!50,draw=none] (0, 0) rectangle ++(1, 4); +\draw[fill=red!50,draw=none] (1.5, 0) rectangle ++(1, 4); +\draw[draw=none,pattern=north east lines] (0, 0.25) rectangle ++(4,0.5); +\draw[draw=none,pattern=north east lines] (0, 1.25) rectangle ++(4,0.5); +\draw[draw=none,pattern=north east lines] (0, 2.) rectangle ++(4,0.5); +\draw[draw=none,pattern=north east lines] (0, 2.5) rectangle ++(4,0.5); +\draw (0,0) rectangle (4,4); +\end{tikzpicture} +% +\hspace*{\stretch{1}} +% +\begin{tikzpicture}[scale=0.600] +\draw[fill=red!50,draw=none] (0, 0) rectangle (2, 2); +\draw[fill=red!50,draw=none] (0, 4)--(2,4)--(4,2)--(2,2)--cycle; +\draw[draw=none,pattern=north east lines] (0.5, 4)--(1.5,4)--(3.5,2)--(2.5,2)--cycle; +\draw[draw=none,pattern=north east lines] (3, 3) rectangle (4,4); +\draw[draw=none,pattern=north east lines] (0,4)--(1,3)--(0,2)--cycle; +\draw[draw=none,pattern=north east lines] (2.25,0) rectangle (3.25,2); +\draw[draw=none,pattern=north east lines] (0, 0) rectangle (2,1); +\draw (0,0) rectangle (4,4); +\end{tikzpicture} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +And if $A$ is flipped and $B$ is the inverse of $A$, possible RV would +be + +\nopagebreak + +\begin{tikzpicture}[scale=0.8] +%% \node at (3.2, 1) {Flip A and B = inverse(A)}; + +\draw[pattern=north east lines] (0,0) rectangle ++(0.5,0.5); +\draw (0,0) rectangle ++(1,0.5); +\node at (2.5,0.2) {$A=\text{head}/\text{tail}$}; + +\draw[fill=red!50] (4.5, 0) rectangle ++(0.5,0.5); +\draw (4.5,0) rectangle ++(1,0.5); +\node at (7.0,0.2) {$B=\text{head}/\text{tail}$}; +\end{tikzpicture} + +\nopagebreak + +\begin{tikzpicture}[scale=0.600] +\draw[fill=red!50] (0,0) rectangle (4,4); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 0) rectangle (2,4); +\draw (0,0) rectangle (4,4); +\end{tikzpicture} +% +\hspace*{\stretch{1}} +% +\begin{tikzpicture}[scale=0.600] +\draw[fill=red!50] (0,0) rectangle (4,4); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 0) rectangle ++(1,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (1, 0) rectangle ++(1,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (3, 0) rectangle ++(1,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 1) rectangle ++(1,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (2, 1) rectangle ++(1,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 2) 
rectangle ++(1,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (1, 3) rectangle ++(1,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (2, 3) rectangle ++(1,1); +\draw (0,0) rectangle (4,4); +\end{tikzpicture} +% +\hspace*{\stretch{1}} +% +\begin{tikzpicture}[scale=0.600] +\draw[fill=red!50] (0,0) rectangle (4,4); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 0)--(1,1)--(3,1)--(3,4)--(0,1)--cycle; +\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 3) rectangle ++(2,1); +\draw[preaction={fill=white},draw=none,pattern=north east lines] (3,0) rectangle ++(1,1); +%% \draw (0,0) grid (4,4); +\draw (0,0) rectangle (4,4); +\end{tikzpicture} + +%% Thanks to this definition, additional random variables can be defined +%% with dependency structures. For instance, if $A$ and $B$ are two +%% separate coin flipping, and then a third variable $C$ is defined by +%% rolling a dice and taking the value of $A$ if it gives $1$ and the +%% value of $B$ otherwise. + +\end{document} -- 2.20.1 From 3ebef0dc89238565da0b211cd8a03859ed9f753d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Mon, 12 Feb 2024 22:36:05 +0100 Subject: [PATCH 03/16] Update. --- randvar.tex | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/randvar.tex b/randvar.tex index 6d3aae5..01007f6 100644 --- a/randvar.tex +++ b/randvar.tex @@ -80,10 +80,10 @@ Fran\c cois Fleuret \end{center} -\underline{Random variables} are central to any model of a random -process, but their mathematical definition is unclear to most. This is -an attempt at giving an intuitive understanding of their definition -and utility. +\underline{Random variables} (RVs) are central to any model of a +random phenomenon, but their mathematical definition is unclear to +most. This is an attempt at giving an intuitive understanding of their +definition and utility. \section{Modeling randomness} -- 2.20.1 From 119ad14a2072217edf3e2315154614815b72ccbd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Sat, 24 Feb 2024 09:06:51 +0100 Subject: [PATCH 04/16] Update. --- elbo.tex | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 elbo.tex diff --git a/elbo.tex b/elbo.tex new file mode 100644 index 0000000..6875ddf --- /dev/null +++ b/elbo.tex @@ -0,0 +1,140 @@ +%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*- + +%% Any copyright is dedicated to the Public Domain. 
+%% https://creativecommons.org/publicdomain/zero/1.0/ +%% Written by Francois Fleuret + +\documentclass[11pt,a4paper,oneside]{article} +\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry} +%\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{amsmath,amssymb,dsfont} +\usepackage[pdftex]{graphicx} +\usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue]{hyperref} +\usepackage{tikz} +\usetikzlibrary{arrows,arrows.meta,calc} +\usetikzlibrary{patterns,backgrounds} +\usetikzlibrary{positioning,fit} +\usetikzlibrary{shapes.geometric,shapes.multipart} +\usetikzlibrary{patterns.meta,decorations.pathreplacing,calligraphy} +\usetikzlibrary{tikzmark} +\usetikzlibrary{decorations.pathmorphing} +\usepackage[round]{natbib} +\usepackage[osf]{libertine} +\usepackage{microtype} +\usepackage{fancyvrb} + +\usepackage{mleftright} + +\newcommand{\setmuskip}[2]{#1=#2\relax} +\setmuskip{\thinmuskip}{1.5mu} % by default it is equal to 3 mu +\setmuskip{\medmuskip}{2mu} % by default it is equal to 4 mu +\setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu + +\setlength{\parindent}{0cm} +\setlength{\parskip}{1ex} +%\renewcommand{\baselinestretch}{1.3} +%\setlength{\tabcolsep}{0pt} +%\renewcommand{\arraystretch}{1.0} + +\def\argmax{\operatornamewithlimits{argmax}} +\def\argmin{\operatornamewithlimits{argmin}} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\def\given{\,\middle\vert\,} +\def\proba{\operatorname{P}} +\newcommand{\seq}{{S}} +\newcommand{\expect}{\mathds{E}} +\newcommand{\variance}{\mathds{V}} +\newcommand{\empexpect}{\hat{\mathds{E}}} +\newcommand{\mutinf}{\mathds{I}} +\newcommand{\empmutinf}{\hat{\mathds{I}}} +\newcommand{\entropy}{\mathds{H}} +\newcommand{\empentropy}{\hat{\mathds{H}}} +\newcommand{\ganG}{\mathbf{G}} +\newcommand{\ganD}{\mathbf{D}} +\newcommand{\ganF}{\mathbf{F}} + +\newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}} +\newcommand{\djs}{\mathds{D}_{\mathsf{JS}}} + +\allowdisplaybreaks[2] + +\newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}} +\newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}} + +\def\positionalencoding{\operatorname{pos-enc}} +\def\concat{\operatorname{concat}} +\def\crossentropy{\LL_{\operatorname{ce}}} + +\begin{document} + +\vspace*{0ex} + +\begin{center} +{\Large The Evidence Lower Bound} + +Fran\c cois Fleuret + +\today + +\vspace*{1ex} + +\end{center} + +Given a training set $x_1, \dots, x_N$ that follows an unknown +distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it, +maximizing +% +\[ +\sum_n \log \, p_\theta(x_n). +\] +% +If we do not have a analytical form of the marginal $p_\theta(x_n)$ +but only the expression of $p_\theta(x_n,z)$, we can get an estimate +of the marginal by sampling $z$ with any distribution $q$ +% +\begin{align*} +p_\theta(x_n) & = \int_z p_\theta(x_n,z) dz \\ + & = \int_z \frac{p_\theta(x_n,z)}{q(z)} q(z) dz \\ + & = \expect_{Z \sim q(z)} \left[\frac{p_\theta(x_n,Z)}{q(Z)}\right]. +\end{align*} +% +So if we wanted to maximize $p_\theta(x_n)$ alone, we could sample a +$Z$ with $q$ and maximize +% +\begin{equation*} +\frac{p_\theta(x_n,Z)}{q(Z)}.\label{eq:estimator} +\end{equation*} + +But we want to maximize $\sum_n \log \, p_\theta(x_n)$. 
If we use the +$\log$ of the previous expression, we can decompose its average value +as +\begin{align*} + & \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(x_n,Z)}{q(Z)} \right] \\ + & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n) \, p_\theta(x_n)}{q(Z)} \right] \\ + & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n)}{q(Z)} \right] + \log \, p_\theta(x_n) \\ + & = - \dkl(q(z) \, \| \, p_\theta(z \mid x_n)) + \log \, p_\theta(x_n). +\end{align*} +% +Hence this does not maximize $\log \, p_\theta(x_n)$ on average, but a +\emph{lower bound} of it, since the KL divergence is non-negative. And +since this maximization pushes that KL term down, it also aligns +$p_\theta(z \mid x_n)$ and $q(z)$, and we may get a worse +$p_\theta(x_n)$ to bring $p_\theta(z \mid x_n)$ closer to $q(z)$. + +However, all this analysis is still valid if $q$ is a parameterized +function $q_\alpha(z \mid x_n)$ of $x_n$. In that case, if we optimize +$\theta$ and $\alpha$ to maximize +% +\[ +\expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right], +\] +% +it maximizes $\log \, p_\theta(x_n)$ and brings $q_\alpha(z \mid +x_n)$ close to $p_\theta(z \mid x_n)$. + + +\end{document} -- 2.20.1 From 43b0cb04eae4537d95775038d9e700e642087d6d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Sat, 24 Feb 2024 12:11:36 +0100 Subject: [PATCH 05/16] Update. --- elbo.tex | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/elbo.tex b/elbo.tex index 6875ddf..239a657 100644 --- a/elbo.tex +++ b/elbo.tex @@ -71,16 +71,23 @@ \begin{document} -\vspace*{0ex} +\setlength{\abovedisplayskip}{2ex} +\setlength{\belowdisplayskip}{2ex} +\setlength{\abovedisplayshortskip}{2ex} +\setlength{\belowdisplayshortskip}{2ex} + +\vspace*{-4ex} \begin{center} {\Large The Evidence Lower Bound} +\vspace*{1ex} + Fran\c cois Fleuret \today -\vspace*{1ex} +\vspace*{-1ex} \end{center} @@ -102,12 +109,14 @@ p_\theta(x_n) & = \int_z p_\theta(x_n,z) dz \\ & = \expect_{Z \sim q(z)} \left[\frac{p_\theta(x_n,Z)}{q(Z)}\right]. \end{align*} % -So if we wanted to maximize $p_\theta(x_n)$ alone, we could sample a +So if we sample a $Z$ with $q$ and maximize % \begin{equation*} -\frac{p_\theta(x_n,Z)}{q(Z)}.\label{eq:estimator} +\frac{p_\theta(x_n,Z)}{q(Z)}, \end{equation*} +% +we do maximize $p_\theta(x_n)$ on average. But we want to maximize $\sum_n \log \, p_\theta(x_n)$. If we use the $\log$ of the previous expression, we can decompose its average value -- 2.20.1 From 44313fda41b14cbb410ee9aa1363b0e4ff18f0b7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Sun, 25 Feb 2024 09:58:14 +0100 Subject: [PATCH 06/16] Update. --- elbo.tex | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/elbo.tex b/elbo.tex index 239a657..175019c 100644 --- a/elbo.tex +++ b/elbo.tex @@ -91,15 +91,15 @@ Fran\c cois Fleuret \end{center} -Given a training set $x_1, \dots, x_N$ that follows an unknown -distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it, -maximizing +Given a training i.i.d train samples $x_1, \dots, x_N$ that follows an +unknown distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ +to it, maximizing % \[ \sum_n \log \, p_\theta(x_n). 
\] % -If we do not have a analytical form of the marginal $p_\theta(x_n)$ +If we do not have an analytical form of the marginal $p_\theta(x_n)$ but only the expression of $p_\theta(x_n,z)$, we can get an estimate of the marginal by sampling $z$ with any distribution $q$ % -- 2.20.1 From 05c0721d2f8b578a8a27ed2085dc9812d2249f88 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Sun, 25 Feb 2024 10:21:37 +0100 Subject: [PATCH 07/16] Update. --- elbo.tex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/elbo.tex b/elbo.tex index 175019c..fe91565 100644 --- a/elbo.tex +++ b/elbo.tex @@ -91,9 +91,9 @@ Fran\c cois Fleuret \end{center} -Given a training i.i.d train samples $x_1, \dots, x_N$ that follows an -unknown distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ -to it, maximizing +Given i.i.d training samples $x_1, \dots, x_N$ that follows an unknown +distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it, +maximizing % \[ \sum_n \log \, p_\theta(x_n). -- 2.20.1 From 4b8c58903baa9ff8c508bda798492e10dde9cb7f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Wed, 28 Feb 2024 08:19:50 +0100 Subject: [PATCH 08/16] Update. --- elbo.tex | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/elbo.tex b/elbo.tex index fe91565..4c6cb24 100644 --- a/elbo.tex +++ b/elbo.tex @@ -76,24 +76,25 @@ \setlength{\abovedisplayshortskip}{2ex} \setlength{\belowdisplayshortskip}{2ex} -\vspace*{-4ex} +\vspace*{-3ex} \begin{center} {\Large The Evidence Lower Bound} -\vspace*{1ex} +\vspace*{2ex} Fran\c cois Fleuret +%% \vspace*{2ex} + \today -\vspace*{-1ex} +%% \vspace*{-1ex} \end{center} -Given i.i.d training samples $x_1, \dots, x_N$ that follows an unknown -distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it, -maximizing +Given i.i.d training samples $x_1, \dots, x_N$ we want to fit a model +$p_\theta(x,z)$ to it, maximizing % \[ \sum_n \log \, p_\theta(x_n). @@ -134,6 +135,8 @@ since this maximization pushes that KL term down, it also aligns $p_\theta(z \mid x_n)$ and $q(z)$, and we may get a worse $p_\theta(x_n)$ to bring $p_\theta(z \mid x_n)$ closer to $q(z)$. +\medskip + However, all this analysis is still valid if $q$ is a parameterized function $q_\alpha(z \mid x_n)$ of $x_n$. In that case, if we optimize $\theta$ and $\alpha$ to maximize @@ -145,5 +148,4 @@ $\theta$ and $\alpha$ to maximize it maximizes $\log \, p_\theta(x_n)$ and brings $q_\alpha(z \mid x_n)$ close to $p_\theta(z \mid x_n)$. - \end{document} -- 2.20.1 From 5c3ff032a4d2fc50d96f8f94672086ddde45ca75 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Sat, 2 Mar 2024 01:04:42 +0100 Subject: [PATCH 09/16] Update. --- elbo.tex | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/elbo.tex b/elbo.tex index 4c6cb24..563ec3c 100644 --- a/elbo.tex +++ b/elbo.tex @@ -148,4 +148,20 @@ $\theta$ and $\alpha$ to maximize it maximizes $\log \, p_\theta(x_n)$ and brings $q_\alpha(z \mid x_n)$ close to $p_\theta(z \mid x_n)$. +\medskip + +A point that may be important in practice is +% +\begin{align*} + & \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right] \\ + & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n \mid Z) p_\theta(Z)}{q_\alpha(Z \mid x_n)} \right] \\ + & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \, p_\theta(x_n \mid Z) \right] \\ + & \hspace*{7em} - \dkl(q_\alpha(z \mid x_n) \, \| \, p_\theta(z)). 
+\end{align*} +% +This form is useful because for certain $p_\theta$ and $q_\alpha$, for +instance if they are Gaussian, the KL term can be computed exactly +instead of through sampling, which removes one source of noise in the +optimization process. + \end{document} -- 2.20.1 From 00ccb7a22366144caa8278b72f62ea2b5f331d8e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Thu, 25 Apr 2024 08:29:45 +0200 Subject: [PATCH 10/16] Update. --- dlscore.tex | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 dlscore.tex diff --git a/dlscore.tex b/dlscore.tex new file mode 100644 index 0000000..b72c511 --- /dev/null +++ b/dlscore.tex @@ -0,0 +1,163 @@ +%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*- + +\documentclass[11pt,a4paper,twocolumn,twoside]{article} +\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} +\usepackage[utf8]{inputenc} +\usepackage{cmbright} + +\begin{document} + +\noindent One point per item if you know precisely the meaning of the +listed word(s) + +\section{Machine Learning} + +\begin{enumerate} + + \item VC dimension + \item over-fitting, under-fitting + \item logistic regression + \item Q-value + \item kernel trick + \item boosting + \item feature design + \item linear regression + \item expectation-maximization, GMM + \item SVM + \item Bellman equation + \item decision tree + \item train/validation/test sets + \item naive Bayesian model + \item autoregressive model + \item bias-variance dilemma + \item policy gradient + \item random forest + \item k-NN + \item perceptron algorithm + +\end{enumerate} + + +\section{Deep-Learning} + +\begin{enumerate} + + \item Adam + \item softmax + \item residual connections + \item autograd + \item ReLU + \item dropout + \item CLIP + \item Xavier's initialisation + \item Vanishing gradient + \item LeNet + \item ViT + \item transposed convolution layer + \item checkpoint (during the forward pass) + \item minibatch + \item masked model + \item supervised / unsupervised + \item data augmentation + \item attention block + \item SGD + \item batchnorm + \item gradient clipping + \item tokenizer + \item VAE + \item weight decay + \item GELU + \item LSTM, GRU + \item GAN + \item resnet + \item straight-through estimator + \item convolution layer + \item pre-training / fine-tuning + \item perplexity + \item logits + \item cls token + \item forward pass + \item Transformer (original one), GPT + \item backward pass + \item autoencoder, denoising autoencoder + \item layer norm + \item GNN + \item diffusion model + \item cross-entropy + \item max pooling, average pooling + \item RNN + \item contrastive loss + \item positional encoding + \item causal model + \item attention layer + \item SSL + \item MSE + \item positional encoding + \item tensor + +\end{enumerate} + +\section{Math} + +\begin{enumerate} + \item Hessian + \item random variable + \item matrix + \item entropy, mutual information + \item dot product + \item mean, variance + \item L2 norm + \item chain rule (differentiation) + \item Fourier transform + \item continuity, Lipschitz continuity + \item chain rule (probability) + \item polynomial + \item Cantor's diagonal argument + \item Jacobian + \item linear operator + \item gradient + \item Bayes' thorem + \item vector + \item joint law, product law + \item Gaussian distribution + \item distribution + \item determinant, rank + \item eigen-decomposition, svd + \item maximum likelihood + \item 
Central Limit Theorem + +\end{enumerate} + +\section{Compute Science} + +\begin{enumerate} + + \item polymorphism + \item recursion + \item value passed by reference + \item binary search + \item quick sort + \item parallel scan + \item mutability + \item Turing machine + \item FP32 + \item iterator + \item interpreter, compiler + \item anonymous function + \item set + \item binary heap + \item mutex + \item cache memory + \item scope of a variable or function + \item dynamic programming + \item hash table + \item big-O notation + \item Turing complete + \item class inheritance + \item closure + \item loop unrolling + \item complexity + +\end{enumerate} + +\end{document} -- 2.20.1 From d3ec58e881d629993d490e7b6b3a6a5f7492fc8b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Thu, 25 Apr 2024 08:29:58 +0200 Subject: [PATCH 11/16] Update. --- dlscore.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlscore.tex b/dlscore.tex index b72c511..ae997f5 100644 --- a/dlscore.tex +++ b/dlscore.tex @@ -49,7 +49,7 @@ listed word(s) \item ReLU \item dropout \item CLIP - \item Xavier's initialisation + \item Xavier's initialization \item Vanishing gradient \item LeNet \item ViT -- 2.20.1 From 449f7b43b1fb2b9e30cf099c02037b1dc51276c4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Thu, 25 Apr 2024 08:35:05 +0200 Subject: [PATCH 12/16] Update. --- dlscore.tex | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dlscore.tex b/dlscore.tex index ae997f5..5097381 100644 --- a/dlscore.tex +++ b/dlscore.tex @@ -1,7 +1,7 @@ %% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*- \documentclass[11pt,a4paper,twocolumn,twoside]{article} -\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} +\usepackage[a4paper,top=2cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} \usepackage[utf8]{inputenc} \usepackage{cmbright} @@ -128,10 +128,12 @@ listed word(s) \end{enumerate} -\section{Compute Science} +\section{Computer Science} \begin{enumerate} +%% \itemsep0em + \item polymorphism \item recursion \item value passed by reference -- 2.20.1 From cc8788660c8f69048778d7bc5781100b7a54fbe8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Thu, 25 Apr 2024 08:38:15 +0200 Subject: [PATCH 13/16] Update. --- dlscore.tex | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dlscore.tex b/dlscore.tex index 5097381..e23cf19 100644 --- a/dlscore.tex +++ b/dlscore.tex @@ -100,6 +100,7 @@ listed word(s) \section{Math} \begin{enumerate} + \item Hessian \item random variable \item matrix @@ -132,8 +133,6 @@ listed word(s) \begin{enumerate} -%% \itemsep0em - \item polymorphism \item recursion \item value passed by reference -- 2.20.1 From 6f02a4dbc2799135ef135da72a1c1f83b690c9e5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Thu, 25 Apr 2024 08:40:09 +0200 Subject: [PATCH 14/16] Update. --- dlscore.tex | 1 - 1 file changed, 1 deletion(-) diff --git a/dlscore.tex b/dlscore.tex index e23cf19..dd742d1 100644 --- a/dlscore.tex +++ b/dlscore.tex @@ -92,7 +92,6 @@ listed word(s) \item attention layer \item SSL \item MSE - \item positional encoding \item tensor \end{enumerate} -- 2.20.1 From 321e2a37cbb0e723e6a768541d0d793fa68b2faa Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Thu, 25 Apr 2024 08:54:05 +0200 Subject: [PATCH 15/16] Update. 
--- dlscore.tex | 1 + 1 file changed, 1 insertion(+) diff --git a/dlscore.tex b/dlscore.tex index dd742d1..743ad62 100644 --- a/dlscore.tex +++ b/dlscore.tex @@ -82,6 +82,7 @@ listed word(s) \item autoencoder, denoising autoencoder \item layer norm \item GNN + \item learning rate schedule \item diffusion model \item cross-entropy \item max pooling, average pooling -- 2.20.1 From cc493838b758b04d940b4cf7f57deee9b12548d4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Fleuret?= Date: Thu, 25 Apr 2024 09:01:36 +0200 Subject: [PATCH 16/16] Update. --- dlscore.tex | 1 + 1 file changed, 1 insertion(+) diff --git a/dlscore.tex b/dlscore.tex index 743ad62..6fd06ac 100644 --- a/dlscore.tex +++ b/dlscore.tex @@ -20,6 +20,7 @@ listed word(s) \item Q-value \item kernel trick \item boosting + \item PCA \item feature design \item linear regression \item expectation-maximization, GMM -- 2.20.1