\setchapterpreamble[u]{\margintoc}
\chapter{Maximum likelihood estimation, application to exponential models}
\label{chap:maximum_likelihood_estimation}
\emph{Maximum likelihood estimation} is a fundamental and very general approach for statistical inference of model parameters.
In this chapter, we consider a statistical experiment with data $X : \Omega \go \cX$ and model $\{ P_\theta : \theta \in \Theta \}$ with $\Theta \subset \R^d$ dominated by a $\sigma$-finite measure $\mu$ on $\cX$, so that we can define the family of densities
\begin{equation*}
\marginnote{This uses the Radon-Nikodym theorem}
f_\theta(x) = \frac{d P_\theta}{d \mu}(x)
\end{equation*}
on $\cX$, see Definition~\ref{def:dominated-model} from Chapter~\ref{chap:statistical_models}.
\begin{definition}
The \emph{likelihood} function $L : \Theta \go \R^+$ is defined as
\begin{equation*}
L(\theta) := L(\theta; X) = f_\theta(X).
\end{equation*}
We also introduce the \emph{log-likelihood} $\ell : \Theta \go \R$ given by
\begin{equation*}
\ell(\theta) := \ell(\theta; X) := \log f_\theta(X)
\end{equation*}
whenever $f_\theta(X) > 0$ almost surely for all $\theta \in \Theta$.
\end{definition}
The likelihood and log-likelihood are random functions since they depend on the data $X$.
\paragraph{Maximum likelihood estimation.} % (fold)
We want to infer $\theta$, namely we want to find $\theta_0 \in \Theta$ such that $X \sim P_{\theta_0}$.
Given the data $X$, the likelihood $L(\theta; X)$ is the ``probability'' of observing $X$ when the parameter is $\theta$.
So, in order to find $\theta_0$, it is natural to look for $\theta$ that maximizes $\theta \mapsto L(\theta)$ (or equivalently $\theta \mapsto \ell(\theta)$) on $\Theta$, since such a $\theta$ would \emph{maximize the probability of observing} $X$ (since we do observe it).
This simple principle is fundamental and motivates the following definition.
\begin{definition}[Maximum Likelihood Estimator]
We say that $\wh \theta \in \Theta$ is a \emph{maximum likelihood estimator} (MLE) if
\begin{equation*}
L(\wh \theta; X) = \sup_{\theta \in \Theta} L(\theta; X),
\end{equation*}
or equivalently that
\begin{equation*}
\wh \theta \in \argmax_{\theta \in \Theta} L(\theta; X).
\end{equation*}
\end{definition}
Whenever it exists, an MLE $\wh \theta$ depends on $X$ and on the choice of the model $\{ f_\theta : \theta \in \Theta \}$.
If $X = (X_1, \ldots, X_n)$ with $X_1, \ldots, X_n$ iid with density $f_\theta$, then
\begin{equation*}
L_n(\theta) = L(\theta; X) = L(\theta; X_1, \ldots, X_n) = \prod_{i=1}^n f_\theta(X_i)
\end{equation*}
and
\begin{equation*}
\ell_n(\theta) = \ell(\theta; X) = \ell(\theta; X_1, \ldots, X_n) = \sum_{i=1}^n \log f_\theta(X_i).
\end{equation*}
In this case we say that $L$ and $\ell$ are the likelihood and log-likelihood functions of the \emph{$n$-sampled experiment}.
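For instance, if $X_1, \ldots, X_n$ are iid with the exponential density $f_\lambda(x) = \lambda e^{-\lambda x}$ for $x \geq 0$, where $\lambda \in \Theta = (0, +\infty)$, then
\begin{equation*}
\ell_n(\lambda) = n \log \lambda - \lambda \sum_{i=1}^n X_i,
\end{equation*}
which is strictly concave with derivative $\ell_n'(\lambda) = n / \lambda - \sum_{i=1}^n X_i$, so that the MLE exists, is unique and is given by $\wh \lambda_n = n / \sum_{i=1}^n X_i = 1 / \wb X_n$, where $\wb X_n = n^{-1} \sum_{i=1}^n X_i$.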
The existence and uniqueness of the maximum likelihood estimator is not guaranteed in general (even in non-pathological examples).%
\marginnote{In the next Chapter, we will see an example where the maximum likelihood estimator of logistic regression (a widely used model for classification) does not exist when the data is linearly separable}
\section{A theoretical motivation} % (fold)
\label{sec:a_theoretical_motivation}
Let $X_1, \ldots, X_n$ be iid with distribution $P_{\theta_0} = f_{\theta_0} \cdot \mu$.
Assuming $\E_{\theta_0} | \log f_{\theta}(X_1) | < +\infty$ for any $\theta \in \Theta$,
we have that
\begin{equation}
\label{eq:likelihood-kullback-limit}
\begin{split}
\frac 1n \ell_n(\theta_0) - \frac 1n \ell_n(\theta) &= \frac 1n \sum_{i=1}^n \big(
\log f_{\theta_0}(X_i) - \log f_{\theta}(X_i) \big) \\
\marginnote{This convergence holds in $P_{\theta_0}$-probability using the law of large numbers}
& \gopro \E_{\theta_0} \big[ \log f_{\theta_0}(X_1) - \log f_{\theta}(X_1) \big] \\
&= h(P_{\theta_0}, P_\theta),
\end{split}
\end{equation}
where we introduce the quantity
\begin{equation*}
h(P_{\theta_0}, P_\theta) = \int_{\cX} \log \Big(
\frac{f_{\theta_0}(x)}{f_{\theta}(x)} \Big) f_{\theta_0}(x) \mu(dx)
\end{equation*}
which is called the \emph{relative entropy} between $P_{\theta_0}$ and $P_\theta$.
This is a fundamental quantity which deserves a definition.
\begin{definition}
\label{def:relative-entropy}
Let $P$ and $Q$ be two probability measures on a measurable space $(\Omega, \cA)$.
The quantity given by
\begin{equation*}
h(P, Q) = \E_P \Big[ \log \Big( \frac{dP}{dQ} \Big) \Big] = \int \log \Big( \frac{dP}{dQ}(\omega) \Big) P(d \omega)
\end{equation*}
when $P \ll Q$ and $h(P, Q) = +\infty$ otherwise is called the \emph{relative entropy} between $P$ and $Q$.
It is also called the \emph{Kullback-Leibler divergence} or the \emph{information divergence}.
\end{definition}
Let us give some properties of $h(P, Q)$.
First of all, if $P \ll Q$ then
\begin{equation*}
h(P, Q) = \E_P \Big[\log \frac{dP}{dQ} \Big] = \E_Q \Big[\frac{dP}{dQ}
\log \frac{dP}{dQ} \Big].
\end{equation*}
It is always well defined, although possibly equal to $+\infty$, since $x \log x \geq -e^{-1}$ for any $x \in (0, +\infty)$, so that the negative part of the integrand above is bounded.
Also, if $P \ll Q$ then
\begin{equation*}
\marginnote{using Jensen's inequality}
h(P, Q) = \E_Q\Big[ \frac{dP}{dQ} \log \frac{dP}{dQ} \Big]
\geq \E_Q\Big[ \frac{dP}{dQ} \Big] \log \E_Q\Big[ \frac{dP}{dQ} \Big]
= 0,
\end{equation*}
and note also that $h(P, Q) = 0 \Leftrightarrow P = Q$ since $\phi(x) = x \log x$ is strictly convex.
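For instance, if $P$ and $Q$ are the Bernoulli distributions on $\{0, 1\}$ with parameters $p, q \in (0, 1)$, namely $P[\{1\}] = p$ and $Q[\{1\}] = q$, then
\begin{equation*}
h(P, Q) = p \log \frac{p}{q} + (1 - p) \log \frac{1 - p}{1 - q},
\end{equation*}
which is indeed non-negative and equal to zero if and only if $p = q$.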
Using~\eqref{eq:likelihood-kullback-limit}, we have that
$(\ell_n(\theta_0) - \ell_n(\theta)) / n \approx h(P_{\theta_0}, P_\theta)$ for $n$ large, and as explained above
$h(P_{\theta_0}, P_\theta) = 0$ iff $P_{\theta_0} = P_{\theta}$, namely iff $\theta = \theta_0$ whenever the model is identifiable (see Definition~\ref{def:identifiable-model} from Chapter~\ref{chap:statistical_models}).
This motivates the use of the maximum likelihood estimator: maximizing $\ell_n(\theta)$ means minimizing $(\ell_n(\theta_0) - \ell_n(\theta)) / n$, which we expect to be small when $\theta \approx \theta_0$, at least when the model is identifiable.
The MLE is a very general principle that can be used for virtually any statistical model.
Its theoretical study requires smoothness assumptions on the family of densities $f_\theta$ regarded as functions of the parameter $\theta$.
In this Chapter, we study the MLE in the specific family of \emph{exponential models} for two reasons: exponential models contain most parametric models that are of interest in practice, and their study is interesting by itself, since they are the basis of generalized linear models that we will study in the next Chapter.
We will see that when $\{ f_\theta : \theta \in \Theta \}$ is an exponential model, the MLE is easy to study, since it corresponds to another estimation approach called \emph{method of moments}.
But, keep in mind that MLE goes way beyond the setting considered here.
\begin{example}
Let us consider $X \sim \gam(a, \lambda)$, namely $X$ has density
\begin{equation*}
f_{a, \lambda}(x) = \frac{\lambda^a}{\Gamma(a)} x^{a - 1} e^{-\lambda x} = \exp\big( \theta^\top T(x) - \log Z(\theta) \big)
\end{equation*}
for $x > 0$, with $\theta = [a - 1 \; \lambda]^\top$, $T(x) = [\log x \; -x]^\top$ and $Z(\theta) = \Gamma(a) / \lambda^a$.
This is an example of a so-called \emph{exponential model}. Most parametric distributions can be written in this way (Poisson, Exponential, Binomial, Gaussian, etc.), using if necessary a reparametrization (we defined $\theta = [a - 1 \; \lambda]^\top$ in this example).
\end{example}
\section{Exponential models} % (fold)
\label{sec:exponential_models}
Let us first describe the so-called \emph{canonical exponential model}.
\begin{definition}[Canonical exponential model]
\label{def:canonical-exponential-model}
Let $\mu$ be a $\sigma$-finite measure on a measurable space $\cX$ and let $T : \cX \go \R^d$ be a measurable function.
We define
\begin{equation*}
\Theta_{\dom} = \Big \{ \theta \in \R^d : Z(\theta) := \int_{\cX} e^{\theta^\top T(x)} \mu(dx) < +\infty \Big\}
\end{equation*}
and $\Theta = \inte(\Theta_{\dom})$, the interior of $\Theta_{\dom}$.
We introduce the density
\begin{equation*}
f_\theta(x) = \exp\big(\theta^\top T(x) - \log Z(\theta) \big)
\end{equation*}
with respect to $\mu$ for $\theta \in \Theta$ and define $P_\theta = f_\theta \cdot \mu$.
The family $\{ P_\theta : \theta \in \Theta \}$ is called a \emph{canonical exponential model} and the function $\theta \mapsto Z(\theta)$ is called the \emph{partition function} of the model.
\marginnote{We discussed sufficient statistics in Section~\ref{sec:statistics} of Chapter~\ref{chap:statistical_models}}
Also, we call $T$ the \emph{sufficient statistic} of the model.
\end{definition}
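For instance, take $\cX = \{0, 1\}$, $\mu$ the counting measure on $\cX$ and $T(x) = x$. Then $Z(\theta) = 1 + e^{\theta} < +\infty$ for any $\theta \in \R$, so that $\Theta_{\dom} = \Theta = \R$, $\log Z(\theta) = \log(1 + e^{\theta})$ and
\begin{equation*}
f_\theta(x) = \exp\big( \theta x - \log(1 + e^{\theta}) \big),
\end{equation*}
which is the density of the Bernoulli distribution with parameter $p = e^{\theta} / (1 + e^{\theta})$, namely $\theta = \log(p / (1 - p))$: the Bernoulli model is a canonical exponential model with sufficient statistic $T(x) = x$.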
We consider $\{ P_\theta : \theta \in \Theta \}$ instead of $\{ P_\theta : \theta \in \Theta_{\dom} \}$ since we will perform differential calculus and use the inversion theorem on this open domain.
\begin{proposition}
The set $\Theta_{\dom}$ is convex (if it is not empty) and the function $\Theta_{\dom} \go \R$ defined by $\theta \mapsto \log Z(\theta)$ is convex.
\end{proposition}
\begin{proof}
Note that if $\theta_1, \theta_2 \in \Theta_{\dom}$ and $\alpha \in [0, 1]$ we have
\begin{align*}
Z(\alpha \theta_1 + (1 - \alpha) \theta_2) &= \int_{\cX}
\big( e^{\theta_1^\top T(x)} \big)^\alpha
\big( e^{\theta_2^\top T(x)} \big)^{1 - \alpha} \mu(dx) \\
\marginnote{Using H\"older's inequality}
&\leq \Big( \int_{\cX} e^{\theta_1^\top T(x)} \mu(dx) \Big)^{\alpha} \\
& \quad \times \Big(\int_{\cX} e^{\theta_2^\top T(x)} \mu(dx) \Big)^{1 - \alpha} < +\infty
\end{align*}
which proves that $\alpha \theta_1 + (1 - \alpha) \theta_2 \in \Theta_{\dom}$ and also that $\log Z$ is convex since we have
\begin{equation*}
\log Z(\alpha \theta_1 + (1 - \alpha) \theta_2) \leq \alpha \log Z(\theta_1) + (1 - \alpha) \log Z(\theta_2). \qedhere
\end{equation*}
\end{proof}
\begin{definition}[Canonical and minimal exponential model]
\label{def:canonical-minimal-exponential-model}
We say that a canonical exponential model is \emph{minimal} if $T(X)$ does not belong, $P_\theta$-almost surely for all $\theta \in \Theta$, to any affine hyperplane $H \subset \R^d$, namely if
\begin{equation*}
P_\theta[ \{ x \in \cX : T(x) \in H \}] < 1
\end{equation*}
for any affine hyperplane $H$ and any $\theta \in \Theta$.
\end{definition}
\begin{proposition}
If a canonical exponential model is minimal, then it is identifiable.
\end{proposition}
\begin{proof}
Let us consider $\theta_1, \theta_2 \in \Theta$ such that $\theta_1 \neq \theta_2$ and such that $P_{\theta_1} = P_{\theta_2}$.
This entails $f_{\theta_1}(x) = f_{\theta_2}(x)$ for $\mu$-almost every $x \in \cX$ and in particular that
\begin{equation*}
(\theta_1 - \theta_2)^\top T(X) - \log (Z(\theta_1) / Z(\theta_2)) = 0
\end{equation*}
$P_\theta$-almost surely for any $\theta \in \Theta$, which contradicts the fact that the model is minimal according to Definition~\ref{def:canonical-minimal-exponential-model}, with the affine hyperplane $H = \{ t \in \R^d : (\theta_1 - \theta_2)^\top t = \log (Z(\theta_1) / Z(\theta_2)) \}$. \qedhere
\end{proof}
From now on, we suppose that the model is \emph{minimal}.
It is a natural assumption: it means that the coordinates of the sufficient statistic $T(X)$ do not satisfy, almost surely, a non-trivial affine relation.
Let us recall also that $\Theta = \inte(\Theta_{\dom}) \neq \emptyset$.
\begin{theorem}
\label{thm:canonical-exponential-model-moments}
Consider a canonical exponential model. The function $\theta \mapsto \log Z(\theta)$ is $C^\infty$ on $\Theta$ and we have
\begin{equation*}
\E_\theta[ |T_j(X)|^k ] < +\infty
\end{equation*}
for any $j=1, \ldots, d$, any $k \in \N$ (all the moments of $T(X)$ with $X \sim P_\theta$ are finite) and any $\theta \in \Theta$.
Furthermore, the following relations
\begin{equation*}
\marginnote{$\grad F(\theta)$ is the gradient of $F$ at $\theta$ while $\grad^2 F(\theta)$ is the Hessian matrix of $F$ at $\theta$}
\grad \log Z(\theta) = \E_\theta[T(X)] \quad \text{and} \quad \grad^2 \log Z(\theta) = \var_\theta[T(X)]
\end{equation*}
hold for any $\theta \in \Theta$.
\end{theorem}
The proof of Theorem~\ref{thm:canonical-exponential-model-moments} is left as an exercise: one only needs to use dominated convergence to interchange differentiation and expectation.
Let us just do the following computation, which explains why the first moment of the sufficient statistic is equal to the gradient of $\log Z$:
\begin{align*}
\marginnote{We use the notation $X \sim \mu$ to indicate that we integrate with respect to $\mu$ even if $\mu$ is not a probability measure}
\grad \log Z(\theta) &= \grad \log \E_{X \sim \mu} \big[ \exp(\theta^\top T(X)) \big] \\
&= \frac{\E_{X \sim \mu} \big[ T(X) \exp(\theta^\top T(X)) \big]}{\E_{X \sim \mu} \big[ \exp(\theta^\top T(X)) \big]} \\
&= \E_{\theta} [T(X)],
\end{align*}
where we just used the definition of $P_\theta$ in the last equality.
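For instance, in the Bernoulli model in canonical form, where $\cX = \{0, 1\}$, $T(x) = x$ and $\log Z(\theta) = \log(1 + e^{\theta})$, we have
\begin{equation*}
\grad \log Z(\theta) = \frac{e^{\theta}}{1 + e^{\theta}} = p = \E_\theta[T(X)]
\quad \text{and} \quad
\grad^2 \log Z(\theta) = \frac{e^{\theta}}{(1 + e^{\theta})^2} = p(1 - p) = \var_\theta[T(X)],
\end{equation*}
where $p = e^{\theta} / (1 + e^{\theta})$, in accordance with Theorem~\ref{thm:canonical-exponential-model-moments}.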
\begin{corollary}
\label{cor:exponential-model-minimal-positive-hessian}
We have $\grad^2 \log Z(\theta) \succ 0$ for all $\theta \in \Theta$ iff the model is minimal.
\end{corollary}
\begin{proof}
For any $u \in \R^d$ and $\theta \in \Theta$ we have $u^\top \grad^2 \log Z(\theta) u = u^\top \var_\theta[T(X)] u = \var_\theta[u^\top T(X)]$, so that $\grad^2 \log Z(\theta)$ fails to be positive definite iff $u^\top T(X)$ is constant $P_\theta$-almost surely for some $u \neq 0$, namely iff the model is not minimal.
\end{proof}
Note that we obtain here the fact that $\theta \mapsto \log Z(\theta)$ is strictly convex when the model is minimal, since $\grad^2 \log Z(\theta) \succ 0$ for any $\theta \in \Theta$.
A consequence of this is that the differential of $S(\theta) = \grad \log Z(\theta)$, which is the Hessian matrix $\grad^2 \log Z(\theta)$, is \emph{invertible} for any $\theta \in \Theta$.
The following computation is insightful:
\begin{align*}
h(P_\theta, P_{\theta'}) &= \E_{\theta} \Big[ \log \frac{d P_\theta}{d P_{\theta'}}(X) \Big] \\
&= \E_\theta \Big[ (\theta - \theta')^\top T(X)
- \log \frac{Z(\theta)}{Z(\theta')} \Big] \\
&= \log Z(\theta') - \log Z(\theta) - (\theta' - \theta)^\top \grad \log Z(\theta)
\end{align*}
which means that $h(P_\theta, P_{\theta'})$ is the difference between $\log Z(\theta')$ and its linearization around $\theta$, and is therefore approximately equal to
\begin{equation*}
h(P_\theta, P_{\theta'}) \approx \frac 12 (\theta' - \theta)^\top \grad^2 \log Z(\theta) (\theta' - \theta)
\end{equation*}
for $\theta' \approx \theta$. This makes a connection between the \emph{local curvature} of the model at $\theta$ and its identifiability.
% \todo{Natural gradient, Optimization, Fisher versus likelihood Hessian}
The following proposition relates the injectivity of the map $S(\theta) = \grad \log Z(\theta)$ to the identifiability of the model.
\begin{proposition}
\label{prop:exponential-model-injective-identifiable}
The function $S : \theta \mapsto \grad \log Z(\theta)$ is injective on $\Theta$ if and only if the model is identifiable.
\end{proposition}
\begin{proof}
We have that
\begin{equation*}
h(P_\theta, P_{\theta'}) + h(P_{\theta'}, P_{\theta}) = \inr{ \grad \log Z(\theta) - \grad \log Z(\theta'), \theta - \theta'}
\end{equation*}
which is $\geq 0$ by convexity.
If $\grad \log Z(\theta) = \grad \log Z(\theta')$ with $\theta \neq \theta'$, the right-hand side vanishes, and since both relative entropies are non-negative we get $h(P_\theta, P_{\theta'}) = h(P_{\theta'}, P_\theta) = 0$, so that $P_\theta = P_{\theta'}$ and the model is not identifiable. Conversely, if $P_\theta = P_{\theta'}$ for some $\theta \neq \theta'$, we have $\E_\theta[T(X)] = \E_{\theta'}[T(X)]$ and therefore $\grad \log Z(\theta) = \grad \log Z(\theta')$ using Theorem~\ref{thm:canonical-exponential-model-moments}, so that $S$ is not injective.
\end{proof}
We have proved the following properties of the function $S : \Theta \go \R^d$ given by $S(\theta) = \grad \log Z(\theta)$:
\begin{enumerate}
\item Whenever the model is identifiable, we know that $S$ is injective on $\Theta$ using Proposition~\ref{prop:exponential-model-injective-identifiable};
\item We know that $S$ is $C^\infty$ on $\Theta$ using Theorem~\ref{thm:canonical-exponential-model-moments};
\item The differential of $S$ is invertible on $\Theta$ if the model is minimal using Corollary~\ref{cor:exponential-model-minimal-positive-hessian}.
\end{enumerate}
We can therefore apply the global inversion theorem: $S$ is a diffeomorphism from $\Theta$ onto $S(\Theta)$, the set $S(\Theta)$ is open and the inverse
$S^{-1}$ is also $C^\infty$.
This proves the following theorem.
\begin{theorem}
\label{thm:canonical-minimal-exponential-model-inversion}
In a minimal and canonical exponential model, we have that $S : \Theta \rightarrow S(\Theta)$ given by $S(\theta) = \grad \log Z(\theta)$ is a diffeomorphism, that $S(\Theta)$ is open and that $S^{-1}$ is also $C^\infty$.
\end{theorem}
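In the Bernoulli model in canonical form, for instance, where $\log Z(\theta) = \log(1 + e^{\theta})$ on $\Theta = \R$, the map $S(\theta) = e^{\theta} / (1 + e^{\theta})$ is the sigmoid function, $S(\Theta) = (0, 1)$ is open and $S^{-1}(m) = \log(m / (1 - m))$ is indeed $C^\infty$ on $(0, 1)$.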
\section{Maximum likelihood estimation in an exponential model} % (fold)
\label{sec:maximum_likelihood_estimation_in_an_exponential_model}
Let us consider an iid sample $X_1, \ldots, X_n$ with distribution $P_\theta = f_\theta \cdot \mu$ from a canonical and minimal exponential model.
The log-likelihood reads
\begin{equation}
\label{eq:log-likelihood-exponential-model}
\frac 1n \ell_n(\theta) = \theta^\top \wb T_n
- \log Z(\theta)
\end{equation}
where we introduced $\wb T_n = n^{-1} \sum_{i=1}^n T(X_i)$.
\begin{proposition}
\label{prop:mle-is-moments-in-canonical-exponential}
In a canonical and minimal exponential model, the log-likelihood is strictly concave.
This entails that if the MLE $\wh \theta_n = \argmax_{\theta \in \Theta} \ell_n(\theta)$ exists, it is given by
\begin{equation*}
\wh \theta_n = S^{-1}(\wb T_n),
\end{equation*}
where we recall that $S(\theta) = \grad \log Z(\theta)$.
\end{proposition}
\begin{proof}
This is clear since $\theta \mapsto \theta^\top \wb T_n$ is linear, hence concave, and since $\theta \mapsto \log Z(\theta)$ is strictly convex when the model is minimal (see Theorem~\ref{thm:canonical-exponential-model-moments} and Corollary~\ref{cor:exponential-model-minimal-positive-hessian}), so that $\theta \mapsto -\log Z(\theta)$ is strictly concave.
Since $\Theta$ is open, any maximizer of $\ell_n$ must satisfy the first-order condition $\grad \ell_n(\theta) = 0$, which means that $S(\theta) = \grad \log Z(\theta) = \wb T_n$.
We know using Theorem~\ref{thm:canonical-minimal-exponential-model-inversion} that this equation has at most one solution, so that the MLE, whenever it exists, is given by $\wh \theta_n = S^{-1}(\wb T_n)$.
\end{proof}
Proposition~\ref{prop:mle-is-moments-in-canonical-exponential} proves that, when it exists, the MLE corresponds to the so-called \emph{method of moments estimator}.
\begin{example}
Consider a iid sample $X_1, \ldots, X_n$ with $\gam(a, \lambda)$ distribution and recall that its density can be written as
\begin{equation*}
f_{a, \lambda}(x) = \frac{\lambda^a}{\Gamma(a)} x^{a - 1} e^{-\lambda x}
= \exp(\theta^\top T(x) - \log Z(\theta))
\end{equation*}
for $x > 0$, where $\theta = [a - 1 \; \lambda]^\top$, $T(x) = [\log x \; - x]^\top$ and $Z(\theta) = \Gamma(a) / \lambda^a$.
We have
\begin{equation*}
\grad \log Z(\theta) =
\begin{bmatrix}
\frac{\Gamma'(a)}{\Gamma(a)} - \log \lambda \\
-a / \lambda
\end{bmatrix}
\end{equation*}
so that the MLE, whenever it exists, must solve the system
\begin{align*}
\frac{\Gamma'(a)}{\Gamma(a)} - \log \lambda &= \frac{1}{n} \sum_{i=1}^n \log X_i \\
\frac {a}{\lambda} &= \frac{1}{n} \sum_{i=1}^n X_i.
\end{align*}
Such a solution is not explicit, but can be easily approximated using a convex optimization algorithm.
\end{example}
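To illustrate this last point, here is a minimal numerical sketch (assuming \texttt{numpy} and \texttt{scipy} are available, with simulated data and arbitrary parameter values chosen for illustration) that maximizes the strictly concave log-likelihood~\eqref{eq:log-likelihood-exponential-model} of the Gamma model in its canonical parametrization $\theta = [a - 1 \; \lambda]^\top$:
\begin{verbatim}
import numpy as np
from scipy.special import gammaln, digamma
from scipy.optimize import minimize

# Simulated data: an iid Gamma(a, lambda) sample
# (a_true and lam_true are arbitrary illustration values)
rng = np.random.default_rng(0)
a_true, lam_true = 2.5, 1.3
x = rng.gamma(shape=a_true, scale=1.0 / lam_true, size=1000)

# Coordinates of the empirical sufficient statistic:
# bar T_n = (mean of log X_i, -mean of X_i)
mean_x, mean_logx = x.mean(), np.log(x).mean()

def neg_loglik(theta):
    # -(1/n) ell_n(theta) = log Z(theta) - theta^T bar T_n,
    # with theta = (a - 1, lambda) and log Z = log Gamma(a) - a log lambda
    a, lam = theta[0] + 1.0, theta[1]
    return (gammaln(a) - a * np.log(lam)) - (theta[0] * mean_logx - lam * mean_x)

def neg_loglik_grad(theta):
    # gradient of the above: grad log Z(theta) - bar T_n
    a, lam = theta[0] + 1.0, theta[1]
    return np.array([digamma(a) - np.log(lam) - mean_logx,
                     -a / lam + mean_x])

res = minimize(neg_loglik, x0=np.array([0.0, 1.0]), jac=neg_loglik_grad,
               bounds=[(-1.0 + 1e-8, None), (1e-8, None)])
a_hat, lam_hat = res.x[0] + 1.0, res.x[1]
print(f"MLE: a = {a_hat:.3f}, lambda = {lam_hat:.3f}")
\end{verbatim}
Since $-\ell_n / n$ is convex and smooth on the open set $\Theta = (-1, +\infty) \times (0, +\infty)$ (hence the bounds above), such a gradient-based solver converges to the MLE whenever it exists.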
In summary, what we learned so far about the MLE in a minimal and canonical exponential model is the following:
\begin{itemize}
\item When the MLE exists, we can express it as $\wh \theta_n = S^{-1}(\wb T_n)$, although in general we cannot invert $S$ explicitly, as observed in the previous example;
\item When the MLE exists, we can compute it approximately using a convex optimization algorithm, since the log-likelihood $\ell_n(\theta)$ is strictly concave and smooth.
\end{itemize}
Note that the MLE does not always exist, as shown in the next simple example.
\begin{remark}
Consider $X_1, \ldots, X_n$ iid with geometric distribution, namely a density $f_p(x) = (1 - p)^{x-1} p$ with respect to the counting measure, for $x \in \N \setminus \{ 0 \}$ and $p \in (0, 1)$. We can write it as an exponential model since
\begin{equation*}
f_p(x) = \exp\big( (x - 1) \log(1 - p) + \log p \big),
\end{equation*}
so that the sufficient statistic is $T(x) = 1 - x$ and the canonical parameter is $\theta = - \log(1 - p)$ namely $p = 1 - e^{-\theta}$, so that in canonical form, this exponential model writes
\begin{equation*}
f_\theta(x) = \exp \big( (1 - x) \theta + \log(1 - e^{-\theta}) \big),
\end{equation*}
and the log-likelihood is given by
\begin{equation*}
\ell_n(\theta) = \Big(n - \sum_{i=1}^n X_i \Big) \theta + n \log(1 - e^{-\theta}).
\end{equation*}
On the event $E = \{ X_1 = \cdots = X_n = 1 \}$, which has probability $\P_\theta[E] = (1 - e^{-\theta})^n > 0$ (although it goes to zero quickly with $n$), the MLE does not exist, since on $E$ we have $\ell_n(\theta) = n \log(1 - e^{-\theta})$, which is strictly increasing in $\theta$, so that its supremum over $\Theta$ is not attained.
\end{remark}
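To connect this remark with Proposition~\ref{prop:mle-is-moments-in-canonical-exponential}, note that here $\log Z(\theta) = -\log(1 - e^{-\theta})$ on $\Theta = (0, +\infty)$, so that $S(\theta) = \grad \log Z(\theta) = -1 / (e^{\theta} - 1)$ and $S(\Theta) = (-\infty, 0)$, while $\wb T_n = 1 - \wb X_n$ with $\wb X_n = n^{-1} \sum_{i=1}^n X_i \geq 1$. The equation $S(\theta) = \wb T_n$ therefore has a solution iff $\wb X_n > 1$, in which case
\begin{equation*}
\wh \theta_n = S^{-1}(1 - \wb X_n) = \log \frac{\wb X_n}{\wb X_n - 1},
\quad \text{namely} \quad
\wh p_n = 1 - e^{-\wh \theta_n} = \frac{1}{\wb X_n},
\end{equation*}
while the event $E$ considered above is exactly the event where $\wb X_n = 1$, namely where $\wb T_n \notin S(\Theta)$.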
\begin{definition}[Score and Fisher information]
\label{def:score-and-fisher}
In a minimal and canonical exponential model, the function $\theta \mapsto \grad \ell_n(\theta)$ is called the \emph{score} function and the matrix
\begin{equation*}
I_n(\theta) = \var_\theta[ \grad \ell_n(\theta) ],
\end{equation*}
which is the covariance matrix of the score, is called the \emph{Fisher information}.
\end{definition}
This definition goes way beyond the particular case of the exponential models considered here.
Let us give some properties of the score and the Fisher information.
First of all, let us remark that the score is a \emph{centered} random vector, since
\begin{equation*}
\marginnote{Using Theorem~\ref{thm:canonical-exponential-model-moments}}
\E_\theta [\grad \ell_n(\theta)] = n(\E_\theta[T(X_1)] - \grad \log Z(\theta)) = 0
\end{equation*}
and let us note that the Fisher information satisfies
\begin{equation*}
\marginnote{Using~\eqref{eq:log-likelihood-exponential-model} and the definition of Fisher information with $n=1$}
I_n(\theta) = \var_\theta[\grad \ell_n(\theta)] = n \var_\theta[T(X_1)] = n I_1(\theta)
\end{equation*}
and also that it satisfies
\begin{equation*}
\marginnote{Using Theorem~\ref{thm:canonical-exponential-model-moments}}
I_n(\theta) = n \var_\theta[T(X_1)] = n \grad^2 \log Z(\theta).
\end{equation*}
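For instance, in the Gamma model of the examples above, where $\theta = [a - 1 \; \lambda]^\top$ and $\log Z(\theta) = \log \Gamma(a) - a \log \lambda$, we obtain
\begin{equation*}
I_1(\theta) = \grad^2 \log Z(\theta) =
\begin{bmatrix}
\psi'(a) & -1 / \lambda \\
-1 / \lambda & a / \lambda^2
\end{bmatrix},
\end{equation*}
where $\psi(a) = \Gamma'(a) / \Gamma(a)$ denotes the digamma function.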
Note that since $T(X_1), \ldots, T(X_n)$ are iid and such that $\E_\theta[T(X_1)] = \grad \log Z(\theta) = S(\theta)$ and $\var_\theta[T(X_1)] = I_1(\theta)$, we can use the multivariate central limit theorem to obtain
\begin{equation*}
\marginnote{This is a convergence in $P_\theta$-distribution}
\sqrt n (\wb T_n - S(\theta)) \gosto \nor\big(0, I_1(\theta)\big),
\end{equation*}
but since $\wh \theta_n = S^{-1} (\wb T_n)$, we want to use the multivariate $\Delta$-method (the scalar case was covered in Theorem~\ref{thm:delta-method} from Chapter~\ref{chap:statistical_inference}) with $\varphi(t) = S^{-1}(t)$.
The multivariate $\Delta$-method is given by the following theorem.
\begin{theorem}[Multivariate $\Delta$-method]
\label{thm:multivariate-delta-method}
Let $(a_n)_{n \geq 1}$ be a sequence of positive numbers such that $a_n \rightarrow +\infty$, let $(X_n)_{n \geq 1}$ be a sequence of random vectors $X_n \in \R^d$ and let $\varphi : \R^d \go \R^m$ be a measurable function. If $a_n(X_n - x) \gosto X$ for some $x \in \R^d$ and some random vector $X$, and if
$\varphi$ is differentiable at $x$, then we have
\begin{equation*}
a_n (\varphi(X_n) - \varphi(x)) \gosto J_\varphi(x) X,
\end{equation*}
where $J_\varphi(x)$ is the Jacobian matrix of $\varphi$ at $x$.
\end{theorem}
The proof of Theorem~\ref{thm:multivariate-delta-method} is omitted since it follows the exact same proof as that of Theorem~\ref{thm:delta-method}.
We apply%
\sidenote{Thanks to Theorem~\ref{thm:canonical-minimal-exponential-model-inversion}}
Theorem~\ref{thm:multivariate-delta-method} with $\varphi = S^{-1}$,
so that $J_\varphi(S(\theta)) = (\grad^2 \log Z(\theta))^{-1} = I_1(\theta)^{-1}$ and we end up with
\begin{equation*}
\marginnote{This is a convergence in $P_\theta$-distribution}
\sqrt n (\wh \theta_n - \theta) \gosto \nor\big( 0, I_1(\theta)^{-1} \big)
\end{equation*}
since $\var[I_1(\theta)^{-1} Z] = I_1(\theta)^{-1} \var[ Z] I_1(\theta)^{-1} = I_1(\theta)^{-1}$ whenever $Z \sim \nor( 0, I_1(\theta))$.
This proves the following theorem.
\begin{theorem}[Central limit theorem for the MLE]
In a statistical experiment where the model is a minimal and canonical exponential model and where the MLE $\wh \theta_n$ exists, we have
\begin{equation*}
\sqrt n (\wh \theta_n - \theta) \gosto \nor\big( 0, I_1(\theta)^{-1} \big),
\end{equation*}
which is a convergence in $P_\theta$-distribution, where $I_1(\theta)$ is the Fisher information matrix given in Definition~\ref{def:score-and-fisher}.
\end{theorem}
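For instance, consider the Poisson model written in canonical form: take $\cX = \N$, let $\mu$ be the measure on $\N$ with $\mu(\{x\}) = 1 / x!$ and let $T(x) = x$, so that $Z(\theta) = \sum_{x \geq 0} e^{\theta x} / x! = e^{e^{\theta}}$, $\Theta = \R$ and $f_\theta(x) = \exp(\theta x - e^{\theta})$, which is the Poisson distribution with mean $\lambda = e^{\theta}$. Here $S(\theta) = I_1(\theta) = e^{\theta} = \lambda$, so that the MLE is $\wh \theta_n = \log \wb X_n$ (it exists whenever $\wb X_n > 0$) and the theorem gives
\begin{equation*}
\sqrt n (\wh \theta_n - \theta) \gosto \nor\big(0, e^{-\theta}\big) = \nor\big(0, 1 / \lambda \big),
\end{equation*}
which, combined again with the $\Delta$-method applied to $\varphi(\theta) = e^{\theta}$, gives back $\sqrt n (\wb X_n - \lambda) \gosto \nor(0, \lambda)$, namely the usual central limit theorem for the empirical mean of a Poisson sample.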
This theorem proves that the MLE is asymptotically normal and that its ``asymptotic variance'' is equal to the inverse of the Fisher information.
In this sense, the Fisher information quantifies the asymptotic performance of the MLE.
Moreover, we can prove that the inverse of the Fisher information matrix is the \emph{smallest achievable} asymptotic variance among all asymptotically normal estimators%
\sidenote{An estimator $\wt \theta_n$ is asymptotically normal if it satisfies $\sqrt n (\wt \theta_n - \theta) \gosto \nor(0, \bV)$ for all $\theta \in \Theta$ with a non-degenerate matrix $\bV$.}
and that the MLE is \emph{efficient}, since it is asymptotically normal with minimal asymptotic variance (given by the inverse Fisher information matrix).
However, we won't go further in this direction, since such results are somewhat ``stylized'': they hold only asymptotically, for arbitrarily large $n$, and only for \emph{well-specified} models, namely whenever the \emph{true distribution} $P_X$ of the data actually belongs to the model, that is $P_X = P_{\theta^\star}$ for some $\theta^\star \in \Theta$.
If the model is misspecified, namely $P_X \notin \{ P_\theta : \theta \in \Theta \}$, the performance of the MLE can quickly deteriorate.