From cb1ff6da604e86fcd5b3c1331036e2a2afb09534 Mon Sep 17 00:00:00 2001
From: adamingas
Date: Tue, 16 Jan 2024 23:16:36 +0000
Subject: [PATCH] fix: rename math markdown file and create new rst math file
 which supports latex maths. Add mathjax support

---
 docs/conf.py                    |   5 +-
 docs/index.md                   |   2 +-
 docs/maths.rst                  | 252 ++++++++++++++++++++++++++++++++
 docs/{maths.md => maths_old.md} |   0
 4 files changed, 256 insertions(+), 3 deletions(-)
 create mode 100644 docs/maths.rst
 rename docs/{maths.md => maths_old.md} (100%)

diff --git a/docs/conf.py b/docs/conf.py
index d93c547..ab32491 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,13 +18,14 @@
     "autoapi.extension",
     "sphinx.ext.napoleon",
     "sphinx.ext.viewcode",
-    "sphinx_rtd_theme"
+    "sphinx_rtd_theme",
+    "sphinx.ext.mathjax"
 ]
 autoapi_dirs = ["../ordinalgbt"] # location to parse for API reference
 html_theme = "sphinx_rtd_theme"
 exclude_patterns = []
 nb_execution_mode = "off"
-
+mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"

 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

diff --git a/docs/index.md b/docs/index.md
index cf0cd54..cf14380 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,6 +7,6 @@

 comparison_with_classifiers.ipynb
 shap.ipynb
-maths.md
+maths
 autoapi/index
 ```
\ No newline at end of file

diff --git a/docs/maths.rst b/docs/maths.rst
new file mode 100644
index 0000000..71cafd9
--- /dev/null
+++ b/docs/maths.rst
@@ -0,0 +1,252 @@

Loss
====

Maths
-----

Immediate Thresholds
~~~~~~~~~~~~~~~~~~~~

As is usual in ordinal regression model formulations, we build a
regressor that learns a latent variable :math:`y` and then use a set of
thresholds :math:`\Theta` to produce the probability estimates for each
label. The set of thresholds is ordered, does not include infinities,
and has as many members as the number of labels minus one.

We want a way to map the latent variable :math:`y` to the probability
space such that when :math:`y` is in :math:`(\theta_{k-1},\theta_{k})`
the probability of label :math:`k` is maximised.

In a problem with three ordered labels we only need two thresholds,
:math:`\theta_1` and :math:`\theta_2`, to define the three regions
associated with the labels:
:math:`(-\infty,\theta_1], (\theta_1, \theta_2], (\theta_2, \infty)`.

Deriving probabilities
~~~~~~~~~~~~~~~~~~~~~~

A property we want our mapping from latent variable to probability to
have is that the cumulative probability of the label :math:`z` being at
most :math:`k` increases as the label increases. This means that
:math:`P(z\leq k;y,\Theta)` should increase as :math:`k` increases
(i.e. as we consider more labels).

Another property is that as the latent variable :math:`y` gets smaller,
the cumulative probability should increase, and as it gets larger it
should decrease.

To satisfy these properties we use a function :math:`F` which grows as
its argument grows and shrinks as its argument shrinks. We can then
define the cumulative probability as:

.. math::

   P(z \leq k; y,\Theta ) = F(\theta_k - y) ,

making sure that the range of :math:`F` is constrained to the
:math:`(0,1)` interval. This formulation satisfies all of our
properties. As we consider larger (higher in the order) labels
:math:`k`, the threshold :math:`\theta_k` grows and so does the
cumulative probability. As :math:`y` grows, the input to :math:`F`
shrinks and so does the cumulative probability.

Naturally, the probability of :math:`z` being any particular label is
then:

.. math::

   \begin{align*}
   P(z = k; y,\Theta ) &= P(z \leq k; y,\Theta ) - P(z \leq k-1; y,\Theta ) \\
   &= F(\theta_k - y) - F(\theta_{k-1} - y)
   \end{align*}

A function that satisfies all these conditions is the sigmoid function,
hereafter denoted as :math:`\sigma`.
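As a rough sketch of this mapping (the names ``sigmoid`` and
``label_probabilities`` are invented for illustration and are not
necessarily how the package implements it), the label probabilities can
be computed from the latent predictions and the thresholds as follows:

.. code-block:: python

   import numpy as np

   def sigmoid(x):
       return 1 / (1 + np.exp(-x))

   def label_probabilities(y, thresholds):
       # y: latent predictions, shape (n_samples,)
       # thresholds: increasing thresholds, shape (n_labels - 1,)
       # Pad the thresholds with -inf and +inf so the cumulative
       # probability is 0 below the first label and 1 at the last one.
       theta = np.concatenate(([-np.inf], thresholds, [np.inf]))
       # P(z <= k; y) = sigma(theta_k - y), including the padded
       # 0 and 1 columns
       cumulative = sigmoid(theta[None, :] - y[:, None])
       # P(z = k; y) = P(z <= k; y) - P(z <= k - 1; y)
       return np.diff(cumulative, axis=1)

Each row of the returned array sums to one, since the padded cumulative
probabilities start at zero and end at one.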
Deriving the loss function
~~~~~~~~~~~~~~~~~~~~~~~~~~

Given :math:`n` samples, the likelihood of our set of predictions
:math:`\bf y` is:

.. math::

   L({\bf y} ;\Theta) = \prod_{i =0}^n I(z_i=k)P(z_i = k; y_i,\Theta) ,

where :math:`k` is understood to be the label observed for sample
:math:`i` and :math:`I` is the indicator function.

As is usual in machine learning, we use the negative log-likelihood as
our loss:

.. math::

   \begin{align*}
   l({\bf y};\Theta) &= -\log L({\bf y};\Theta)\\
   &= -\sum_{i=0}^n I(z_i=k)\log(P(z_i = k; y_i,\Theta)) \\
   &= -\sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right)
   \end{align*}

Deriving the gradient and hessian
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To use a custom loss function with gradient-boosted tree frameworks
(e.g. lightgbm), we first have to derive the gradient and hessian of
the loss with respect to **the raw predictions**, in our case the
latent variable :math:`y_i`.

We denote the first and second order derivatives of the sigmoid as
:math:`\sigma'` and :math:`\sigma''` respectively.

The gradient is:

.. math::

   \begin{align*}
   \mathcal{G}&=\frac{\partial l({\bf y};\Theta)}{\partial {\bf y}} \\
   &= -\frac{\partial }{\partial {\bf y}} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\
   &=
   \begin{pmatrix}
   -\frac{\partial }{\partial y_1} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\
   \vdots \\
   -\frac{\partial }{\partial y_n} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\
   \end{pmatrix} \\
   &=
   \begin{pmatrix}
   I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\
   \vdots \\
   I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) \\
   \end{pmatrix}
   \end{align*}

The summation disappears when calculating the derivative with respect
to :math:`y_i`, because every element of the summation depends on only
one latent variable:

.. math::

   \begin{align*}
   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial {\bf y}} &=
   \begin{pmatrix}
   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_1} \\
   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_2} \\
   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_3} \\
   \end{pmatrix} \\
   &=
   \begin{pmatrix}
   \frac{\partial f(y_1)}{\partial y_1} \\
   \frac{\partial f(y_2)}{\partial y_2} \\
   \frac{\partial f(y_3)}{\partial y_3} \\
   \end{pmatrix}
   \end{align*}

The hessian is the partial derivative of the gradient with respect to
the latent variable vector. This means that for each element of the
gradient vector we calculate the partial derivative w.r.t. the whole
latent variable vector. Thus the hessian is a matrix of partial
derivatives:

.. math::

   \begin{pmatrix}
   \frac{\partial^2}{\partial y_1 \partial y_1} & \cdots &
   \frac{\partial^2}{\partial y_1 \partial y_n} \\
   \vdots & \ddots & \vdots \\
   \frac{\partial^2}{\partial y_n \partial y_1} & \cdots &
   \frac{\partial^2}{\partial y_n \partial y_n}
   \end{pmatrix}l({\bf y};\Theta)

However, since we know that the partial derivative of the loss w.r.t.
the latent variable :math:`y_i` depends only on the :math:`i^{th}`
element of the :math:`y` vector, the off-diagonal elements of the
hessian matrix are zero:

.. math::

   \frac{\partial^2}{\partial y_i \partial y_j} l({\bf y};\Theta) = 0 \text{ if } i \neq j

The hessian is then reduced to a vector:

.. math::

   \begin{align*}
   \mathcal{H} &=
   \begin{pmatrix}
   \frac{\partial^2}{\partial y_1 \partial y_1} \\
   \vdots \\
   \frac{\partial^2}{\partial y_n \partial y_n}
   \end{pmatrix}l({\bf y};\Theta) \\
   &=
   \begin{pmatrix}
   \frac{\partial}{\partial y_1 }I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\
   \vdots \\
   \frac{\partial}{\partial y_n }
   I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right)
   \end{pmatrix}\\
   &=
   \begin{pmatrix}
   -I(z_1 = k) \left( \frac{\sigma''(\theta_k-y_1) - \sigma''(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right)
   + I(z_1 = k)\left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right)^2 \\
   \vdots \\
   -I(z_n = k) \left( \frac{\sigma''(\theta_k-y_n) - \sigma''(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right)
   + I(z_n = k)\left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right)^2 \\
   \end{pmatrix}
   \end{align*}
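A minimal sketch of these formulas, again with invented names and using
the sigmoid derivatives listed in the Miscellaneous section below (this
is not the package's actual objective code), could look like:

.. code-block:: python

   import numpy as np

   def sigmoid(x):
       return 1 / (1 + np.exp(-x))

   def sigmoid_prime(x):
       s = sigmoid(x)
       return s * (1 - s)

   def sigmoid_double_prime(x):
       s = sigmoid(x)
       return s * (1 - s) * (1 - 2 * s)

   def ordinal_gradient_hessian(y, labels, thresholds):
       # y: latent predictions, shape (n_samples,)
       # labels: observed labels in {0, ..., n_labels - 1}, integer array
       # thresholds: increasing thresholds, shape (n_labels - 1,)
       theta = np.concatenate(([-np.inf], thresholds, [np.inf]))
       upper = theta[labels + 1] - y  # theta_k - y_i for each sample's label k
       lower = theta[labels] - y      # theta_{k-1} - y_i
       denom = sigmoid(upper) - sigmoid(lower)
       num = sigmoid_prime(upper) - sigmoid_prime(lower)
       # gradient of the negative log-likelihood w.r.t. y_i
       grad = num / denom
       # diagonal of the hessian, as derived above
       hess = (-(sigmoid_double_prime(upper) - sigmoid_double_prime(lower)) / denom
               + (num / denom) ** 2)
       return grad, hess

A custom objective for a gradient-boosting framework such as lightgbm
returns exactly such a pair of arrays, one gradient and one hessian
entry per sample, evaluated at the raw predictions of each boosting
round.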
Miscellaneous
~~~~~~~~~~~~~

The first derivative of the sigmoid function is:

.. math::

   \sigma'(x) = \sigma(x)(1-\sigma(x))

and the second derivative is:

.. math::

   \begin{align*}
   \sigma''(x) &= \frac{d}{dx}\sigma(x)(1-\sigma(x)) \\
   &= \sigma'(x)(1-\sigma(x)) - \sigma'(x)\sigma(x)\\
   &= \sigma(x)(1-\sigma(x))(1-\sigma(x)) -\sigma(x)(1-\sigma(x))\sigma(x) \\
   &= (1-\sigma(x))\left(\sigma(x)-2\sigma(x)^2\right) \\
   &= \sigma(x)\left(1-\sigma(x)\right)\left(1-2\sigma(x)\right)
   \end{align*}

Code
----

Coming soon.

diff --git a/docs/maths.md b/docs/maths_old.md
similarity index 100%
rename from docs/maths.md
rename to docs/maths_old.md