From dcb454295dc535b9f3a59658ce3821eeff7c478e Mon Sep 17 00:00:00 2001
From: adamingas
Date: Fri, 12 Jan 2024 19:46:48 +0000
Subject: [PATCH 1/7] docs: Add derivation of loss, gradient, and hessian

---
 docs/maths.md | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 docs/maths.md

diff --git a/docs/maths.md b/docs/maths.md
new file mode 100644
index 0000000..d3c081b
--- /dev/null
+++ b/docs/maths.md
@@ -0,0 +1,80 @@
+# Maths
+## Immediate Thresholds
+
+As is usual in ordinal regression model formulations, we build a regressor that learns a latent variable $y$, and then use a set of thresholds $\Theta$ to produce the probability estimates for each label. The set of thresholds is ordered, does not include infinities, and has as many members as the number of labels minus one.
+
+We want to come up with a way to map the latent variable $y$ to the probability space such that when $y$ is in $(\theta_{k-1},\theta_{k})$ the probability of label $k$ is maximised.
+
+In a problem with three ordered labels, we only need two thresholds, $\theta_1$ and $\theta_2$, to define the three regions associated with each label: $(-\infty,\theta_1], (\theta_1, \theta_2], (\theta_2, \infty)$.
+
+## Deriving probabilities
+
+A property we want our mapping from latent variable to probability to have is for the cumulative probability of label $z$ being at most label $k$ to increase as the label increases. This means that $P(z\leq k;y,\Theta)$ should increase as $k$ increases (i.e. as we consider more labels).
+
+Another property is that as the latent variable $y$ gets smaller, the cumulative probability should also increase, and as it gets larger it should decrease.
+
+To satisfy these properties we use a function $F$ which grows as its argument grows and shrinks as its argument shrinks. We can then define the cumulative probability as:
+$$
+P(z \leq k; y,\Theta ) = F(\theta_k - y) ,
+$$
+
+making sure that the range of $F$ is constrained to the $(0,1)$ interval. This formulation satisfies all of our properties. As we consider larger (higher in the order) labels $k$, the threshold $\theta_k$ grows and so does the cumulative probability. As $y$ grows, the input to $F$ shrinks and so does the cumulative probability.
+
+Naturally, the probability of $z$ being any particular label is then:
+$$
+\newcommand{\problessthank}{P(z \leq k; y,\Theta )}
+% \newcommand{\bbeta}{\mathbf{b}}
+% \newcommand{\btheta}{\mathbf{\theta}}
+\begin{align*}
+P(z = k; y,\Theta ) &=P(z \leq k; y,\Theta) -P(z \leq k-1; y,\Theta ) \hspace{2mm} \\
+&= F(\theta_k - y) - F(\theta_{k-1} - y)
+\end{align*}
+$$
+
+
+A function that satisfies all these conditions is the sigmoid function, hereafter denoted as $\sigma$.
+## Deriving the loss function
+
+Given n samples, the likelihood of our set of predictions $y_i$ is:
+$$
+L(Y;\Theta) = \prod_{i =0}^n I(z_i=k)P(z_i = k; y_i,\Theta)
+$$
+
+As is usual in machine learning we use the negative log likelihood as our loss:
+
+$$
+\begin{align}
+l(Y;\Theta) &= -\log L(Y,\theta)\\
+&= -\sum_{i=0}^n I(z_i=k)\log(P(z_i = k; y_i,\Theta)) \\
+&= -\sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y) - \sigma(\theta_{k-1} - y)\right)
+\end{align}
+$$
+## Deriving the gradient and hessian
+
+To use a custom loss function with gradient boosting tree frameworks (e.g. lightgbm), we have to first derive the gradient and hessian of the loss with respect to **the raw predictions**, in our case the latent variable $y_i$.
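+
+Before doing so, the loss derived above can be written down directly as a short NumPy/SciPy sketch (illustrative code for this page only, with made-up names; it is not the implementation shipped in this package):
+
+```python
+import numpy as np
+from scipy.special import expit  # numerically stable sigmoid
+
+def ordinal_nll(y_latent, labels, thresholds):
+    """Negative log-likelihood of ordinal labels given raw (latent) predictions."""
+    labels = np.asarray(labels)
+    # Pad the thresholds so that sigma(theta_0 - y) = 0 and sigma(theta_K - y) = 1
+    theta = np.concatenate(([-np.inf], np.asarray(thresholds, dtype=float), [np.inf]))
+    upper = expit(theta[labels + 1] - np.asarray(y_latent))  # sigma(theta_k - y_i)
+    lower = expit(theta[labels] - np.asarray(y_latent))      # sigma(theta_{k-1} - y_i)
+    return -np.sum(np.log(upper - lower))
+
+# e.g. three labels (0, 1, 2) and two thresholds
+print(ordinal_nll(y_latent=[-1.5, 0.2, 3.0], labels=[0, 1, 2], thresholds=[-1.0, 1.0]))
+```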
+ + + + +$$ +\begin{align*} +\log L(\bbeta) &= l(\bbeta;\btheta) = \sum_{i=1}^n I(y_i=k) \log \big[ \sigma(\theta_k - \eta_i) - \sigma(\theta_{k-1} - \eta_i) \big] \\ +\eta_i &= \bx_i^T \bbeta \\ +\frac{\partial l(\bbeta;\btheta)}{\partial \bbeta} &= \nabla_\bbeta = -\sum_{i=1}^n \bx_i I(y_i = k) \Bigg( \frac{\sigma'(\theta_k-\eta_i) + \sigma'(\theta_{k-1}-\eta_i)}{d_{ik}} \Bigg) \\ +d_{ik} &= \sigma(\theta_k-\eta_i) - \sigma(\theta_{k-1}-\eta_i) \\ +\frac{\partial l(\bbeta;\btheta)}{\partial \btheta} &= \nabla_\btheta = \sum_{i=1}^n \Bigg( I(y_i = k) \frac{\sigma'(\theta_k-\eta_i)}{d_{ik}} - I(y_i = k+1) \frac{\sigma'(\theta_k-\eta_i)}{d_{ik+1}} \Bigg) +\end{align*} +$$ + + +$$ +\begin{align*} +\tilde y &= \arg\max_k [P(y=k|\bbeta;\btheta;\tilde\bx)] \\ +P(y=k|\bbeta;\btheta;\tilde\bx) &= \begin{cases} +1 - \sigma(\theta_{K-1}-\tilde\eta) & \text{ if } k=K \\ +\sigma(\theta_{K-1}-\tilde\eta) - \sigma(\theta_{K-2}-\tilde\eta) & \text{ if } k=K-1 \\ +\vdots & \vdots \\ +\sigma'(\theta_{1}-\tilde\eta) - 0 & \text{ if } k=1 +\end{cases} +\end{align*} +$$ \ No newline at end of file From 6dfe99ac9ef902cd823cc23c391666929d9ffff7 Mon Sep 17 00:00:00 2001 From: adamingas Date: Fri, 12 Jan 2024 20:51:37 +0000 Subject: [PATCH 2/7] docs: Add gradient derivation and start hesian derivation --- docs/maths.md | 89 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/docs/maths.md b/docs/maths.md index d3c081b..48764d4 100644 --- a/docs/maths.md +++ b/docs/maths.md @@ -35,18 +35,18 @@ $$ A function that satisfies all these conditions is the sigmoid function, hereafter denoted as $\sigma$. ## Deriving the loss function -Given n samples, the likelihood of our set of predictions $y_i$ is: +Given n samples, the likelihood of our set of predictions $\bf y$ is: $$ -L(Y;\Theta) = \prod_{i =0}^n I(z_i=k)P(z_i = k; y_i,\Theta) +L({\bf y} ;\Theta) = \prod_{i =0}^n I(z_i=k)P(z_i = k; y_i,\Theta) $$ As is usual in machine learning we use the negative log likelihhod as our loss: $$ \begin{align} -l(Y;\Theta) &= -\log L(Y,\theta)\\ +l({\bf y};\Theta) &= -\log L({\bf y},\theta)\\ &= -\sum_{i=0}^n I(z_i=k)\log(P(z_i = k; y_i,\Theta)) \\ -&= -\sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y) - \sigma(\theta_{k-1} - y)\right) +&= -\sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \end{align} $$ ## Deriving the gradient and hessian @@ -54,6 +54,87 @@ $$ To use a custom loss function with gradient boosting tree frameworks (i.e. lightgbm), we have to first derive the gradient and hessian of the loss with respect to **the raw predictions**, in our case the latent variable $y_i$. +We denote the first and second order derivative of the sigmoid as $\sigma'$ and $\sigma''$ respectively. + +The gradient is denoted as :TODO: + +$$ +\begin{align*} +\mathcal{G}&=\frac{\partial l({\bf y};\Theta)}{\partial {\bf y}} \\ +&= -\frac{\partial }{\partial {\bf y}} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ +&= +\begin{pmatrix} + -\frac{\partial }{\partial y_1} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ + ... \\ + -\frac{\partial }{\partial y_n} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ +\end{pmatrix} \\ +&= +\begin{pmatrix} + I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\ + ... 
\\ + + I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) \\ +\end{pmatrix} +\end{align*} +$$ + +The summmation is gone when calculating the derivative for variable $y_i$ as every element of the summation depends only on one latent variable: + +$$ +\begin{align*} +\frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial {\bf y}} &= + \begin{pmatrix} + \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_1} \\ + \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_2} \\ + \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_3} \\ + \end{pmatrix} \\ +&= + + \begin{pmatrix} + \frac{\partial f(y_1)}{\partial y_1} \\ + \frac{\partial f(y_2)}{\partial y_2} \\ + \frac{\partial f(y_3)}{\partial y_3} \\ + \end{pmatrix} +\end{align*} +$$ + + +The hessian is the partial derivative of the gradient with respect to the latent variable vector. This means that for each element of the gradient vector we calculate the partial derivative w.r.t. the whole latent variable vector. Thus the hessian is a matrix of partial derivatives: + +$$ +\begin{pmatrix} +\frac{\partial}{\partial y_1 y_1} & ... & +\frac{\partial}{\partial y_1 y_n} \\ +.&&.\\.&&.\\.&&.\\ +\frac{\partial}{\partial y_n y_1} & ... & +\frac{\partial}{\partial y_n y_n} +\end{pmatrix}l({\bf y};\Theta) +$$ + +However, since we know that the partial derivative of the loss w.r.t. the latent variable $y_i$ depends only on the $i^{th}$ element of the $y$ vector, the off diagonal elements of the hessian matrix are reduced to zero: +$$ +\frac{\partial}{\partial y_i y_j} l({\bf y};\Theta) = 0 \text{ if } i \neq j +$$ + +The hessian is then reduced to a vetor: + +$$ +\begin{align*} +\mathcal{H} &= + \begin{pmatrix} + \frac{\partial}{\partial y_1 y_1} \\ + ... \\ + \frac{\partial}{\partial y_n y_n} + \end{pmatrix}l({\bf y};\Theta) \\ + &= + \begin{pmatrix} + \frac{\partial}{\partial y_1 y_1}(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\ + ... \\.. \\ + \frac{\partial}{\partial y_n y_n} + (z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) \\ + \end{pmatrix} +\end{align*} +$$ $$ From aa82e838be5401ebb7a1e0a0efd2c417a19fe753 Mon Sep 17 00:00:00 2001 From: adamingas Date: Fri, 12 Jan 2024 21:53:49 +0000 Subject: [PATCH 3/7] docs: Fiinish maths section, left to do code explanation --- docs/index.md | 1 + docs/maths.md | 75 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 53 insertions(+), 23 deletions(-) diff --git a/docs/index.md b/docs/index.md index 03eb3f5..cf0cd54 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,5 +7,6 @@ comparison_with_classifiers.ipynb shap.ipynb +maths.md autoapi/index ``` \ No newline at end of file diff --git a/docs/maths.md b/docs/maths.md index 48764d4..3bc40a7 100644 --- a/docs/maths.md +++ b/docs/maths.md @@ -1,5 +1,6 @@ -# Maths -## Immediate Thresholds +# Loss +## Maths +### Immediate Thresholds As is usual in ordinal regression model formulations, we build a regressor that learns a latent variable $y$, and then use a set of thresholds $\Theta$ to produce the probability estimates for each label. The set of thresholds is ordered, does not include infinities, and has as many members as the numbers of labels minus one. 
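+
+As a small illustration of this thresholding idea (example code for this page only, not the package's prediction logic), two ordered thresholds split the latent axis into the three labelled regions discussed below, one per label:
+
+```python
+import numpy as np
+
+# Two ordered thresholds define three regions, one per label:
+# (-inf, -1], (-1, 1], (1, inf)
+thresholds = np.array([-1.0, 1.0])
+y_latent = np.array([-2.3, 0.4, 5.1])
+
+# np.searchsorted gives the index of the region each latent value falls into
+labels = np.searchsorted(thresholds, y_latent)
+print(labels)  # [0 1 2]
+```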
@@ -7,7 +8,7 @@ We want to come up with a a way to map the latent variable $y$ to the probabilit In a three ordered labeled problem, we only need two thresholds, $\theta_1$ and $\theta_2$, to define the three regions which are associated to each label $(-\infty,\theta_1], (\theta_1, \theta_2], (\theta_2, \infty)$. -## Deriving probabilities +### Deriving probabilities A property we want our mapping from latent variable to probability to have is for the cummulative probability of label $z$ being at most label $k$ to increase as the label increases. This means that $P(z\leq k;y,\Theta)$ should increase as $k$ increases (i.e. as we consider more labels). @@ -33,7 +34,7 @@ $$ A function that satisfies all these conditions is the sigmoid function, hereafter denoted as $\sigma$. -## Deriving the loss function +### Deriving the loss function Given n samples, the likelihood of our set of predictions $\bf y$ is: $$ @@ -43,13 +44,13 @@ $$ As is usual in machine learning we use the negative log likelihhod as our loss: $$ -\begin{align} +\begin{align*} l({\bf y};\Theta) &= -\log L({\bf y},\theta)\\ &= -\sum_{i=0}^n I(z_i=k)\log(P(z_i = k; y_i,\Theta)) \\ &= -\sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) -\end{align} +\end{align*} $$ -## Deriving the gradient and hessian +### Deriving the gradient and hessian To use a custom loss function with gradient boosting tree frameworks (i.e. lightgbm), we have to first derive the gradient and hessian of the loss with respect to **the raw predictions**, in our case the latent variable $y_i$. @@ -62,19 +63,19 @@ $$ \begin{align*} \mathcal{G}&=\frac{\partial l({\bf y};\Theta)}{\partial {\bf y}} \\ &= -\frac{\partial }{\partial {\bf y}} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ -&= -\begin{pmatrix} - -\frac{\partial }{\partial y_1} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ + &= + \begin{pmatrix} + -\frac{\partial }{\partial y_1} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ + ... \\ + -\frac{\partial }{\partial y_n} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ + \end{pmatrix} \\ + &= + \begin{pmatrix} + I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\ ... \\ - -\frac{\partial }{\partial y_n} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\ -\end{pmatrix} \\ -&= -\begin{pmatrix} - I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\ - ... \\ - I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) \\ -\end{pmatrix} + I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) \\ + \end{pmatrix} \end{align*} $$ @@ -128,14 +129,40 @@ $$ \end{pmatrix}l({\bf y};\Theta) \\ &= \begin{pmatrix} - \frac{\partial}{\partial y_1 y_1}(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\ - ... \\.. 
\\ - \frac{\partial}{\partial y_n y_n} - (z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) \\ + \frac{\partial}{\partial y_1 }I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\ + ... \\ + \frac{\partial}{\partial y_n } + I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) + \end{pmatrix}\\ + &= + \begin{pmatrix} + -I(z_i = k) \left( \frac{\sigma''(\theta_k-y_1) - \sigma''(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) + + I(z_n = k)\left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right)^2 \\ + ... \\ + -I(z_n = k) \left( \frac{\sigma''(\theta_k-y_n) - \sigma''(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) + + I(z_n = k)\left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right)^2 \\ \end{pmatrix} \end{align*} $$ +### Miscellanious + +The gradient of the sigmoid function is: +$$ +\sigma'(x) = \sigma(x)(1-\sigma(x)) +$$ +and the hessian is: +$$ +\begin{align*} + \sigma''(x) &= \frac{d}{dx}\sigma(x)(1-\sigma(x)) \\ + &= \sigma'(x)(1-\sigma(x)) - \sigma'(x)\sigma(x)\\ + &= \sigma(x)(1-\sigma(x))(1-\sigma(x)) -\sigma(x)(1-\sigma(x))\sigma(x) \\ + &= (1-\sigma(x))\left(\sigma(x)-2\sigma(x)^2\right) +\end{align*} +$$ + + + + +## Code From 5dd4cd019bf1bb88aa84d747e21df4c1d5a9b701 Mon Sep 17 00:00:00 2001 From: adamingas Date: Tue, 16 Jan 2024 23:15:41 +0000 Subject: [PATCH 4/7] fix: Change order of cd job steps --- .github/workflows/python-app.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 2bbf3e6..9510610 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -48,20 +48,20 @@ jobs: # Define job steps steps: - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: 3.9 - name: Install dependencies run: | python -m pip install --upgrade pip pip install build + - uses: actions/checkout@v3 # Here we run build to create a wheel and a # .tar.gz source distribution. - name: Build package run: python -m build --sdist --wheel # Finally, we use a pre-defined action to publish # our package in place of twine. - - uses: actions/checkout@v3 - name: Publish to TestPyPI uses: pypa/gh-action-pypi-publish@release/v1 with: From cb1ff6da604e86fcd5b3c1331036e2a2afb09534 Mon Sep 17 00:00:00 2001 From: adamingas Date: Tue, 16 Jan 2024 23:16:36 +0000 Subject: [PATCH 5/7] fix: rename math markdown file and create new rst math file which supports latex maths. 
Add mathjax support
---
 docs/conf.py | 5 +-
 docs/index.md | 2 +-
 docs/maths.rst | 252 ++++++++++++++++++++++++++++++++
 docs/{maths.md => maths_old.md} | 0
 4 files changed, 256 insertions(+), 3 deletions(-)
 create mode 100644 docs/maths.rst
 rename docs/{maths.md => maths_old.md} (100%)

diff --git a/docs/conf.py b/docs/conf.py
index d93c547..ab32491 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,13 +18,14 @@
     "autoapi.extension",
     "sphinx.ext.napoleon",
     "sphinx.ext.viewcode",
-    "sphinx_rtd_theme"
+    "sphinx_rtd_theme",
+    "sphinx.ext.mathjax"
 ]
 autoapi_dirs = ["../ordinalgbt"]  # location to parse for API reference
 html_theme = "sphinx_rtd_theme"
 exclude_patterns = []
 nb_execution_mode = "off"
-
+mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

diff --git a/docs/index.md b/docs/index.md
index cf0cd54..cf14380 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,6 +7,6 @@
 
 comparison_with_classifiers.ipynb
 shap.ipynb
-maths.md
+maths
 autoapi/index
 ```

diff --git a/docs/maths.rst b/docs/maths.rst
new file mode 100644
index 0000000..71cafd9
--- /dev/null
+++ b/docs/maths.rst
@@ -0,0 +1,252 @@
+.. role:: raw-latex(raw)
+   :format: latex
+..
+
+Loss
+====
+
+Maths
+-----
+
+Immediate Thresholds
+~~~~~~~~~~~~~~~~~~~~
+
+As is usual in ordinal regression model formulations, we build a
+regressor that learns a latent variable :math:`y`, and then use a set of
+thresholds :math:`\Theta` to produce the probability estimates for each
+label. The set of thresholds is ordered, does not include infinities,
+and has as many members as the number of labels minus one.
+
+We want to come up with a way to map the latent variable :math:`y` to
+the probability space such that when :math:`y` is in
+:math:`(\theta_{k-1},\theta_{k})` the probability of label :math:`k` is
+maximised.
+
+In a problem with three ordered labels, we only need two thresholds,
+:math:`\theta_1` and :math:`\theta_2`, to define the three regions
+associated with each label:
+:math:`(-\infty,\theta_1], (\theta_1, \theta_2], (\theta_2, \infty)`.
+
+Deriving probabilities
+~~~~~~~~~~~~~~~~~~~~~~
+
+A property we want our mapping from latent variable to probability to
+have is for the cumulative probability of label :math:`z` being at most
+label :math:`k` to increase as the label increases. This means that
+:math:`P(z\leq k;y,\Theta)` should increase as :math:`k` increases
+(i.e. as we consider more labels).
+
+Another property is that as the latent variable :math:`y` gets smaller,
+the cumulative probability should also increase, and as it gets larger
+it should decrease.
+
+To satisfy these properties we use a function :math:`F` which grows as
+its argument grows and shrinks as its argument shrinks. We can then
+define the cumulative probability as:
+
+.. math::
+
+
+   P(z \leq k; y,\Theta ) = F(\theta_k - y) ,
+
+making sure that the range of :math:`F` is constrained to the
+:math:`(0,1)` interval. This formulation satisfies all of our
+properties. As we consider larger (higher in the order) labels
+:math:`k`, the threshold :math:`\theta_k` grows and so does the
+cumulative probability. As :math:`y` grows, the input to :math:`F`
+shrinks and so does the cumulative probability.
+
+Naturally, the probability of :math:`z` being any particular label is
+then:
+
+.. math::
+
+
+   \newcommand{\problessthank}{P(z \leq k; y,\Theta )}
+   % \newcommand{\bbeta}{\mathbf{b}}
+   % \newcommand{\btheta}{\mathbf{\theta}}
+   \begin{align*}
+   P(z = k; y,\Theta ) &=P(z \leq k; y,\Theta) -P(z \leq k-1; y,\Theta ) \hspace{2mm} \\
+   &= F(\theta_k - y) - F(\theta_{k-1} - y)
+   \end{align*}
+
+A function that satisfies all these conditions is the sigmoid function,
+hereafter denoted as :math:`\sigma`.
+
+Deriving the loss function
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Given n samples, the likelihood of our set of predictions :math:`\bf y`
+is:
+
+.. math::
+
+
+   L({\bf y} ;\Theta) = \prod_{i =0}^n I(z_i=k)P(z_i = k; y_i,\Theta)
+
+As is usual in machine learning we use the negative log likelihood as
+our loss:
+
+.. math::
+
+
+   \begin{align*}
+   l({\bf y};\Theta) &= -\log L({\bf y},\theta)\\
+   &= -\sum_{i=0}^n I(z_i=k)\log(P(z_i = k; y_i,\Theta)) \\
+   &= -\sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right)
+   \end{align*}
+
+Deriving the gradient and hessian
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To use a custom loss function with gradient boosting tree frameworks
+(e.g. lightgbm), we have to first derive the gradient and hessian of the
+loss with respect to **the raw predictions**, in our case the latent
+variable :math:`y_i`.
+
+We denote the first and second order derivatives of the sigmoid as
+:math:`\sigma'` and :math:`\sigma''` respectively.
+
+The gradient is denoted as:
+
+.. math::
+   \begin{align*}
+   \mathcal{G}&=\frac{\partial l({\bf y};\Theta)}{\partial {\bf y}} \\
+   &= -\frac{\partial }{\partial {\bf y}} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\
+   &=
+   \begin{pmatrix}
+   -\frac{\partial }{\partial y_1} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\
+   ... \\
+   -\frac{\partial }{\partial y_n} \sum_{i=0}^n I(z_i=k)\log \left(\sigma(\theta_k - y_i) - \sigma(\theta_{k-1} - y_i)\right) \\
+   \end{pmatrix} \\
+   &=
+   \begin{pmatrix}
+   I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\
+   ... \\
+   I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) \\
+   \end{pmatrix}
+   \end{align*}
+
+The summation disappears when we differentiate with respect to
+:math:`y_i`, because each term of the sum depends on only one latent
+variable:
+
+.. math::
+   \begin{align*}
+   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial {\bf y}} &=
+   \begin{pmatrix}
+   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_1} \\
+   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_2} \\
+   \frac{\partial f(y_1)+f(y_2)+f(y_3)}{\partial y_3} \\
+   \end{pmatrix} \\
+   &=
+   \begin{pmatrix}
+   \frac{\partial f(y_1)}{\partial y_1} \\
+   \frac{\partial f(y_2)}{\partial y_2} \\
+   \frac{\partial f(y_3)}{\partial y_3} \\
+   \end{pmatrix}
+   \end{align*}
+
+The hessian is the partial derivative of the gradient with respect to
+the latent variable vector. This means that for each element of the
+gradient vector we calculate the partial derivative w.r.t. the whole
+latent variable vector. Thus the hessian is a matrix of partial
+derivatives:
+
+.. math::
+   \begin{pmatrix}
+   \frac{\partial}{\partial y_1 y_1} & ... &
+   \frac{\partial}{\partial y_1 y_n} \\
+   .&&.\\.&&.\\.&&.\\
+   \frac{\partial}{\partial y_n y_1} & ... &
+   \frac{\partial}{\partial y_n y_n}
+   \end{pmatrix}l({\bf y};\Theta)
+
+However, since we know that the partial derivative of the loss w.r.t.
+the latent variable :math:`y_i` depends only on the :math:`i^{th}`
+element of the :math:`y` vector, the off-diagonal elements of the
+hessian matrix are reduced to zero:
+
+.. math::
+
+
+   \frac{\partial}{\partial y_i y_j} l({\bf y};\Theta) = 0 \text{ if } i \neq j
+
+The hessian is then reduced to a vector:
+
+.. math::
+
+
+   \begin{align*}
+   \mathcal{H} &=
+   \begin{pmatrix}
+   \frac{\partial}{\partial y_1 y_1} \\
+   ... \\
+   \frac{\partial}{\partial y_n y_n}
+   \end{pmatrix}l({\bf y};\Theta) \\
+   &=
+   \begin{pmatrix}
+   \frac{\partial}{\partial y_1 }I(z_1 = k) \left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) \\
+   ... \\
+   \frac{\partial}{\partial y_n }
+   I(z_n = k) \left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right)
+   \end{pmatrix}\\
+   &=
+   \begin{pmatrix}
+   -I(z_1 = k) \left( \frac{\sigma''(\theta_k-y_1) - \sigma''(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right) +
+   I(z_1 = k)\left( \frac{\sigma'(\theta_k-y_1) - \sigma'(\theta_{k-1}-y_1)}{\sigma(\theta_k-y_1) - \sigma(\theta_{k-1}-y_1)} \right)^2 \\
+   ... \\
+   -I(z_n = k) \left( \frac{\sigma''(\theta_k-y_n) - \sigma''(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right) +
+   I(z_n = k)\left( \frac{\sigma'(\theta_k-y_n) - \sigma'(\theta_{k-1}-y_n)}{\sigma(\theta_k-y_n) - \sigma(\theta_{k-1}-y_n)} \right)^2 \\
+   \end{pmatrix}
+   \end{align*}
+
+Miscellaneous
+~~~~~~~~~~~~~
+
+The first derivative of the sigmoid function is:
+
+.. math::
+
+
+   \sigma'(x) = \sigma(x)(1-\sigma(x))
+
+and the second derivative is:
+
+.. math::
+
+
+   \begin{align*}
+   \sigma''(x) &= \frac{d}{dx}\sigma(x)(1-\sigma(x)) \\
+   &= \sigma'(x)(1-\sigma(x)) - \sigma'(x)\sigma(x)\\
+   &= \sigma(x)(1-\sigma(x))(1-\sigma(x)) -\sigma(x)(1-\sigma(x))\sigma(x) \\
+   &= (1-\sigma(x))\left(\sigma(x)-2\sigma(x)^2\right)
+   \end{align*}
+
+.. raw:: html
+
+   <hr>
+
+Code
+----
+
+   Coming soon
diff --git a/docs/maths.md b/docs/maths_old.md
similarity index 100%
rename from docs/maths.md
rename to docs/maths_old.md

From 3dded49d6f13e2f3b6f4ce08fb0d04aa0ec5f86f Mon Sep 17 00:00:00 2001
From: adamingas
Date: Tue, 16 Jan 2024 23:18:45 +0000
Subject: [PATCH 6/7] bump version to 0.1.2

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index a765892..86de138 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,7 +14,7 @@ install_requires=
 [metadata]
 name = ordinalgbt
 description = A library to build Gradient boosted trees for ordinal labels
-version = 0.1.1
+version = 0.1.2
 long_description = file:README.md
 long_description_content_type = text/markdown
 author = Adamos Spanashis

From f2cb37b83f88a3adf868e54475f9c889d0f3bd0f Mon Sep 17 00:00:00 2001
From: adamingas
Date: Tue, 16 Jan 2024 23:21:42 +0000
Subject: [PATCH 7/7] Undo new version

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 86de138..a765892 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,7 +14,7 @@ install_requires=
 [metadata]
 name = ordinalgbt
 description = A library to build Gradient boosted trees for ordinal labels
-version = 0.1.2
+version = 0.1.1
 long_description = file:README.md
 long_description_content_type = text/markdown
 author = Adamos Spanashis
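
For readers who want the derivation above as code: the following is a minimal, self-contained NumPy/SciPy sketch of the gradient and hessian of the ordinal negative log-likelihood with respect to the raw predictions, in the per-sample `(grad, hess)` form that gradient-boosting frameworks such as lightgbm expect from a custom objective. It is illustrative only; the function and argument names are hypothetical and this is not the implementation shipped in `ordinalgbt`.

```python
import numpy as np
from scipy.special import expit  # numerically stable sigmoid


def ordinal_grad_hess(y_latent, labels, thresholds):
    """Per-sample gradient and hessian of the ordinal NLL w.r.t. the latent predictions.

    y_latent:   (n,) raw predictions (the latent variable y)
    labels:     (n,) integer labels in {0, ..., K-1}
    thresholds: (K-1,) ordered, finite thresholds
    """
    labels = np.asarray(labels)
    # Pad so that sigma(theta_0 - y) = 0 and sigma(theta_K - y) = 1
    theta = np.concatenate(([-np.inf], np.asarray(thresholds, dtype=float), [np.inf]))
    upper = expit(theta[labels + 1] - np.asarray(y_latent))  # sigma(theta_k - y_i)
    lower = expit(theta[labels] - np.asarray(y_latent))      # sigma(theta_{k-1} - y_i)

    d = upper - lower                                # sigma(theta_k - y) - sigma(theta_{k-1} - y)
    d1 = upper * (1 - upper) - lower * (1 - lower)   # sigma'(theta_k - y) - sigma'(theta_{k-1} - y)
    d2 = (upper * (1 - upper) * (1 - 2 * upper)
          - lower * (1 - lower) * (1 - 2 * lower))   # sigma''(theta_k - y) - sigma''(theta_{k-1} - y)

    grad = d1 / d                # element of the gradient vector derived above
    hess = -d2 / d + grad ** 2   # element of the diagonal hessian derived above
    return grad, hess
```

With lightgbm's scikit-learn interface, a custom objective is typically supplied as a callable `objective(y_true, y_pred)` that returns exactly such a pair of arrays.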