
Merge pull request #1 from adamingas/development
Minor fixes to gradient and hessian calculation
adamingas authored Jan 11, 2024
2 parents 248034a + 4d322db commit cd0a919
Showing 7 changed files with 216 additions and 22 deletions.
39 changes: 37 additions & 2 deletions .github/workflows/python-app.yml
@@ -5,15 +5,14 @@ name: Python application

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:
  ci:

    runs-on: ubuntu-latest

@@ -39,3 +38,39 @@ jobs:
      - name: Test with pytest
        run: |
          pytest
  cd:
    needs: ci
    # Only run this job if new work is pushed to "main"
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    # Set up operating system
    runs-on: ubuntu-latest

    # Define job steps
    steps:
      # Check out the source first, so the build step has something to build
      - uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build
      # Here we run build to create a wheel and a
      # .tar.gz source distribution.
      - name: Build package
        run: python -m build --sdist --wheel
      # Finally, we use a pre-defined action to publish
      # our package in place of twine.
      - name: Publish to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
          repository_url: https://test.pypi.org/legacy/
      - name: Test install from TestPyPI
        run: |
          pip install \
            --index-url https://test.pypi.org/simple/ \
            --extra-index-url https://pypi.org/simple \
            ordinalgbt
128 changes: 128 additions & 0 deletions notebooks/comparison_with_classifiers.ipynb
@@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Comparing ordinal with usual classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we use the sklearn diabetes dataset as a comparison between the LGBMOrdinal, LGBMClassifier, and Logistic regression models. We convert the continuous label to classes by binnging it using quantiles.\n",
"\n",
"We then train and test the models several times with different train/test splits and evaluate their mean absolute deviation instead of accuracy. This metric penalises wrong predictions that are further appart from the true label more than those which are closer."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from lightgbm import LGBMClassifier\n",
"from sklearn.datasets import load_diabetes\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"from ordinalgbt.lgb import LGBMOrdinal\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data = load_diabetes()\n",
"X = pd.DataFrame(data[\"data\"], columns = data[\"feature_names\"])\n",
"y = data[\"target\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"nq = 10\n",
"thresholds = np.append(np.append(y.min()-1,np.quantile(y,np.arange(0,1,1/nq)[1:])),y.max()+1)\n",
"yq = pd.cut(x=y,bins=thresholds,right=True,labels=['q'+str(z+1) for z in range(nq)])\n",
"yord = yq.astype('category').codes\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" mdl MAE\n",
"0 LGBMOrdinal 2.0\n",
"1 SKlearn Multinomial 2.5\n",
"2 LGBMClassifier 2.1\n"
]
}
],
"source": [
"holder, coef = [], []\n",
"nsim = 10\n",
"for ii in range(nsim):\n",
" # Do a train/test split (80/20)\n",
" ytrain, ytest, Xtrain, Xtest = train_test_split(yord, X, stratify=yord,test_size=0.2,\n",
" random_state=ii)\n",
" # Ordinal model\n",
" mdl_ord = LGBMOrdinal()\n",
" mdl_ord.fit(Xtrain, ytrain)\n",
" # Multinomial LGBM model\n",
" mdl_class = LGBMClassifier()\n",
" mdl_class.fit(Xtrain, ytrain)\n",
" # Multinomial Regression model\n",
" mdl_multi = LogisticRegression(penalty='l2',solver='lbfgs',max_iter=1000)\n",
" mdl_multi.fit(Xtrain,ytrain)\n",
" # Make predictions\n",
" yhat_ord = mdl_ord.predict(Xtest)\n",
" yhat_multi = mdl_multi.predict(Xtest)\n",
" yhat_class = mdl_class.predict(Xtest)\n",
" # Get MAE\n",
" acc_class = np.abs(yhat_class - ytest).mean()\n",
" acc_multi = np.abs(yhat_multi - ytest).mean()\n",
" acc_ord = np.abs(yhat_ord - ytest).mean()\n",
" holder.append(pd.DataFrame({'ord':acc_ord,'multi':acc_multi,'class':acc_class},index=[ii]))\n",
"\n",
"df_mae = pd.concat(holder).mean(axis=0).reset_index().rename(columns={'index':'mdl',0:'MAE'})\n",
"di_lbls = {'ord':'LGBMOrdinal','multi':'SKlearn Multinomial','class':'LGBMClassifier'}\n",
"df_mae = df_mae.assign(mdl=lambda x: x.mdl.map(di_lbls))\n",
"print(np.round(df_mae,1))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.17"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
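The notebook's metric choice is easier to see with a toy example: accuracy treats every misclassification equally, while MAE on the ordinal codes grows with how many bins a prediction is off by. A minimal standalone sketch in plain NumPy (not part of the notebook):

```python
import numpy as np

# True ordinal codes, and two prediction sets that are equally
# wrong under accuracy but not under MAE.
y_true = np.array([8, 8, 8, 8])
y_near = np.array([7, 8, 8, 8])   # one prediction off by a single bin
y_far = np.array([0, 8, 8, 8])    # one prediction off by eight bins

for name, y_hat in [("near", y_near), ("far", y_far)]:
    acc = (y_hat == y_true).mean()
    mae = np.abs(y_hat - y_true).mean()
    print(f"{name}: accuracy={acc:.2f}, MAE={mae:.2f}")

# near: accuracy=0.75, MAE=0.25
# far:  accuracy=0.75, MAE=2.00
```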
8 changes: 3 additions & 5 deletions ordinalgbt/lgb.py
@@ -64,10 +64,7 @@ def __init__(
# self.threshold_interval = threshold_interval

def _initialise_theta(self):
return np.linspace(0, (self.n_classes - 2) * 2, self.n_classes - 1)

def _initialise_alpha(self):
return theta2alpha(np.linspace(0, (self.n_classes - 2) * 2, self.n_classes - 1))
return np.linspace(0, (self.n_classes - 2) * 1, self.n_classes - 1)

def _lgb_loss_factory(self):
self.theta = self._initialise_theta()
@@ -96,7 +93,8 @@ def _optimise_alpha(self, y_true, y_preds):
"""
loss = self._alpha_loss_factory(y_true, y_preds)
alpha = theta2alpha(self.theta)
self._alpha_optimisation_report = minimize(loss, alpha)
bounds = [(None,3.58)]*len(alpha)
self._alpha_optimisation_report = minimize(loss, alpha, bounds=bounds)
alpha = self._alpha_optimisation_report.x
self.theta = alpha2theta(alpha)

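The new `bounds` argument caps every component of `alpha` at 3.58. Assuming `theta2alpha` is the usual log-gap reparametrisation that keeps thresholds ordered during unconstrained optimisation (the diff does not show its definition, so this is an assumption), the bound limits each threshold gap to roughly e^3.58 ≈ 36, which matches the 36-unit clipping window introduced in loss.py below. A sketch of that assumed mapping, with hypothetical re-implementations of the two helpers:

```python
import numpy as np

def theta2alpha(theta):
    # Assumed log-gap reparametrisation: the first component passes
    # through, the rest are logs of the gaps between consecutive
    # thresholds, so any real-valued alpha maps to an increasing theta.
    return np.append(theta[0], np.log(np.diff(theta)))

def alpha2theta(alpha):
    # Inverse mapping: cumulative sum of exponentiated gaps.
    return np.append(alpha[0], alpha[0] + np.cumsum(np.exp(alpha[1:])))

theta = np.linspace(0, 3, 4)        # the new initialisation: unit gaps
alpha = theta2alpha(theta)
assert np.allclose(alpha2theta(alpha), theta)

# With each alpha component bounded above by 3.58, no threshold gap
# can exceed e**3.58 ~ 35.9, i.e. the ~36-unit window used in loss.py.
print(np.exp(3.58))                 # ~35.87
```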
16 changes: 11 additions & 5 deletions ordinalgbt/loss.py
@@ -6,7 +6,7 @@
def dec_clip_y_pred(fun):
@wraps(fun)
def wrapped(*, y_true, y_preds, theta):
y_preds = np.clip(y_preds, -20, a_max=700 + min(theta))
y_preds = np.clip(y_preds, max(theta)-36, a_max=700 + min(theta))
return fun(y_true=y_true, y_preds=y_preds, theta=theta)

return wrapped
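The lower clip is a floating-point guard: once `theta - y_preds` drops much below log(eps) ≈ -36, the sigmoid underflows past machine epsilon, and class probabilities computed as differences of sigmoids collapse to zero. A quick standalone check of both bounds:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

eps = np.finfo(float).eps      # ~2.22e-16
print(np.log(eps))             # ~ -36.04

# Below roughly -36 the sigmoid is indistinguishable from 0 at
# double precision, so differences of sigmoids round to 0.
print(sigmoid(-36.0))          # ~2.3e-16, right at the edge of eps
print(sigmoid(-40.0) < eps)    # True

# The upper clip at 700 + min(theta) keeps np.exp's argument below
# the float64 overflow point (~709).
print(np.exp(709.0))           # ~8.2e307, large but still finite
```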
@@ -151,12 +151,12 @@ def probas_from_y_pred(y_preds, theta):
c_probas = stack_zeros_ones(s_array)

probas = c_probas[:, 1 : len(theta) + 2] - c_probas[:, 0 : len(theta) + 1]
probas = np.clip(
probas, a_min=np.finfo(float).eps, a_max=1 - 3 * np.finfo(float).eps
)
# probas = np.clip(
# probas, a_min=np.finfo(float).eps, a_max=1 - len(theta) * np.finfo(float).eps
# )
return probas


@dec_clip_y_pred
def ordinal_logistic_nll(y_true: np.ndarray, y_preds: np.ndarray, theta: np.ndarray):
"""Ordinal Negative log lilelihood
@@ -180,6 +180,11 @@ def ordinal_logistic_nll(y_true: np.ndarray, y_preds: np.ndarray, theta: np.ndarray):
probas = probas_from_y_pred(y_preds, theta)
# probabilities associated with the correct label
label_probas = probas[np.arange(0, len(y_true)), y_true]
label_probas = np.clip(
label_probas,
a_min=np.finfo(float).eps,
a_max=1 - len(theta) * np.finfo(float).eps
)
# loss
return -np.sum(np.log(label_probas))
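With the matrix-level clip commented out in probas_from_y_pred, the guard now applies only to the probabilities that actually enter the log. A compact standalone re-implementation of this likelihood for illustration (the cumulative-logit form implied by the tests; input values are hypothetical):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def ordinal_nll(y_true, y_preds, theta):
    # Cumulative probabilities P(y <= k) = sigmoid(theta_k - f),
    # padded with 0 and 1 for the implicit outer thresholds.
    cum = sigmoid(theta[None, :] - y_preds[:, None])
    cum = np.hstack([np.zeros((len(y_preds), 1)), cum,
                     np.ones((len(y_preds), 1))])
    probas = cum[:, 1:] - cum[:, :-1]   # P(y = k), one column per class
    label_probas = probas[np.arange(len(y_true)), y_true]
    # Guard only the label probabilities, as the new code does.
    label_probas = np.clip(label_probas,
                           a_min=np.finfo(float).eps,
                           a_max=1 - len(theta) * np.finfo(float).eps)
    return -np.sum(np.log(label_probas))

y_preds = np.array([1.5, 15.0, -3.0])
y_true = np.array([1, 2, 0])
theta = np.array([0.5, 2.5])
print(ordinal_nll(y_true, y_preds, theta))
```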

@@ -255,6 +260,7 @@ def hessian_ordinal_logistic_nll(
hessian = -(h_probas / probas - np.power(g_probas / probas, 2))[
np.arange(0, len(y_true)), y_true
]
# hessian[np.abs(hessian) <=np.finfo(float).eps] = -np.finfo(float).eps
return hessian


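For reference on what the gradient and hessian functions compute: writing P_k(f) = sigmoid(theta_k - f) - sigmoid(theta_{k-1} - f) for the probability of the observed label, the per-sample negative log-likelihood has gradient -P'_k/P_k and Hessian -(P''_k/P_k - (P'_k/P_k)^2), which is the `-(h_probas / probas - np.power(g_probas / probas, 2))` expression above. A standalone single-sample sketch (helper name is hypothetical):

```python
import numpy as np
from scipy.special import expit  # numerically stable sigmoid

def grad_hess_one(f, k, theta):
    # Gradient and Hessian of -log P(y=k | f) for one sample under the
    # cumulative-logit model; outer thresholds padded with +/- infinity.
    t = np.concatenate([[-np.inf], theta, [np.inf]])
    sa, sb = expit(t[k + 1] - f), expit(t[k] - f)
    P = sa - sb
    # dP/df via the chain rule: d(theta - f)/df = -1, sigmoid' = s(1 - s)
    dP = -sa * (1 - sa) + sb * (1 - sb)
    # d2P/df2 uses sigmoid'' = s(1 - s)(1 - 2s)
    d2P = sa * (1 - sa) * (1 - 2 * sa) - sb * (1 - sb) * (1 - 2 * sb)
    return -dP / P, -(d2P / P - (dP / P) ** 2)

g, h = grad_hess_one(f=1.5, k=1, theta=np.array([0.5, 2.5]))
print(g, h)  # zero gradient at the bin midpoint, positive curvature
```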
2 changes: 1 addition & 1 deletion setup.cfg
@@ -14,7 +14,7 @@ install_requires=
[metadata]
name = ordinalgbt
description = A library to build Gradient boosted trees for ordinal labels
version = 0.1
version = 0.1.1
long_description = file:README.md
long_description_content_type = text/markdown
author = Adamos Spanashis
9 changes: 1 addition & 8 deletions tests/test_lgb.py
@@ -14,16 +14,9 @@
def test_initialise_theta():
model = LGBMOrdinal()
model.n_classes = 5
expected_theta = np.array([0., 2., 4., 6.])
expected_theta = np.array([0., 1., 2., 3.])
assert np.array_equal(model._initialise_theta(), expected_theta)

def test_initialise_alpha():
model = LGBMOrdinal()
model.n_classes = 5
expected_theta = np.array([0., 2., 4., 6.])
expected_alpha = theta2alpha(expected_theta)
assert np.array_equal(model._initialise_alpha(), expected_alpha)

def test_lgb_loss_factory():
model = LGBMOrdinal()
model.n_classes = 5
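The updated expected_theta follows directly from the new initialiser: with a multiplier of 1 the thresholds are unit-spaced. A one-line check:

```python
import numpy as np

n_classes = 5
theta = np.linspace(0, (n_classes - 2) * 1, n_classes - 1)
print(theta)  # [0. 1. 2. 3.], matching the updated expected_theta
```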
36 changes: 35 additions & 1 deletion tests/test_loss.py
@@ -62,7 +62,7 @@ def test_ordinal_logistic_nll():
expected_loss = -np.sum(np.log(
sigmoid(np.array([1,500,-3])) - sigmoid(np.array([-1,-2,-500]))
))
loss = ordinal_logistic_nll(y_true, y_preds, theta)
loss = ordinal_logistic_nll(y_true=y_true, y_preds=y_preds, theta=theta)
assert isinstance(loss, float)
assert loss == pytest.approx(expected_loss)

@@ -76,6 +76,19 @@ def test_gradient_ordinal_logistic_nll():
np.array([0, 0, 1]),
decimal=3)

def test_gradient_ordinal_logistic_nll_monotonic():
"""
Testing at extreeme values of y_pred where the resolution
of float point arithmetic might fail
"""
y_preds = np.linspace(0,150,100)
y_true = np.array([5]*100)
theta = np.arange(0,18,2)

gradient = gradient_ordinal_logistic_nll(y_true, y_preds, theta)
monotonic = (gradient[1:]- gradient[:-1]) >= 0
assert monotonic.all(), "Gradient is not monotonically non-decreasing"

def test_hessian_ordinal_logistic_nll():
y_preds = np.array([1.5, 15, -38])
y_true = np.array([1, 2, 0])
@@ -86,6 +99,27 @@ def test_hessian_ordinal_logistic_nll():
np.array([0.47, 0, 0]),
decimal=5)

def test_hessian_ordinal_logistic_nll_monotonic():
"""
Testing at extreeme values of y_pred where the resolution
of float point arithmetic might fail
"""
y_preds = np.linspace(0,150,100)
y_true = np.array([5]*100)
theta = np.arange(0,18,2)
expected_max_mask = np.logical_and(y_preds<theta[5],y_preds>theta[4])
hessian = hessian_ordinal_logistic_nll(y_true, y_preds, theta)
np.testing.assert_almost_equal(hessian[expected_max_mask], hessian.max())

expected_max_indx = np.where(expected_max_mask)[0]
ascending = hessian[:expected_max_indx[0]]
assert ((ascending[1:] - ascending[:-1]) >=0).all()

descending = hessian[expected_max_indx[0]:]
assert ((descending[1:] - descending[:-1]) <=0).all()



def test_lgb_ordinal_loss():
y_preds = np.array([1.5, 15, -38])
y_true = np.array([1, 2, 0])
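A cheap complement to the monotonicity tests above would be to check the analytic gradient against central finite differences of the loss. A standalone sketch under the same assumed cumulative-logit form (these helpers are hypothetical, not part of the suite):

```python
import numpy as np
from scipy.special import expit

def nll_one(f, k, theta):
    # Per-sample NLL under the cumulative-logit model.
    t = np.concatenate([[-np.inf], theta, [np.inf]])
    return -np.log(expit(t[k + 1] - f) - expit(t[k] - f))

def grad_one(f, k, theta):
    t = np.concatenate([[-np.inf], theta, [np.inf]])
    sa, sb = expit(t[k + 1] - f), expit(t[k] - f)
    dP = -sa * (1 - sa) + sb * (1 - sb)
    return -dP / (sa - sb)

theta = np.arange(0, 18, 2).astype(float)  # same thresholds as the tests
for f in np.linspace(0.0, 20.0, 11):
    num = (nll_one(f + 1e-6, 5, theta) - nll_one(f - 1e-6, 5, theta)) / 2e-6
    assert np.isclose(grad_one(f, 5, theta), num, atol=1e-4)
print("analytic gradient matches finite differences")
```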
