diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 4c2c30e..2bbf3e6 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -5,7 +5,6 @@ name: Python application
 
 on:
   push:
-    branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
 
@@ -13,7 +12,7 @@ permissions:
   contents: read
 
 jobs:
-  build:
+  ci:
 
     runs-on: ubuntu-latest
 
@@ -39,3 +38,39 @@ jobs:
     - name: Test with pytest
       run: |
         pytest
+  cd:
+    needs: ci
+    # Only run this job if new work is pushed to "main"
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+    # Set up operating system
+    runs-on: ubuntu-latest
+
+    # Define job steps
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.9
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    # Here we run build to create a wheel and a
+    # .tar.gz source distribution.
+    - name: Build package
+      run: python -m build --sdist --wheel
+    # Finally, we use a pre-defined action to publish
+    # our package in place of twine.
+    - name: Publish to TestPyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        user: __token__
+        password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+        repository_url: https://test.pypi.org/legacy/
+    - name: Test install from TestPyPI
+      run: |
+        pip install \
+          --index-url https://test.pypi.org/simple/ \
+          --extra-index-url https://pypi.org/simple \
+          ordinalgbt
\ No newline at end of file
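
Note: the `cd` job's last step only verifies that `pip` can resolve and install the package from TestPyPI. A post-install smoke test could go one step further and import the freshly installed distribution. The sketch below is illustrative and not part of this diff; it assumes the package installs under the `ordinalgbt` name declared in `setup.cfg` further down.

```python
# Hypothetical post-install smoke test (not part of this PR): run after the
# `pip install ... ordinalgbt` step to confirm the published wheel imports
# and carries the version bumped in setup.cfg.
from importlib.metadata import version

from ordinalgbt.lgb import LGBMOrdinal  # public entry point used in the notebook

assert version("ordinalgbt") == "0.1.1"
print("TestPyPI smoke test passed:", LGBMOrdinal)
```
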
diff --git a/notebooks/comparison_with_classifiers.ipynb b/notebooks/comparison_with_classifiers.ipynb
new file mode 100644
index 0000000..b9e5ff2
--- /dev/null
+++ b/notebooks/comparison_with_classifiers.ipynb
@@ -0,0 +1,128 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Comparing ordinal with usual classification"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this notebook we use the sklearn diabetes dataset to compare the LGBMOrdinal, LGBMClassifier, and LogisticRegression models. We convert the continuous label to classes by binning it using quantiles.\n",
+    "\n",
+    "We then train and test the models several times with different train/test splits and evaluate their mean absolute error instead of accuracy. This metric penalises predictions that are further from the true label more heavily than those that are closer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from lightgbm import LGBMClassifier\n",
+    "from sklearn.datasets import load_diabetes\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "from ordinalgbt.lgb import LGBMOrdinal\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = load_diabetes()\n",
+    "X = pd.DataFrame(data[\"data\"], columns=data[\"feature_names\"])\n",
+    "y = data[\"target\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nq = 10\n",
+    "thresholds = np.append(np.append(y.min()-1, np.quantile(y, np.arange(0, 1, 1/nq)[1:])), y.max()+1)\n",
+    "yq = pd.cut(x=y, bins=thresholds, right=True, labels=['q'+str(z+1) for z in range(nq)])\n",
+    "yord = yq.astype('category').codes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   mdl  MAE\n",
+      "0          LGBMOrdinal  2.0\n",
+      "1  SKlearn Multinomial  2.5\n",
+      "2       LGBMClassifier  2.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "holder = []\n",
+    "nsim = 10\n",
+    "for ii in range(nsim):\n",
+    "    # Do a train/test split (80/20)\n",
+    "    ytrain, ytest, Xtrain, Xtest = train_test_split(yord, X, stratify=yord, test_size=0.2,\n",
+    "                                                    random_state=ii)\n",
+    "    # Ordinal model\n",
+    "    mdl_ord = LGBMOrdinal()\n",
+    "    mdl_ord.fit(Xtrain, ytrain)\n",
+    "    # Multinomial LGBM model\n",
+    "    mdl_class = LGBMClassifier()\n",
+    "    mdl_class.fit(Xtrain, ytrain)\n",
+    "    # Multinomial regression model\n",
+    "    mdl_multi = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)\n",
+    "    mdl_multi.fit(Xtrain, ytrain)\n",
+    "    # Make predictions\n",
+    "    yhat_ord = mdl_ord.predict(Xtest)\n",
+    "    yhat_multi = mdl_multi.predict(Xtest)\n",
+    "    yhat_class = mdl_class.predict(Xtest)\n",
+    "    # Get MAE\n",
+    "    acc_class = np.abs(yhat_class - ytest).mean()\n",
+    "    acc_multi = np.abs(yhat_multi - ytest).mean()\n",
+    "    acc_ord = np.abs(yhat_ord - ytest).mean()\n",
+    "    holder.append(pd.DataFrame({'ord': acc_ord, 'multi': acc_multi, 'class': acc_class}, index=[ii]))\n",
+    "\n",
+    "df_mae = pd.concat(holder).mean(axis=0).reset_index().rename(columns={'index': 'mdl', 0: 'MAE'})\n",
+    "di_lbls = {'ord': 'LGBMOrdinal', 'multi': 'SKlearn Multinomial', 'class': 'LGBMClassifier'}\n",
+    "df_mae = df_mae.assign(mdl=lambda x: x.mdl.map(di_lbls))\n",
+    "print(np.round(df_mae, 1))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
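
Note: the notebook builds its ordinal labels by hand, taking interior bin edges from `np.quantile`, padding them with `y.min()-1` and `y.max()+1`, and passing the result to `pd.cut`. The same binning can be sanity-checked against `pd.qcut`, which computes quantile edges internally. A minimal sketch, assuming the same diabetes target:

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

y = load_diabetes()["target"]

# Manual construction, as in the notebook (labels=False gives integer codes).
nq = 10
thresholds = np.append(
    np.append(y.min() - 1, np.quantile(y, np.arange(0, 1, 1 / nq)[1:])),
    y.max() + 1,
)
yord_manual = pd.cut(y, bins=thresholds, right=True, labels=False)

# pd.qcut computes the same quantile edges internally.
yord_qcut = pd.qcut(y, q=nq, labels=False)
assert (yord_manual == yord_qcut).all()  # should agree, up to ties on bin edges
```
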
diff --git a/ordinalgbt/lgb.py b/ordinalgbt/lgb.py
index cc611ff..ef70378 100644
--- a/ordinalgbt/lgb.py
+++ b/ordinalgbt/lgb.py
@@ -64,10 +64,7 @@ def __init__(
         # self.threshold_interval = threshold_interval
 
     def _initialise_theta(self):
-        return np.linspace(0, (self.n_classes - 2) * 2, self.n_classes - 1)
-
-    def _initialise_alpha(self):
-        return theta2alpha(np.linspace(0, (self.n_classes - 2) * 2, self.n_classes - 1))
+        return np.linspace(0, self.n_classes - 2, self.n_classes - 1)
 
     def _lgb_loss_factory(self):
         self.theta = self._initialise_theta()
@@ -96,7 +93,8 @@ def _optimise_alpha(self, y_true, y_preds):
         """
         loss = self._alpha_loss_factory(y_true, y_preds)
         alpha = theta2alpha(self.theta)
-        self._alpha_optimisation_report = minimize(loss, alpha)
+        bounds = [(None, 3.58)] * len(alpha)
+        self._alpha_optimisation_report = minimize(loss, alpha, bounds=bounds)
         alpha = self._alpha_optimisation_report.x
         self.theta = alpha2theta(alpha)
 
diff --git a/ordinalgbt/loss.py b/ordinalgbt/loss.py
index ce769dd..4237284 100644
--- a/ordinalgbt/loss.py
+++ b/ordinalgbt/loss.py
@@ -6,7 +6,7 @@ def dec_clip_y_pred(fun):
     @wraps(fun)
     def wrapped(*, y_true, y_preds, theta):
-        y_preds = np.clip(y_preds, -20, a_max=700 + min(theta))
+        y_preds = np.clip(y_preds, max(theta) - 36, a_max=700 + min(theta))
         return fun(y_true=y_true, y_preds=y_preds, theta=theta)
 
     return wrapped
@@ -151,12 +151,12 @@ def probas_from_y_pred(y_preds, theta):
     c_probas = stack_zeros_ones(s_array)
     probas = c_probas[:, 1 : len(theta) + 2] - c_probas[:, 0 : len(theta) + 1]
 
-    probas = np.clip(
-        probas, a_min=np.finfo(float).eps, a_max=1 - 3 * np.finfo(float).eps
-    )
+    # probas = np.clip(
+    #     probas, a_min=np.finfo(float).eps, a_max=1 - len(theta) * np.finfo(float).eps
+    # )
     return probas
 
-
+@dec_clip_y_pred
 def ordinal_logistic_nll(y_true: np.ndarray, y_preds: np.ndarray, theta: np.ndarray):
     """Ordinal Negative log likelihood
 
@@ -180,6 +180,11 @@ def ordinal_logistic_nll(y_true: np.ndarray, y_preds: np.ndarray, theta: np.ndar
     probas = probas_from_y_pred(y_preds, theta)
     # probabilities associated with the correct label
     label_probas = probas[np.arange(0, len(y_true)), y_true]
+    label_probas = np.clip(
+        label_probas,
+        a_min=np.finfo(float).eps,
+        a_max=1 - len(theta) * np.finfo(float).eps
+    )
     # loss
     return -np.sum(np.log(label_probas))
 
@@ -255,6 +260,7 @@ def hessian_ordinal_logistic_nll(
     hessian = -(h_probas / probas - np.power(g_probas / probas, 2))[
         np.arange(0, len(y_true)), y_true
     ]
+    # hessian[np.abs(hessian) <= np.finfo(float).eps] = -np.finfo(float).eps
     return hessian
 
diff --git a/setup.cfg b/setup.cfg
index f48f5f8..a765892 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,7 +14,7 @@ install_requires=
 [metadata]
 name = ordinalgbt
 description = A library to build Gradient boosted trees for ordinal labels
-version = 0.1
+version = 0.1.1
 long_description = file:README.md
 long_description_content_type = text/markdown
 author = Adamos Spanashis
 
diff --git a/tests/test_lgb.py b/tests/test_lgb.py
index d476b43..0d1167a 100644
--- a/tests/test_lgb.py
+++ b/tests/test_lgb.py
@@ -14,16 +14,9 @@ def test_initialise_theta():
     model = LGBMOrdinal()
     model.n_classes = 5
-    expected_theta = np.array([0., 2., 4., 6.])
+    expected_theta = np.array([0., 1., 2., 3.])
     assert np.array_equal(model._initialise_theta(), expected_theta)
 
-def test_initialise_alpha():
-    model = LGBMOrdinal()
-    model.n_classes = 5
-    expected_theta = np.array([0., 2., 4., 6.])
-    expected_alpha = theta2alpha(expected_theta)
-    assert np.array_equal(model._initialise_alpha(), expected_alpha)
-
 def test_lgb_loss_factory():
     model = LGBMOrdinal()
     model.n_classes = 5
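
Note: the constants introduced above (`max(theta) - 36` in `dec_clip_y_pred`, the `3.58` upper bound on `alpha`) look arbitrary but plausibly both come from the same float64 fact: `sigmoid(x)` becomes exactly `1.0` once `x` exceeds roughly `-log(eps) ≈ 36`, at which point differences of cumulative probabilities underflow to zero and the log-likelihood degenerates. A small demonstration; the link from `3.58 ≈ log(36)` to the threshold gaps assumes `alpha` parameterises log-gaps via `alpha2theta`/`theta2alpha`, which this diff does not show:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

eps = np.finfo(float).eps
print(-np.log(eps))          # ~36.04: saturation point of the float64 sigmoid
print(sigmoid(36.0) < 1.0)   # True: still representable just below 1.0
print(sigmoid(38.0) == 1.0)  # True: saturated, so differences of cumulative
                             # probabilities underflow to exactly 0

# Clipping y_preds at max(theta) - 36 keeps every (theta_k - y_pred) out of
# the saturated region; bounding alpha at 3.58 ~ log(36) would cap each
# threshold gap exp(alpha) at ~36 for the same reason (assumption).
print(np.exp(3.58))          # ~35.9
```
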
diff --git a/tests/test_loss.py b/tests/test_loss.py
index a64a58e..deae449 100644
--- a/tests/test_loss.py
+++ b/tests/test_loss.py
@@ -62,7 +62,7 @@ def test_ordinal_logistic_nll():
     expected_loss = -np.sum(np.log(
         sigmoid(np.array([1,500,-3])) - sigmoid(np.array([-1,-2,-500]))
     ))
-    loss = ordinal_logistic_nll(y_true, y_preds, theta)
+    loss = ordinal_logistic_nll(y_true=y_true, y_preds=y_preds, theta=theta)
 
     assert isinstance(loss, float)
     assert loss == pytest.approx(expected_loss)
@@ -76,6 +76,19 @@ def test_gradient_ordinal_logistic_nll():
         np.array([0, 0, 1]),
         decimal=3)
 
+def test_gradient_ordinal_logistic_nll_monotonic():
+    """
+    Testing at extreme values of y_pred where the resolution
+    of floating point arithmetic might fail
+    """
+    y_preds = np.linspace(0, 150, 100)
+    y_true = np.array([5] * 100)
+    theta = np.arange(0, 18, 2)
+
+    gradient = gradient_ordinal_logistic_nll(y_true, y_preds, theta)
+    monotonic = (gradient[1:] - gradient[:-1]) >= 0
+    assert monotonic.all(), "Gradient is not monotonically non-decreasing"
+
 def test_hessian_ordinal_logistic_nll():
     y_preds = np.array([1.5, 15, -38])
     y_true = np.array([1, 2, 0])
@@ -86,6 +99,27 @@ def test_hessian_ordinal_logistic_nll():
         np.array([0.47, 0, 0]),
         decimal=5)
 
+def test_hessian_ordinal_logistic_nll_monotonic():
+    """
+    Testing at extreme values of y_pred where the resolution
+    of floating point arithmetic might fail
+    """
+    y_preds = np.linspace(0, 150, 100)
+    y_true = np.array([5] * 100)
+    theta = np.arange(0, 18, 2)
+    expected_max_mask = np.logical_and(y_preds > theta[4], y_preds < theta[5])
+    hessian = hessian_ordinal_logistic_nll(y_true, y_preds, theta)
+    np.testing.assert_almost_equal(hessian[expected_max_mask], hessian.max())
+
+    expected_max_indx = np.where(expected_max_mask)[0]
+    ascending = hessian[:expected_max_indx[0]]
+    assert ((ascending[1:] - ascending[:-1]) >= 0).all()
+
+    descending = hessian[expected_max_indx[0]:]
+    assert ((descending[1:] - descending[:-1]) <= 0).all()
+
+
 def test_lgb_ordinal_loss():
     y_preds = np.array([1.5, 15, -38])
     y_true = np.array([1, 2, 0])
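
Note: the two monotonicity tests probe the loss at extreme `y_pred` values. A complementary check that is common for custom gradient-boosting objectives is to compare the analytic gradient against central finite differences of the loss. A sketch, not part of this diff, assuming the call signatures used in the tests above (positional for the gradient, keyword-only for the decorated NLL):

```python
import numpy as np

from ordinalgbt.loss import gradient_ordinal_logistic_nll, ordinal_logistic_nll

# Finite-difference cross-check of the analytic gradient (illustrative only).
rng = np.random.default_rng(0)
y_preds = rng.normal(size=20)          # moderate values, away from the clip region
y_true = rng.integers(0, 10, size=20)  # 10 classes -> 9 thresholds
theta = np.arange(0, 18, 2, dtype=float)

grad = gradient_ordinal_logistic_nll(y_true, y_preds, theta)

h = 1e-6
fd = np.empty_like(y_preds)
for i in range(len(y_preds)):
    up, down = y_preds.copy(), y_preds.copy()
    up[i] += h
    down[i] -= h
    fd[i] = (
        ordinal_logistic_nll(y_true=y_true, y_preds=up, theta=theta)
        - ordinal_logistic_nll(y_true=y_true, y_preds=down, theta=theta)
    ) / (2 * h)

np.testing.assert_allclose(grad, fd, rtol=1e-4, atol=1e-6)
```
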