From 415f861412d2e69d52f3078e209f26d0f20c636e Mon Sep 17 00:00:00 2001 From: Santiago Suarez Date: Fri, 19 Jul 2024 18:43:59 -0300 Subject: [PATCH 1/3] Update pandas version --- poetry.lock | 104 +++++++++++++++++++++++++++++++++---------------- pyproject.toml | 2 +- 2 files changed, 72 insertions(+), 34 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6deb3aa..c29609b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1913,45 +1913,72 @@ files = [ [[package]] name = "pandas" -version = "1.3.5" +version = "2.2.2" description = "Powerful data structures for data analysis, time series, and statistics" optional = false -python-versions = ">=3.7.1" -files = [ - {file = "pandas-1.3.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:62d5b5ce965bae78f12c1c0df0d387899dd4211ec0bdc52822373f13a3a022b9"}, - {file = "pandas-1.3.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:adfeb11be2d54f275142c8ba9bf67acee771b7186a5745249c7d5a06c670136b"}, - {file = "pandas-1.3.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:60a8c055d58873ad81cae290d974d13dd479b82cbb975c3e1fa2cf1920715296"}, - {file = "pandas-1.3.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd541ab09e1f80a2a1760032d665f6e032d8e44055d602d65eeea6e6e85498cb"}, - {file = "pandas-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2651d75b9a167cc8cc572cf787ab512d16e316ae00ba81874b560586fa1325e0"}, - {file = "pandas-1.3.5-cp310-cp310-win_amd64.whl", hash = "sha256:aaf183a615ad790801fa3cf2fa450e5b6d23a54684fe386f7e3208f8b9bfbef6"}, - {file = "pandas-1.3.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:344295811e67f8200de2390093aeb3c8309f5648951b684d8db7eee7d1c81fb7"}, - {file = "pandas-1.3.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:552020bf83b7f9033b57cbae65589c01e7ef1544416122da0c79140c93288f56"}, - {file = "pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5cce0c6bbeb266b0e39e35176ee615ce3585233092f685b6a82362523e59e5b4"}, - {file = "pandas-1.3.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d28a3c65463fd0d0ba8bbb7696b23073efee0510783340a44b08f5e96ffce0c"}, - {file = "pandas-1.3.5-cp37-cp37m-win32.whl", hash = "sha256:a62949c626dd0ef7de11de34b44c6475db76995c2064e2d99c6498c3dba7fe58"}, - {file = "pandas-1.3.5-cp37-cp37m-win_amd64.whl", hash = "sha256:8025750767e138320b15ca16d70d5cdc1886e8f9cc56652d89735c016cd8aea6"}, - {file = "pandas-1.3.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fe95bae4e2d579812865db2212bb733144e34d0c6785c0685329e5b60fcb85dd"}, - {file = "pandas-1.3.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f261553a1e9c65b7a310302b9dbac31cf0049a51695c14ebe04e4bfd4a96f02"}, - {file = "pandas-1.3.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b6dbec5f3e6d5dc80dcfee250e0a2a652b3f28663492f7dab9a24416a48ac39"}, - {file = "pandas-1.3.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3bc49af96cd6285030a64779de5b3688633a07eb75c124b0747134a63f4c05f"}, - {file = "pandas-1.3.5-cp38-cp38-win32.whl", hash = "sha256:b6b87b2fb39e6383ca28e2829cddef1d9fc9e27e55ad91ca9c435572cdba51bf"}, - {file = "pandas-1.3.5-cp38-cp38-win_amd64.whl", hash = "sha256:a395692046fd8ce1edb4c6295c35184ae0c2bbe787ecbe384251da609e27edcb"}, - {file = "pandas-1.3.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:bd971a3f08b745a75a86c00b97f3007c2ea175951286cdda6abe543e687e5f2f"}, - {file = "pandas-1.3.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37f06b59e5bc05711a518aa10beaec10942188dccb48918bb5ae602ccbc9f1a0"}, - {file = "pandas-1.3.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c21778a688d3712d35710501f8001cdbf96eb70a7c587a3d5613573299fdca6"}, - {file = "pandas-1.3.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3345343206546545bc26a05b4602b6a24385b5ec7c75cb6059599e3d56831da2"}, - {file = "pandas-1.3.5-cp39-cp39-win32.whl", hash = "sha256:c69406a2808ba6cf580c2255bcf260b3f214d2664a3a4197d0e640f573b46fd3"}, - {file = "pandas-1.3.5-cp39-cp39-win_amd64.whl", hash = "sha256:32e1a26d5ade11b547721a72f9bfc4bd113396947606e00d5b4a5b79b3dcb006"}, - {file = "pandas-1.3.5.tar.gz", hash = "sha256:1e4285f5de1012de20ca46b188ccf33521bff61ba5c5ebd78b4fb28e5416a9f1"}, +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, ] [package.dependencies] -numpy = {version = ">=1.21.0", markers = "python_version >= \"3.10\""} -python-dateutil = ">=2.7.3" -pytz = ">=2017.3" +numpy = {version = ">=1.22.4", markers = "python_version < \"3.11\""} +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" [package.extras] -test = ["hypothesis (>=3.58)", "pytest (>=6.0)", "pytest-xdist"] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", 
"pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] [[package]] name = "parso" @@ -2928,6 +2955,17 @@ files = [ {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + [[package]] name = "urllib3" version = "2.2.2" @@ -3239,4 +3277,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "~3.10" -content-hash = "6bcf5787c5c5ea9f571129e0b0bf8a96277ec9c9396ae13e085ae5e3bf8f3167" +content-hash = "cdd4dc4f62d0aec436717db61682e1e92468a3ecf808a1e6eef25388229f0e6e" diff --git a/pyproject.toml b/pyproject.toml index 084b0da..de46239 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ fastapi = "^0.111.0" pydantic = "~1.10.2" uvicorn = "~0.15.0" numpy = "~1.22.4" -pandas = "~1.3.5" +pandas = "^2.0.0" scikit-learn = "~1.3.0" [tool.poetry.group.dev.dependencies] From a68d6d44e1f6c915e9d57d65b07c52d6e47934ee Mon Sep 17 00:00:00 2001 From: Santiago Suarez Date: Fri, 19 Jul 2024 19:29:56 -0300 Subject: [PATCH 2/3] Improve model --- challenge/api.py | 5 +- challenge/model.py | 152 ++++++++++++++++++++++++--------------------- 2 files changed, 83 insertions(+), 74 deletions(-) diff --git a/challenge/api.py b/challenge/api.py index e18305a..5731c9c 100644 --- a/challenge/api.py +++ b/challenge/api.py @@ -126,9 +126,6 @@ async def post_predict(flight_data: FlightData) -> dict: preprocessed_data = model.preprocess(flight_data_df) # sorts column to feed the model - column_order = model._model.feature_names_in_ - preprocessed_data = preprocessed_data[column_order] - pred = model.predict(preprocessed_data) return {"predict": pred} @@ -138,5 +135,5 @@ async def post_predict(flight_data: FlightData) -> dict: with open("error_logs.txt", "a") as f: f.write(f"{datetime.now(timezone.utc)}: encounter error {e}") raise HTTPException( - status_code=500, detail="Internal server error during prediction" + status_code=500, detail="Internal server error during prediction." 
        )

diff --git a/challenge/model.py b/challenge/model.py
index 5358195..f978ca8 100644
--- a/challenge/model.py
+++ b/challenge/model.py
@@ -6,61 +6,91 @@
 import numpy as np
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
+from xgboost import XGBClassifier
 
-FEATURES_COLS = [
-    "OPERA_Latin American Wings",
-    "MES_7",
-    "MES_10",
-    "OPERA_Grupo LATAM",
-    "MES_12",
-    "TIPOVUELO_I",
-    "MES_4",
-    "MES_11",
-    "OPERA_Sky Airline",
-    "OPERA_Copa Air",
-]
+class DelayModel:
+    FEATURES_COLS = [
+        "MES_4",
+        "MES_7",
+        "MES_10",
+        "MES_11",
+        "MES_12",
+        "OPERA_Copa Air",
+        "OPERA_Grupo LATAM",
+        "OPERA_Latin American Wings",
+        "OPERA_Sky Airline",
+        "TIPOVUELO_I",
+    ]
+
+    THRESHOLD_IN_MINUTES = 15
 
-def get_min_diff(data):
-    fecha_o = datetime.strptime(data["Fecha-O"], "%Y-%m-%d %H:%M:%S")
-    fecha_i = datetime.strptime(data["Fecha-I"], "%Y-%m-%d %H:%M:%S")
-    min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
-    return min_diff
+    def __init__(self):
+        self._model = LogisticRegression()
 
+    def _get_min_diff(self, data: pd.Series) -> float:
+        """
+        Auxiliary function used to build the target.
 
-def get_delay_target(data: pd.DataFrame) -> pd.Series:
-    data["min_diff"] = data.apply(get_min_diff, axis=1)
-    threshold_in_minutes = 15
-    data["delay"] = np.where(data["min_diff"] > threshold_in_minutes, 1, 0)
+        Args:
+            data (pd.Series): raw data row.
 
-    return data["delay"].to_frame()
+        Returns:
+            float: difference between "Fecha-O" and "Fecha-I" in minutes.
+        """
+        fecha_o = datetime.strptime(data["Fecha-O"], "%Y-%m-%d %H:%M:%S")
+        fecha_i = datetime.strptime(data["Fecha-I"], "%Y-%m-%d %H:%M:%S")
+        min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
+        return min_diff
 
+    def _get_delay_target(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Compute and return target to train the model with, from raw data.
 
-def get_features(data: pd.DataFrame) -> pd.DataFrame:
-    # get the one hot enconding of the columns suggested by the DS
-    features = pd.concat(
-        [
-            pd.get_dummies(data["OPERA"], prefix="OPERA"),
-            pd.get_dummies(data["TIPOVUELO"], prefix="TIPOVUELO"),
-            pd.get_dummies(data["MES"], prefix="MES"),
-        ],
-        axis=1,
-    )
-    present_features = list(set(FEATURES_COLS).intersection(set(features.columns)))
-    missing_features = list(set(FEATURES_COLS).difference(set(features.columns)))
+        Args:
+            data (pd.DataFrame): raw data.
 
-    features = features[present_features]
+        Returns:
+            pd.DataFrame: target to predict.
+        """
+        data["min_diff"] = data.apply(self._get_min_diff, axis=1)
+        data["delay"] = np.where(data["min_diff"] > self.THRESHOLD_IN_MINUTES, 1, 0)
 
-    # fill missing features with 0 due to one-hot encoding of features
-    for feature in missing_features:
-        features[feature] = 0
+        return data["delay"].to_frame()
 
-    return features
+    def _get_features(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Compute and return input features to feed the model from raw data.
 
+        Args:
+            data (pd.DataFrame): raw data.
 
-class DelayModel:
-    def __init__(self):
-        self._model = LogisticRegression()
+        Returns:
+            pd.DataFrame: features with columns in a specific order.
+ """ + # get the one hot enconding of the columns suggested by the DS + # the existance of these three columns is enforced by the api above this code + features = pd.concat( + [ + pd.get_dummies(data["OPERA"], prefix="OPERA"), + pd.get_dummies(data["TIPOVUELO"], prefix="TIPOVUELO"), + pd.get_dummies(data["MES"], prefix="MES"), + ], + axis=1, + ) + valid_features = list( + set(self.FEATURES_COLS).intersection(set(features.columns)) + ) + missing_features = list( + set(self.FEATURES_COLS).difference(set(features.columns)) + ) + + # get valid features and fill missin with 0 due to one-hot encoding + features = features[valid_features] + features[missing_features] = 0 + + # return dataframe with sorted columns + return features[self.FEATURES_COLS] def preprocess( self, data: pd.DataFrame, target_column: Optional[str] = None @@ -78,20 +108,20 @@ def preprocess( pd.DataFrame: features. """ # retrieve features from the data - x = get_features(data) + x = self._get_features(data) # return different sets, depending on the target if target_column is None: return x elif target_column == "delay": - y = get_delay_target(data) + y = self._get_delay_target(data) return (x, y) else: raise NotImplementedError("Only implemented 'delay' as target column") def fit(self, features: pd.DataFrame, target: pd.DataFrame) -> None: """ - Fit model with preprocessed data. + Fit model with data preprocessed by this class. Args: features (pd.DataFrame): preprocessed data. @@ -130,7 +160,7 @@ def load_model(self, path: str): def predict(self, features: pd.DataFrame) -> List[int]: """ - Predict delays for new flights. + Predict delays for new flights on data preprocessed by this class. Args: features (pd.DataFrame): preprocessed data. @@ -146,38 +176,20 @@ def predict(self, features: pd.DataFrame) -> List[int]: return pred -if __name__ == "__main__": - from sklearn.metrics import classification_report, mean_squared_error - from sklearn.model_selection import train_test_split +def main(): + # perform a training of the model with all available data for production deployment - # perform a training of the model for production deployment + # get data and initial model model = DelayModel() data = pd.read_csv(filepath_or_buffer="data/data.csv") + # preprocess data and fit features, target = model.preprocess(data=data, target_column="delay") - - _, features_validation, _, target_validation = train_test_split( - features, target, test_size=0.33, random_state=42 - ) - model.fit(features=features, target=target) - predicted_target = model.predict(features_validation) - - report = classification_report( - target_validation, predicted_target, output_dict=True - ) - # save model.save_model("models") - # re instantiate to override model - model = DelayModel() - model.load_model("models") - - predicted_target_load = model.predict(features_validation) - print( - "The difference in prediction is:" - f" {mean_squared_error(predicted_target, predicted_target_load)}" - ) +if __name__ == "__main__": + main() From f9ad8cf34735b6ba46678355acecf7960a71fc8b Mon Sep 17 00:00:00 2001 From: Santiago Suarez Date: Fri, 19 Jul 2024 20:36:40 -0300 Subject: [PATCH 3/3] Add documentation on the modelling stage, and change model used --- challenge/model.py | 8 +-- docs/challenge.md | 136 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 111 insertions(+), 33 deletions(-) diff --git a/challenge/model.py b/challenge/model.py index f978ca8..2b87b4d 100644 --- a/challenge/model.py +++ b/challenge/model.py @@ -5,7 +5,6 @@ import numpy as np import 
-from sklearn.linear_model import LogisticRegression
 from xgboost import XGBClassifier
 
@@ -26,7 +25,7 @@ class DelayModel:
     THRESHOLD_IN_MINUTES = 15
 
     def __init__(self):
-        self._model = LogisticRegression()
+        self._model = XGBClassifier()
 
     def _get_min_diff(self, data: pd.Series) -> float:
         """
@@ -133,10 +132,11 @@ def fit(self, features: pd.DataFrame, target: pd.DataFrame) -> None:
         # get values to compensate unbalancing
         n_y0 = len(target[target[target_column] == 0])
         n_y1 = len(target[target[target_column] == 1])
+        scale = n_y0 / n_y1
 
         # instantiate model and fit
-        self._model = LogisticRegression(
-            class_weight={1: n_y0 / len(target), 0: n_y1 / len(target)}
+        self._model = XGBClassifier(
+            random_state=1, learning_rate=0.01, scale_pos_weight=scale
         )
         self._model.fit(features, target[target_column])
 
diff --git a/docs/challenge.md b/docs/challenge.md
index dffe313..e66f696 100644
--- a/docs/challenge.md
+++ b/docs/challenge.md
@@ -1,24 +1,89 @@
-# Challange Notes
+# Challenge Notes
 
-## Reviewing the notebook
+Notes taken during the development of the challenge.
 
-### Generalities
+## Part I Model selection and transcription
 
-In the documentation (README) it is said that the data has a column named
-DATA-I when talking about the additional DS features, but this column does not
-exist. Instead, from the code and the description, we can assume this column
-name should be FECHA-I.
+### Reviewing the DS's notebook
 
-### Feedback to the DS
+There are quite a few issues with how the notebook was presented. The first is
+that it does not run properly, due to the missing `x=` and `y=` kwargs in the
+barplots. A second issue is that seaborn's `.set` method was deprecated in
+favour of `.set_theme`; this method also only needs to be called once. And
+lastly, some markdown subtitles in the First Sight part of the data analysis
+did not correspond to the cells below them. While neither critical nor relevant
+to the result, these slips do not give a good impression of the care taken
+while summarising the DS's experiments into this notebook. The errors were
+fixed so the notebook executes properly (along with changing `../data/data.csv`
+to `data/data.csv`).
 
-I would like to ask a few things to the DS as it seems unclear why he did some
-of the selections he did. I am supposing this is a kind of summary notebook, so
-some details may have been missed in the condensation of his/her analysis.
+#### Data analysis
+
+The colour lightblue is a poor choice: it gives almost no contrast between the
+graph background and the bars.
+
+The "flight by destination" graph is so cramped that it is hard to tell which
+bar corresponds to which destination.
+
+#### Features generation
+
+The additional features the DS computed make sense at first sight, but the
+target does not match what was asked. If the idea was to predict the
+**probability** of a flight being delayed or not, then a regressor should have
+been trained, and the information on "how much delay" each flight had could be
+folded into this encoding. This may have more to do with the modelling part
+than with feature generation, but the two are tightly coupled. Also, the
+15-minute threshold for considering a flight delayed is not justified anywhere.
+
+#### Data analysis II
+
+The first cell of this part shows bad coding practices.
+
+#### Training
+
+`training_data` is defined in the second cell but never used again.
+
+The selection of three features is done with no explanation whatsoever. A
+binary encoding of `TIPOVUELO` could benefit the model by reducing the input
+dimension, and a cyclical encoding of the months could help both by reducing
+the input dimension and by exposing the cyclical nature of the months to the
+model, as sketched below.
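+
+As a rough sketch of what that could look like (my illustration, not code from
+the DS's notebook), assuming the raw data carries `TIPOVUELO` as "I"/"N" and
+`MES` as an integer month in 1-12:
+
+```python
+import numpy as np
+import pandas as pd
+
+# hypothetical raw data with the same columns the notebook uses
+data = pd.DataFrame({"TIPOVUELO": ["I", "N"], "MES": [1, 7]})
+
+# binary encoding: one column instead of two one-hot columns
+data["TIPOVUELO_I"] = (data["TIPOVUELO"] == "I").astype(int)
+
+# cyclical encoding: two columns instead of twelve, and December lands
+# next to January instead of being eleven "units" away
+data["MES_sin"] = np.sin(2 * np.pi * data["MES"] / 12)
+data["MES_cos"] = np.cos(2 * np.pi * data["MES"] / 12)
+```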
A binary +encong of the `TIPOVUELO` may benefit the model by reducing the input dimension. +a cyclical encoding of the months could benefit the model by: reducing the input +dimension, and, redundantly, adding the cyclicality nature of the months to the +model. + +On the xgboost training, the DS added an artificial threshold to its outputs, +this wasn't needed as the model already outputs 0/1. It should also be a red +flag that the model predicted always 0, meaning that it was not capturing enough +the information in the dataset. This result could be used as an argument of the +unbalancing of the dataset (in addition to the Logistic Regression below). By no +means should this trained model used to get the most important features, as it +would take the features that mostly predict the 0 value. + +Notice that the top 10 features that are selected, do not match with the top 10 +features of the graph (at least in this run). + +No mention is done on this whatsoever, a comment on such calling result +should be done. + +#### Training II + +For a second round of training, the DS considered the most important features to +be more relevant than the balancing, and made the experimentation as such, +getting expected results when the balancing is not done. The results without +the balancing lose meaning in this context. + +Though, with the balancing, some better results are obtained. But, there is no +explanation whether this is enough or not for the buisness. I don't think that +should be explicilty in the DS's analysis, but would be a nice to have as a +conclusion of the work. + +#### Conclusions and next steps: + +I would send back an email to the DS, asking for some more explanations of the +results, and their interpretations. Pointing out some of the mistakes I've +found. Something of the sort: 1. I see that at the data splitting step, you decided to keep just three features. Why keep these features specifically? It seems to me that other -features, as the destination, may encode some information on the delay process; -but maybe I am missing something. +features, as the destination, may encode some information on the delay process. 2. On the target encoding, I understood that we wanted to predict the probability of a delay on a specific flight. Your encoding just predicts whether the flight would be more than 15 minutes delayed or not. Have you came up with @@ -34,30 +99,43 @@ this decision. 4. The 10 features you selected to train with, are not the top 10 I am seeing in the graph of feature importance from the xgboost. I think it might be due to some random see issue. Would you care to go over it? +5. Are these metrics enough from a business perspective? Or they are expected to +be improved on further iterations? Also pointed out some comments in the code, but wouldn't bother the DS with -them, as it was not the main focus of the work. +them, as it was not the main focus of the work, and I haven't found clear bugs +on the used prediction features, that's what I would use later. Though, I would +report any bugs in the computation of features if I found one. + +##### Model selection + +At the first iteration of the challange, I choose the LogisticRegression as the +model of choice, for its simplicity, ease of explainability (which feature +it gives more weight to), and because it is part of scikit learn, our training +framework. But given that there are a lot of unsolved mysteries, and we may need +more predictive power, I will now choose the xgboost model. 
+
+##### Feature selection
+
+I will use the features selected by the DS as-is, because they are the best
+baseline we have and they do not compromise time-to-delivery on a first
+iteration. Keeping this list as a parameter and updating it when needed is not
+hard.
+
+##### Code-wise
 
-### How to continue
+I implemented the functions as close to what they are in the notebook as
+possible, improving the code where I could.
 
-I will proceed by moving the pipeline the DS implemented here as similar as
-possible to the production pipeline, as this is the best predictions we have
-yet. Though, I will also try to make it as versatile as possible in the feature
-selection stage, that I think was the one that may need a revision from his
-side; though, without compromising the time-to-production of the system.
+I also added code that runs when the script is executed as
+`python challenge/model.py`; it trains a model on all the available data and
+saves it to `models/model.pkl`. Pickle is not the best format to use, for
+compatibility reasons, but since the model is saved and loaded back inside the
+same class that generated it, I don't expect errors. Converting it to a more
+general format (like ONNX) and then serving it properly is not trivial.
 
-#### On the model selection
-
-The final model might be modified if another stage of experimenting would be
-done by the DS but, as both linear regression and xgboost support the same
-interface for prediction, it shouldn't be that much of an issue to change it
-afterwards if implemented appropriately.
-
-I will go with the linear regression, as it's execution time is determined only
-by the number of features, and not on a highly tunned hyperparameter (as the
-number of trees in xgboost). Also, it has the advantage that we can limit
-ourselves to only one framework (scikit learn), and have less imcompatibility
-issues when trying to move our model to production.
 
-## Part II API developement
+## Part II API development
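+
+For reference, a hypothetical call to the prediction endpoint shown in the
+api.py patch above (the `/predict` path, the port, and the exact `FlightData`
+payload shape are my assumptions, not confirmed by the fragment; the handler
+returns `{"predict": ...}`):
+
+```python
+import requests
+
+# assumes the API is served locally, e.g. with uvicorn
+payload = {"OPERA": "Grupo LATAM", "TIPOVUELO": "I", "MES": 7}
+response = requests.post("http://localhost:8000/predict", json=payload)
+print(response.json())  # e.g. {"predict": [0]}
+```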