From f9ad8cf34735b6ba46678355acecf7960a71fc8b Mon Sep 17 00:00:00 2001 From: Santiago Suarez Date: Fri, 19 Jul 2024 20:36:40 -0300 Subject: [PATCH] Add documentation on the modelling stage, and change model used --- challenge/model.py | 8 +-- docs/challenge.md | 136 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 111 insertions(+), 33 deletions(-) diff --git a/challenge/model.py b/challenge/model.py index f978ca8..2b87b4d 100644 --- a/challenge/model.py +++ b/challenge/model.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd -from sklearn.linear_model import LogisticRegression from xgboost import XGBClassifier @@ -26,7 +25,7 @@ class DelayModel: THRESHOLD_IN_MINUTES = 15 def __init__(self): - self._model = LogisticRegression() + self._model = XGBClassifier() def _get_min_diff(self, data: pd.Series) -> float: """ @@ -133,10 +132,11 @@ def fit(self, features: pd.DataFrame, target: pd.DataFrame) -> None: # get values to compensate unbalancing n_y0 = len(target[target[target_column] == 0]) n_y1 = len(target[target[target_column] == 1]) + scale = n_y0 / n_y1 # instantiate model and fit - self._model = LogisticRegression( - class_weight={1: n_y0 / len(target), 0: n_y1 / len(target)} + self._model = XGBClassifier( + random_state=1, learning_rate=0.01, scale_pos_weight=scale ) self._model.fit(features, target[target_column]) diff --git a/docs/challenge.md b/docs/challenge.md index dffe313..e66f696 100644 --- a/docs/challenge.md +++ b/docs/challenge.md @@ -1,24 +1,89 @@ # Challange Notes -## Reviewing the notebook +Notes during the development of the challange. -### Generalities +## Part I Model selection and transcription -In the documentation (README) it is said that the data has a column named -DATA-I when talking about the additional DS features, but this column does not -exist. Instead, from the code and the description, we can assume this column -name should be FECHA-I. +### Reviewing the DS's notebook. -### Feedback to the DS +There are quite some issues with how the notebook was presented. The first issue +is that it not runs properly, due to the abscense of the `x=` and `y=` kwargs +missing in the barplots. A second issue is that the `.set` method of seaborn +was deprecated in favour of `.set_theme`; also, this method could only be called +once. And lastly, some subtitles in markdown did not corresponded to the cells +they had below along the First Sight on the data analysis. While not critical, +nor relevant to the result they do not give a good impression of the care taken +while summarising the DS's experiments into this notebook. These errors where +fixed to properly execute the notebook (alongside with the change from +`../data/data.csv` to `data/data.csv`). -I would like to ask a few things to the DS as it seems unclear why he did some -of the selections he did. I am supposing this is a kind of summary notebook, so -some details may have been missed in the condensation of his/her analysis. +#### Data analysis + +The colour lightblue is a really poor choice for a contrast between the graph +background and the bars. + +The "flight by destination" graph is so cramped up, its hard to see which bar +corresponds to which destination. + +#### Features generation + +The additional features the DS computed makes sense at first sight, but the +target doesn't match up with what was asked. If the idea was to predict the +**probability** of a flight being delayed or not, then regressor should be +trained, and this information on "how much delay" the flight had, could be +implemented into this encoding. This may have more to do with the modelling part +than with the feature generation, but it is very tightly coupled. Also, the +value of 15 minutes to be considered a delay is not appropriately justified. + +#### Data analysis II + +First cell of this part shows bad coding practices. + +#### Training + +`training_data` was defined in second cell but not used again. + +The selection of three features is done with no explanation whatsoever. A binary +encong of the `TIPOVUELO` may benefit the model by reducing the input dimension. +a cyclical encoding of the months could benefit the model by: reducing the input +dimension, and, redundantly, adding the cyclicality nature of the months to the +model. + +On the xgboost training, the DS added an artificial threshold to its outputs, +this wasn't needed as the model already outputs 0/1. It should also be a red +flag that the model predicted always 0, meaning that it was not capturing enough +the information in the dataset. This result could be used as an argument of the +unbalancing of the dataset (in addition to the Logistic Regression below). By no +means should this trained model used to get the most important features, as it +would take the features that mostly predict the 0 value. + +Notice that the top 10 features that are selected, do not match with the top 10 +features of the graph (at least in this run). + +No mention is done on this whatsoever, a comment on such calling result +should be done. + +#### Training II + +For a second round of training, the DS considered the most important features to +be more relevant than the balancing, and made the experimentation as such, +getting expected results when the balancing is not done. The results without +the balancing lose meaning in this context. + +Though, with the balancing, some better results are obtained. But, there is no +explanation whether this is enough or not for the buisness. I don't think that +should be explicilty in the DS's analysis, but would be a nice to have as a +conclusion of the work. + +#### Conclusions and next steps: + +I would send back an email to the DS, asking for some more explanations of the +results, and their interpretations. Pointing out some of the mistakes I've +found. Something of the sort: 1. I see that at the data splitting step, you decided to keep just three features. Why keep these features specifically? It seems to me that other -features, as the destination, may encode some information on the delay process; -but maybe I am missing something. +features, as the destination, may encode some information on the delay process. 2. On the target encoding, I understood that we wanted to predict the probability of a delay on a specific flight. Your encoding just predicts whether the flight would be more than 15 minutes delayed or not. Have you came up with @@ -34,30 +99,43 @@ this decision. 4. The 10 features you selected to train with, are not the top 10 I am seeing in the graph of feature importance from the xgboost. I think it might be due to some random see issue. Would you care to go over it? +5. Are these metrics enough from a business perspective? Or they are expected to +be improved on further iterations? Also pointed out some comments in the code, but wouldn't bother the DS with -them, as it was not the main focus of the work. +them, as it was not the main focus of the work, and I haven't found clear bugs +on the used prediction features, that's what I would use later. Though, I would +report any bugs in the computation of features if I found one. + +##### Model selection + +At the first iteration of the challange, I choose the LogisticRegression as the +model of choice, for its simplicity, ease of explainability (which feature +it gives more weight to), and because it is part of scikit learn, our training +framework. But given that there are a lot of unsolved mysteries, and we may need +more predictive power, I will now choose the xgboost model. To solve any +additional issue we might have early, and for easiness of retraining. With the +added benefit that we don't loose that much explainability. + +##### Feature selection + +I will use the features selected by the DS as-is, because it's the best baseline +we have and it does not compromises time-to-delivery on a first iteration. +Putting this list as a parameter and updating it if it needs to isn't too hard. +##### Code-wise -### How to continue +Implemented the functions as similar to what they are in the notebook as +possible, improving as much code as I could. -I will proceed by moving the pipeline the DS implemented here as similar as -possible to the production pipeline, as this is the best predictions we have -yet. Though, I will also try to make it as versatile as possible in the feature -selection stage, that I think was the one that may need a revision from his -side; though, without compromising the time-to-production of the system. +Added a code to be executed if the script was run with +`python challenge/model.py` to generate and save a trained model in the path: +`models/model.pkl`. Pickle is not the best format to use, due to compatibility +reasons, but as I just saved the LogisticRegression, and used it inside the same +class as it was generated in, I don't expect errors. And converting it to a more +general format (like ONNX) and then running it properly is not trivial. -#### On the model selection -The final model might be modified if another stage of experimenting would be -done by the DS but, as both linear regression and xgboost support the same -interface for prediction, it shouldn't be that much of an issue to change it -afterwards if implemented appropriately. -I will go with the linear regression, as it's execution time is determined only -by the number of features, and not on a highly tunned hyperparameter (as the -number of trees in xgboost). Also, it has the advantage that we can limit -ourselves to only one framework (scikit learn), and have less imcompatibility -issues when trying to move our model to production. ## Part II API developement