From fd480fe8b88aba342b30d6da5b752469046e643d Mon Sep 17 00:00:00 2001 From: fabioacl <33908066+fabioacl@users.noreply.github.com> Date: Wed, 13 Dec 2023 16:26:24 +0100 Subject: [PATCH] New material on `scikit-learn` library (#162) * Add skeleton for sklearn library notebook. * add scikit-learn dependency. * Add explanation texts to the library_sklearn.ipynb * Add explanation texts to the library_sklearn.ipynb * Adding more explanations and exercises to the librarty_sklearn.ipynb notebook * Correct basic_datatypes.ipynb * Add more exercises to the library_sklearn.ipynb notebook * Add Exercises entry to the Table of contents * Reformat notebook. * Add some prints and figures to the library_sklearn.ipynb notebook * Add reference to the library in the index.ipynb * Remove solutions from the notebook. --------- Co-authored-by: Aliaksandr Yakutovich --- basic_datatypes.ipynb | 2 +- binder/environment.yml | 1 + index.ipynb | 6 +- library_sklearn.ipynb | 885 +++++++++++++++++++++++++ tutorial/tests/test_library_sklearn.py | 277 ++++++++ 5 files changed, 1169 insertions(+), 2 deletions(-) create mode 100644 library_sklearn.ipynb create mode 100644 tutorial/tests/test_library_sklearn.py diff --git a/basic_datatypes.ipynb b/basic_datatypes.ipynb index 0cab454f..988f4676 100644 --- a/basic_datatypes.ipynb +++ b/basic_datatypes.ipynb @@ -2737,7 +2737,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.10.13" }, "vscode": { "interpreter": { diff --git a/binder/environment.yml b/binder/environment.yml index 0b5d15d2..7ae14e50 100644 --- a/binder/environment.yml +++ b/binder/environment.yml @@ -16,5 +16,6 @@ dependencies: - pytest-timeout - markdown - pre-commit + - scikit-learn - attrs - multiprocess diff --git a/index.ipynb b/index.ipynb index d8872e6e..5c5253ab 100644 --- a/index.ipynb +++ b/index.ipynb @@ -20,7 +20,11 @@ "- [Manage Python project](./manage_python_project.ipynb)\n", "- [Advanced functions](./functions_advanced.ipynb)\n", "- [Advanced Object-oriented programming](./object_oriented_programming_advanced.ipynb)\n", - "- [Parallelism and concurrency in Python](./threads.ipynb)\n" + "- [Parallelism and concurrency in Python](./threads.ipynb)\n", + "\n", + "# Libraries\n", + "\n", + "- [scikit-learn](./library_sklearn.ipynb)" ] } ], diff --git a/library_sklearn.ipynb b/library_sklearn.ipynb new file mode 100644 index 00000000..bedb05ff --- /dev/null +++ b/library_sklearn.ipynb @@ -0,0 +1,885 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Scikit-learn library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Table of Contents\n", + " - [Scikit-learn library](#Scikit-learn-library)\n", + " - [References](#References)\n", + " - [Introduction](#Introduction)\n", + " - [Dataset](#Dataset)\n", + " - [Load data](#Load-data)\n", + " - [Train-Validation-Test split](#Train-Validation-Test-split)\n", + " - [Data Preprocessing](#Data-Preprocessing)\n", + " - [Standardising data](#Standardising-data)\n", + " - [Dimensionality reduction](#Dimensionality-reduction)\n", + " - [Feature selection](#Feature-selection)\n", + " - [Feature projection](#Feature-projection)\n", + " - [Plot dataset](#Plot-dataset)\n", + " - [Exercises on dimensionality reduction](#Exercises-on-dimensionality-reduction)\n", + " - [Unsupervised learning](#Unsupervised-learning)\n", + " - [Exercises on unsupervised learning](#Exercises-on-unsupervised-learning)\n", + " - [Supervised 
learning](#Supervised-learning)\n", + " - [Classification](#Classification)\n", + " - [Logistic Regression](#Logistic-Regression)\n", + " - [Decision Tree](#Decision-Tree)\n", + " - [Evaluation](#Evaluation)\n", + " - [Regression](#Regression)\n", + " - [Load dataset and train-validation-test split](#Load-dataset-and-train-validation-test-split)\n", + " - [Data preprocessing and develop regressor using Pipeline object](#Data-preprocessing-and-develop-regressor-using-Pipeline-object)\n", + " - [Evaluation](#Evaluation)\n", + " - [Exercises on supervised learning](#Exercises-on-supervised-learning)\n", + " - [Exercises](#Exercises)\n", + " - [House pricing prediction](#House-pricing-prediction)\n", + " - [News classifier](#News-classifier)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Important modules\n", + "import numpy as np\n", + "import matplotlib\n", + "from matplotlib import pyplot as plt\n", + "# Reshape figures to a larger figsize\n", + "matplotlib.rcParams['figure.figsize'] = [14,14]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "Here are some additional references to guide you while self-learning:\n", + "\n", + "* Official documentation for [Datasets available in Scikit-learn](https://scikit-learn.org/stable/datasets/toy_dataset.html#toy-datasets)\n", + "* Official documentation for [Data splitting](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection)\n", + "* Official documentation for [Data preprocessing](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-data)\n", + "* Official documentation for [Feature selection](https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection)\n", + "* Official documentation for [Feature projection or decomposition](https://scikit-learn.org/stable/modules/decomposition.html)\n", + "* Official documentation for [Linear classifiers](https://scikit-learn.org/stable/modules/linear_model.html#classification)\n", + "* Official documentation for [Decision tree classifiers](https://scikit-learn.org/stable/modules/tree.html#classification)\n", + "* Official documentation for [Model selection and evaluation](https://scikit-learn.org/stable/model_selection.html#model-selection-and-evaluation)\n", + "* Official documentation for [Pipelines in Scikit-learn](https://scikit-learn.org/stable/modules/compose.html#build-a-pipeline)\n", + "* Official documentation for [Linear regression](https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html#linear-regression)\n", + "* Official documentation for [Agglomerative Clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html)\n", + "* Official documentation for [Support Vector Machines](https://scikit-learn.org/stable/modules/svm.html#support-vector-machines)\n", + "* Official documentation for [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#random-forests)\n", + "* [Machine Learning Mastery](https://machinelearningmastery.com/start-here/): A very detailed source of information about machine learning\n", + "* [Machine Learning Course](https://www.youtube.com/watch?v=jGwO_UgTS7I&list=PLoROMvodv4rMiGQp3WXShtMGgzqpfVfbU): From Stanford University\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "Scikit-learn is an open-source library 
in Python that is widely used for machine learning tasks such as data preprocessing, classification, regression, and clustering.\n",
+    "It is built on top of other scientific libraries such as NumPy, SciPy, and Matplotlib.\n",
+    "It provides a broad set of tools for preparing data and for building predictive models from it.\n",
+    "It is also user-friendly and efficient, for both beginners and experts in data science.\n",
+    "\n",
+    "This notebook covers tools for supervised learning (classification and regression) and unsupervised learning (clustering).\n",
+    "It therefore touches on some basic machine learning concepts.\n",
+    "However, the goal is not to teach the algorithms and procedures used in this field, but rather to give you an idea of what can be done with this Python library."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load data\n",
+    "\n",
+    "To explore the library, we first load a dataset from the list of toy datasets available in scikit-learn.\n",
+    "The first dataset we are going to use is the Breast Cancer Wisconsin (diagnostic) dataset.\n",
+    "It contains 569 samples with 30 features each; every sample is labelled as either malignant or benign."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn import datasets\n",
+    "data = datasets.load_breast_cancer()\n",
+    "features = data['data']\n",
+    "targets = data['target']\n",
+    "feature_names = data['feature_names']\n",
+    "target_names = data['target_names']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Train-Validation-Test split\n",
+    "\n",
+    "A train-validation-test split is fundamental when producing machine learning models.\n",
+    "Machine learning algorithms are very powerful and can easily memorise the data used for training.\n",
+    "Therefore, it is highly recommended to split the data into three sets: a training set, a validation set, and a test set.\n",
+    "The first is used to optimise the model.\n",
+    "The second is used for hyperparameter optimisation, i.e., we train the model several times with different settings and select the one that performs best on the validation set.\n",
+    "Finally, the model should be evaluated on an out-of-sample set (the test set) to estimate its performance in a situation similar to a real-life scenario.\n",
+    "\n",
+    "Using ```model_selection.train_test_split```, we first divide the dataset into train and test sets.\n",
+    "Then, we further divide the train set into a smaller train set and a validation set.\n",
+    "A common ratio between the sets is 60% for training, 20% for validation, and 20% for testing.\n",
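+    "\n",
+    "As a quick sanity check of these ratios (an illustrative sketch, not part of the original text): taking 25% of the remaining 80% for validation gives 0.25 × 0.80 = 0.20, i.e. 20% of the full dataset, which is why the second split below uses `test_size=0.25`.\n",
+    "\n",
+    "```python\n",
+    "# Illustrative arithmetic only; 569 is the sample count of this dataset\n",
+    "n_total = 569\n",
+    "n_test = round(0.20 * n_total)            # 114 samples -> ~20%\n",
+    "n_val = round(0.25 * (n_total - n_test))  # 114 samples -> ~20% of the original\n",
+    "n_train = n_total - n_test - n_val        # 341 samples -> ~60%\n",
+    "```"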
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn import model_selection\n",
+    "features_train, features_test, targets_train, targets_test = model_selection.train_test_split(features, targets, test_size=0.20, random_state=42)\n",
+    "features_train, features_val, targets_train, targets_val = model_selection.train_test_split(features_train, targets_train, test_size=0.25, random_state=42)\n",
+    "\n",
+    "print(f\"Complete Dataset: {features.shape}\")\n",
+    "print(f\"Train Dataset: {features_train.shape}\")\n",
+    "print(f\"Validation Dataset: {features_val.shape}\")\n",
+    "print(f\"Test Dataset: {features_test.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Preprocessing\n",
+    "\n",
+    "Before developing the models, the data should be preprocessed.\n",
+    "Keep in mind that toy datasets are usually already very clean and ready to use.\n",
+    "In real-life situations, however, data contain many errors that need to be handled.\n",
+    "Furthermore, datasets usually contain raw data, so feature extraction must be performed to obtain information that can be used to develop a machine learning model.\n",
+    "As machine learning is not the main focus of this workshop, we skip this task and use a dataset that already contains features."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Standardising data\n",
+    "\n",
+    "Standardisation is the process of rescaling the features so that they have a mean of 0 and a standard deviation of 1.\n",
+    "It is also known as Z-score normalisation.\n",
+    "The formula is given by\n",
+    "\n",
+    "$$\n",
+    "z = \\frac{x - \\mu}{\\sigma}\n",
+    "$$\n",
+    "\n",
+    "This procedure centres the data around the mean, which helps many algorithms converge faster during training, and it brings different variables onto a comparable scale, preventing features with larger values from dominating in certain algorithms or models.\n",
+    "Note that this scaling procedure should only be applied to numerical features; other techniques should be used for other types of features, e.g. categorical features.\n",
+    "\n",
+    "First, we compute the mean and the standard deviation of the train set using `StandardScaler().fit()`, and then we scale the different sets with `StandardScaler().transform()`.\n",
+    "Only the train set is used to fit the standard scaler, in order to avoid data leakage.\n",
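+    "\n",
+    "To convince yourself of what `StandardScaler` does, a minimal sketch (assuming `features_train` from the split above) is to apply the formula manually with NumPy and compare the results:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "from sklearn import preprocessing\n",
+    "\n",
+    "scaler = preprocessing.StandardScaler().fit(features_train)\n",
+    "manual = (features_train - features_train.mean(axis=0)) / features_train.std(axis=0)\n",
+    "print(np.allclose(manual, scaler.transform(features_train)))  # expected: True\n",
+    "```"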
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "standard_scaler = preprocessing.StandardScaler()\n", + "standard_scaler.fit(features_train)\n", + "features_train_standardised = standard_scaler.transform(features_train)\n", + "features_val_standardised = standard_scaler.transform(features_val)\n", + "features_test_standardised = standard_scaler.transform(features_test)\n", + "\n", + "feature_index = 0\n", + "\n", + "fig = plt.figure()\n", + "plt.suptitle(f\"Compare feature '{feature_names[feature_index]}' before and after standardisation\")\n", + "\n", + "# Data before standardisation\n", + "plt.subplot(1, 2, 1)\n", + "plt.hist(features_train[:,feature_index])\n", + "plt.title(f\"Feature '{feature_names[feature_index]}' (not standardised)\")\n", + "\n", + "# Data after standardisation\n", + "plt.subplot(1, 2, 2)\n", + "plt.hist(features_train_standardised[:,feature_index])\n", + "plt.title(f\"Feature '{feature_names[feature_index]}' (standardised)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dimensionality reduction\n", + "\n", + "#### Feature selection\n", + "\n", + "Feature selection is the process of choosing the most relevant and informative features from the available ones.\n", + "Its goal is to improve model performance, reduce overfitting, decrease computational overhead, and enhance interpretability.\n", + "There are different ways of performing it.\n", + "In this case, we use a filter method, which removes the less informative variables without considering the used machine learning models.\n", + "\n", + "Using `SelectKBest()` with the `f_classif` feature selection function, we select the best 10 features (randomly chosen number).\n", + "Again, for preparing the method, we use the train set.\n", + "Then, we use the already fitted model to transform the different sets.\n", + "Afterwards, using `SelectKBest().get_support()`, we get the indexes of the selected features (the most informative ones)." 
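+    "\n",
+    "Besides the selected indices, the fitted selector also exposes a score for every feature through its `scores_` attribute, which is handy for a quick ranking (an optional sketch, assuming the `select_best_features` object fitted in the next cell):\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "# Rank all features by their ANOVA F-score, highest first\n",
+    "ranking = np.argsort(select_best_features.scores_)[::-1]\n",
+    "for idx in ranking[:5]:\n",
+    "    print(feature_names[idx], select_best_features.scores_[idx])\n",
+    "```"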
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn import feature_selection\n",
+    "feature_selection_function = feature_selection.f_classif\n",
+    "select_best_features = feature_selection.SelectKBest(score_func = feature_selection_function, k = 10)\n",
+    "select_best_features.fit(features_train_standardised, targets_train)\n",
+    "features_train_selected = select_best_features.transform(features_train_standardised)\n",
+    "features_val_selected = select_best_features.transform(features_val_standardised)\n",
+    "features_test_selected = select_best_features.transform(features_test_standardised)\n",
+    "\n",
+    "selected_features_indexes = select_best_features.get_support(indices=True)\n",
+    "selected_features_names = feature_names[selected_features_indexes]\n",
+    "\n",
+    "print(f\"Selected features indexes: {selected_features_indexes}\")\n",
+    "print(f\"Selected features names: {selected_features_names}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Feature projection\n",
+    "\n",
+    "Feature projection, also known as dimensionality reduction, transforms the features into a new set of variables.\n",
+    "In this case, we are going to use Principal Component Analysis (PCA), which transforms the variables into orthogonal variables called principal components.\n",
+    "It is usually used to reduce the number of dimensions while retaining as much of the variability as possible, which facilitates data visualisation in lower-dimensional spaces.\n",
+    "However, note that once the features are transformed they are no longer interpretable, as the new features are just linear combinations of the original ones.\n",
+    "This technique may also be considered an unsupervised learning method, but we keep it here to make the notebook easier to follow.\n",
+    "\n",
+    "First, we choose the number of components that we want in the new set with `PCA(n_components = 2)`.\n",
+    "Then, we fit the model using the train set. Finally, we transform the different sets.\n",
+    "Using `PCA().explained_variance_ratio_`, we can check how much variance is explained by the computed components.\n",
+    "A rule of thumb is to keep at least 70-80% of the explained variance; otherwise, too much information about the original data is lost."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn import decomposition\n",
+    "pca_model = decomposition.PCA(n_components = 2)\n",
+    "pca_model.fit(features_train_selected)\n",
+    "features_train_reduced = pca_model.transform(features_train_selected)\n",
+    "features_val_reduced = pca_model.transform(features_val_selected)\n",
+    "features_test_reduced = pca_model.transform(features_test_selected)\n",
+    "explained_variance_ratio = pca_model.explained_variance_ratio_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Plot dataset\n",
+    "\n",
+    "After reducing the dimension of our dataset to just 2 variables, we can visualise the samples in a 2D plot."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "_ = plt.scatter(features_train_reduced[:,0], features_train_reduced[:,1], c = targets_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Exercises on dimensionality reduction\n",
+    "\n",
+    "1. 
Write a function that receives as input a list with already scaled features and targets and returns the indices of the best 5 features using ```SelectKBest``` with mutual information (classification) as scoring function.\n", + "2. Write a function that receives as input a list with already scaled features and targets and computes Principal Component Analysis with 5 components and returns the total explained variance ratio. Use ```random_state = 42```." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%reload_ext tutorial.tests.testsuite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%ipytest\n", + "\n", + "def solution_obtain_five_best_features(dataset):\n", + " from sklearn import feature_selection\n", + " # Write your code here.\n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%ipytest\n", + "\n", + "def solution_obtain_total_explained_variance_ratio(dataset):\n", + " from sklearn import decomposition\n", + " # Write your code here.\n", + " return" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unsupervised learning\n", + "\n", + "Unsupervised learning is a type of machine learning where the model is trained on unlabeled data.\n", + "The goal of this field is to discover hidden patterns, structures, or relationships within the data.\n", + "KMeans is a highly used unsupervised learning method.\n", + "It consists of grouping data points in such a way that points within the same cluster are more similar than those in the other clusters.\n", + "The centroid of each cluster is the average of all data points belonging to it.\n", + "\n", + "First, we define how many clusters we have in our model `KMeans(n_clusters = 2)`.\n", + "Then, we fit the model and get the assigned labels.\n", + "As we want to see the samples in a 2D plot, we use the projected features for training the clustering method.\n", + "To streamline the notebook, we exclusively use the train set for this task." 
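+    "\n",
+    "A fitted `KMeans` model can also assign clusters to data it has never seen via `predict` (a short optional sketch, assuming the `kmeans_model` fitted in the next cell and the reduced validation features computed earlier):\n",
+    "\n",
+    "```python\n",
+    "# Assign each validation sample to its nearest centroid\n",
+    "val_cluster_labels = kmeans_model.predict(features_val_reduced)\n",
+    "print(val_cluster_labels[:10])\n",
+    "print(kmeans_model.cluster_centers_)  # one centroid per cluster\n",
+    "```"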
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn import cluster\n", + "kmeans_model = cluster.KMeans(n_clusters = 2, random_state = 42)\n", + "kmeans_model.fit(features_train_reduced)\n", + "\n", + "targets_pred_train_cluster = kmeans_model.labels_\n", + "\n", + "# Lets see the result\n", + "true_positive_train = features_train_reduced[np.where((targets_train==1) & (targets_pred_train_cluster==1))[0]]\n", + "true_negative_train = features_train_reduced[np.where((targets_train==0) & (targets_pred_train_cluster==0))[0]]\n", + "false_positive_train = features_train_reduced[np.where((targets_train==0) & (targets_pred_train_cluster==1))[0]]\n", + "false_negative_train = features_train_reduced[np.where((targets_train==1) & (targets_pred_train_cluster==0))[0]]\n", + "plt.scatter(true_positive_train[:,0], true_positive_train[:,1], color = 'blue', marker = '.')\n", + "plt.scatter(true_negative_train[:,0], true_negative_train[:,1], color = 'green', marker = '.')\n", + "plt.scatter(false_positive_train[:,0], false_positive_train[:,1], color = 'blue', marker = 'x')\n", + "plt.scatter(false_negative_train[:,0], false_negative_train[:,1], color = 'green', marker = 'x')\n", + "plt.legend(['TP', 'TN', 'FP', 'FN'])\n", + "plt.title('Clustering data')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Exercises on unsupervised learning\n", + "\n", + "1. Write a function that receives as input a list with already scaled features and targets and returns the cluster indexes for all samples. Use agglomerative clustering with default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext tutorial.tests.testsuite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%ipytest\n", + "def solution_obtain_clustering_labels(dataset):\n", + " from sklearn import cluster\n", + " # Write your code here.\n", + " return" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Supervised learning\n", + "\n", + "Supervised learning is a type of machine learning where the algorithm learns patterns and relations directly from labeled training data.\n", + "It can be split in classification, in which the model predicts categories (e.g. image classification or spam filters) and in regression, in which the model predicts a continuous numerical output (e.g. house price prediction or time-series forecasting).\n", + "\n", + "#### Classification\n", + "\n", + "As aforementioned, classification is used for labeling samples to predefined classes.\n", + "Two highly used methods for performing this task are the logistic regression (linear classifier) and the decision tree (non-linear classifier).\n", + "Both are easily interpretable and do not require a significant amount of computational resources to be optimised.\n", + "We use the sets before the projection because we want to be able to interpret the models." 
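+    "\n",
+    "Both classifiers below follow scikit-learn's common estimator interface, so they are used with the same `fit`/`predict`/`score` pattern (a minimal sketch using the variables defined earlier; any other classifier could be dropped in):\n",
+    "\n",
+    "```python\n",
+    "from sklearn import linear_model\n",
+    "\n",
+    "model = linear_model.LogisticRegression()  # any estimator follows the same interface\n",
+    "model.fit(features_train_selected, targets_train)            # learn from labelled data\n",
+    "predictions = model.predict(features_val_selected)           # predict labels for new data\n",
+    "accuracy = model.score(features_val_selected, targets_val)   # mean accuracy for classifiers\n",
+    "```"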
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Logistic Regression\n", + "\n", + "First, we fit the model using the train set (```LogisticRegression().fit()```) and then using ```predict```, we predict the labels in the validation set.\n", + "This procedure is used for selecting the algorithm hyperparameters.\n", + "In this case, as it is not intended to get the best classifier overall, we concatenate the train and the validation sets and retrain the model.\n", + "After training the model, one can get the coefficients necessary to reconstruct the equation used for predicting the labels by calling the functions `LogisticRegression().intercept_` and `LogisticRegression().coef_`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn import linear_model\n", + "\n", + "logistic_regression_model = linear_model.LogisticRegression()\n", + "logistic_regression_model.fit(features_train_selected, targets_train)\n", + "\n", + "targets_pred_val = logistic_regression_model.predict(features_val_selected)\n", + "\n", + "# Change the classifier to get the best validation performance\n", + "complete_features_train_selected = np.vstack((features_train_selected, features_val_selected))\n", + "complete_targets_train = np.hstack((targets_train, targets_val))\n", + "logistic_regression_model.fit(complete_features_train_selected, complete_targets_train)\n", + "\n", + "targets_pred_test_lr = logistic_regression_model.predict(features_test_selected)\n", + "lr_intercept = logistic_regression_model.intercept_\n", + "lr_coefs = np.squeeze(logistic_regression_model.coef_)\n", + "\n", + "equation = \"logit(X) = %.2f + (%.2f x0) + (%.2f x1) + (%.2f x2) + (%.2f x3) + (%.2f x4) + (%.2f x5) + (%.2f x6) + (%.2f x7) + (%.2f x8) + (%.2f x9)\" % (lr_intercept[0], \n", + " lr_coefs[0], \n", + " lr_coefs[1], \n", + " lr_coefs[2], \n", + " lr_coefs[3], \n", + " lr_coefs[4], \n", + " lr_coefs[5], \n", + " lr_coefs[6], \n", + " lr_coefs[7], \n", + " lr_coefs[8], \n", + " lr_coefs[9])\n", + "print(equation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Decision Tree\n", + "\n", + "First, we fit the model using the train set (`DecisionTreeClassifier().fit()`) and then using `predict`, we predict the labels in the validation set.\n", + "Again, this procedure is used for selecting the algorithm hyperparameters.\n", + "In this case, as it is not intended to get the best classifier overall, we concatenate the train and the validation sets and retrain the model.\n", + "After training the model, one can draw the tree architecture and see what features were used and how they were split to classify the different classes, using the function `tree.plot_tree`." 
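+    "\n",
+    "The hyperparameter selection described above can be made explicit with a small validation loop (an illustrative sketch, assuming the selected feature sets from earlier; the `max_depth = 3` used in the next cell is simply one such choice):\n",
+    "\n",
+    "```python\n",
+    "from sklearn import metrics, tree\n",
+    "\n",
+    "for depth in [2, 3, 4, 5]:\n",
+    "    candidate = tree.DecisionTreeClassifier(max_depth=depth, random_state=42)\n",
+    "    candidate.fit(features_train_selected, targets_train)\n",
+    "    accuracy = metrics.accuracy_score(targets_val, candidate.predict(features_val_selected))\n",
+    "    print(depth, round(accuracy, 3))  # keep the depth with the best validation accuracy\n",
+    "```"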
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn import tree\n", + "\n", + "decision_tree_model = tree.DecisionTreeClassifier(max_depth = 3)\n", + "decision_tree_model.fit(features_train_selected, targets_train)\n", + "\n", + "targets_pred_val = decision_tree_model.predict(features_val_selected)\n", + "\n", + "complete_features_train_selected = np.vstack((features_train_selected, features_val_selected))\n", + "complete_targets_train = np.hstack((targets_train, targets_val))\n", + "decision_tree_model.fit(complete_features_train_selected, complete_targets_train)\n", + "\n", + "targets_pred_test_dt = decision_tree_model.predict(features_test_selected)\n", + "\n", + "tree.plot_tree(decision_tree_model, feature_names = selected_features_names, class_names = target_names, filled = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Evaluation\n", + "\n", + "After developing the classifiers, you should measure how well the models are performing in terms of correctness, accuracy, and reliability.\n", + "For that, confusion matrixes are usually used because they contain a lot of information inside, which you can use to calculate different metrics, such as accuracy, recall, precision, among others.\n", + "For that, `metrics.confusion_matrix(true_labels, predicted_labels)` is used.\n", + "Scikit-learn also contains a function that returns a small report about the models being evaluated (`metrics.classification_report(true_labels, predicted_labels)`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn import metrics\n", + "confusion_matrix_test_lr = metrics.confusion_matrix(targets_test, targets_pred_test_lr)\n", + "confusion_matrix_test_dt = metrics.confusion_matrix(targets_test, targets_pred_test_dt)\n", + "\n", + "print(\"Logistic Regression\")\n", + "print(metrics.classification_report(targets_test, targets_pred_test_lr, target_names = target_names))\n", + "\n", + "print(\"Decision Tree\")\n", + "print(metrics.classification_report(targets_test, targets_pred_test_dt, target_names = target_names))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Regression\n", + "\n", + "As mentioned earlier, regression is used for predicting continuous output values based on input features.\n", + "For performing regression, we are going to use linear regression.\n", + "This algorithm is simple, easily interpretable, and highly used for building regressors." 
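+    "\n",
+    "Concretely, a linear regression model with $p$ input features fits an equation of the form\n",
+    "\n",
+    "$$\n",
+    "\\hat{y} = \\beta_0 + \\beta_1 x_1 + \\dots + \\beta_p x_p\n",
+    "$$\n",
+    "\n",
+    "where $\\beta_0$ is the intercept and $\\beta_1, \\dots, \\beta_p$ are the coefficients; after fitting, these are exactly the values exposed as `intercept_` and `coef_`."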
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Load dataset and train-validation-test split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Load dataset\n",
+    "from sklearn import datasets\n",
+    "data = datasets.load_diabetes()\n",
+    "features = data['data']\n",
+    "targets = data['target']\n",
+    "feature_names = np.array(data['feature_names'])\n",
+    "\n",
+    "# Train-validation-test split\n",
+    "from sklearn import model_selection\n",
+    "features_train, features_test, targets_train, targets_test = model_selection.train_test_split(features, targets, test_size = 0.20, random_state = 42)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Data preprocessing and develop regressor using Pipeline object\n",
+    "\n",
+    "Unlike what was done earlier, here we use ```pipeline.Pipeline()``` to condense the scaler, the feature selection, and the regressor into a single object. This object is very useful for developing machine learning models: it makes the code more organised, prevents errors and data leakage, and can be incorporated into grid search functions, making it easier to perform hyperparameter tuning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn import preprocessing\n",
+    "from sklearn import feature_selection\n",
+    "from sklearn import linear_model\n",
+    "from sklearn import pipeline\n",
+    "\n",
+    "# Creating the pipeline\n",
+    "regression_pipeline = pipeline.Pipeline([\n",
+    "    ('scaler', preprocessing.StandardScaler()),\n",
+    "    ('feature_selection', feature_selection.SelectKBest(score_func = feature_selection.f_regression, k = 2)),\n",
+    "    ('regressor', linear_model.LinearRegression())\n",
+    "])\n",
+    "\n",
+    "# Fitting the pipeline on the training data\n",
+    "regression_pipeline.fit(features_train, targets_train)\n",
+    "\n",
+    "# Selected features names\n",
+    "selected_features_indexes = regression_pipeline['feature_selection'].get_support(indices=True)\n",
+    "selected_features_names = feature_names[selected_features_indexes]\n",
+    "\n",
+    "# Print linear regression equation\n",
+    "linreg_intercept = regression_pipeline['regressor'].intercept_\n",
+    "linreg_coefs = regression_pipeline['regressor'].coef_\n",
+    "equation = \"y = %.2f + (%.2f x0) + (%.2f x1)\" % (linreg_intercept, linreg_coefs[0], linreg_coefs[1])\n",
+    "print(equation)\n",
+    "\n",
+    "# Predicting on the test set\n",
+    "targets_pred_test_linreg = regression_pipeline.predict(features_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Evaluation\n",
+    "\n",
+    "After developing the regressor, you should measure how well the model performs.\n",
+    "For that, $R^2$ and the Root Mean Squared Error (RMSE) are commonly used.\n",
+    "$R^2$, or coefficient of determination, measures the proportion of variance in the target variable that is predictable from the independent variables.\n",
+    "Its best possible value is 1 (and it can even be negative for very poor models). It is computed using `metrics.r2_score(true_targets, predicted_targets)`.\n",
+    "RMSE is the square root of the mean squared error, providing an interpretable measure in the same units as the target variable.\n",
+    "It is computed using `metrics.mean_squared_error(true_targets, predicted_targets, squared = False)`.\n",
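+    "\n",
+    "For reference, the two metrics are defined as\n",
+    "\n",
+    "$$\n",
+    "\\mathrm{RMSE} = \\sqrt{\\frac{1}{n}\\sum_{i=1}^{n}(y_i - \\hat{y}_i)^2},\n",
+    "\\qquad\n",
+    "R^2 = 1 - \\frac{\\sum_{i=1}^{n}(y_i - \\hat{y}_i)^2}{\\sum_{i=1}^{n}(y_i - \\bar{y})^2}\n",
+    "$$\n",
+    "\n",
+    "where $y_i$ are the true targets, $\\hat{y}_i$ the predictions, and $\\bar{y}$ the mean of the true targets."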
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn import metrics\n",
+    "r2_score_linreg = metrics.r2_score(targets_test, targets_pred_test_linreg)\n",
+    "rmse_linreg = metrics.mean_squared_error(targets_test, targets_pred_test_linreg, squared = False)\n",
+    "\n",
+    "print(f'RMSE: {rmse_linreg:.2f}')\n",
+    "print(f'R-squared: {r2_score_linreg:.2f}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Exercises on supervised learning\n",
+    "\n",
+    "1. Write a function that receives as input a train set and a test set and returns the accuracy of the classifier that you developed, evaluated on the test set. Use Support Vector Machines (```sklearn.svm.SVC```) with `C` equal to 0.5 and a linear kernel. Do not forget to use ```random_state = 42```.\n",
+    "2. Write a function that receives as input a train set and a test set and returns the RMSE of the regressor that you developed, evaluated on the test set. Use Random Forest with the number of trees equal to 32. Do not forget to use ```random_state = 42```."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext tutorial.tests.testsuite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%ipytest\n",
+    "def solution_train_classifier_and_obtain_accuracy(train_set, test_set):\n",
+    "    features_train, targets_train = train_set\n",
+    "    features_test, targets_test = test_set\n",
+    "    # Your code starts here\n",
+    "    return\n",
+    "    # Your code ends here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%ipytest\n",
+    "def solution_train_regressor_and_obtain_rmse(train_set, test_set):\n",
+    "    features_train, targets_train = train_set\n",
+    "    features_test, targets_test = test_set\n",
+    "    # Your code starts here\n",
+    "    return\n",
+    "    # Your code ends here"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercises\n",
+    "\n",
+    "Here you can test the knowledge acquired in this notebook.\n",
+    "\n",
+    "Note: Use `random_state = 42` in all the methods that require a random state number."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### House pricing prediction\n",
+    "\n",
+    "The goal of this exercise is to compute the RMSE of a house pricing prediction model.\n",
+    "Using the California housing dataset, do the following tasks:\n",
+    "\n",
+    "* Divide the data into train and test sets using a 70/30 ratio;\n",
+    "* Normalise the data using Z-score;\n",
+    "* Select the best 3 features using mutual information for regression;\n",
+    "* Train a Random Forest regression model with 128 trees;\n",
+    "* Predict the market values for the test samples;\n",
+    "* Evaluate the model using the RMSE metric."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext tutorial.tests.testsuite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%ipytest\n",
+    "def solution_build_regressor_and_obtain_rmse():\n",
+    "    from sklearn import datasets, model_selection, pipeline, preprocessing, feature_selection, ensemble, metrics\n",
+    "    # Your code starts here\n",
+    "    return\n",
+    "    # Your code ends here"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### News classifier\n",
+    "\n",
+    "The goal of this exercise is to compute the weighted F1-score of a news classifier. Using the 20 newsgroups dataset, do the following tasks:\n",
+    "\n",
+    "* Divide the data into train, validation, and test sets using a 60/20/20 ratio;\n",
+    "* Obtain features using `sklearn.feature_extraction.text.TfidfVectorizer`.\n",
+    "Fit the method using the train set and then use it to convert all three sets into feature matrices.\n",
+    "Note: The matrices returned by this method are sparse matrices.\n",
+    "If you want to use numpy methods later, you have to convert them to numpy arrays first using `array_you_want_to_convert.toarray()`.\n",
+    "* There is no need to use a normaliser because the features are already in the same range;\n",
+    "* Select the best 100 features using the ANOVA F-value (`f_classif`) for classification;\n",
+    "* Train 5 Random Forest Classifier models with 16, 32, 64, 128, and 256 trees;\n",
+    "* Select the best one considering the validation set;\n",
+    "* Concatenate the train and validation sets;\n",
+    "* Train the best model with the new train set;\n",
+    "* Predict the news class for the test samples;\n",
+    "* Evaluate the model using the F1-score metric (use `average = 'weighted'`)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%ipytest\n", + "def solution_build_classifier_and_obtain_f1score():\n", + " import numpy as np\n", + " from sklearn import datasets, model_selection, pipeline, feature_extraction, feature_selection, ensemble, metrics\n", + " # Your code starts here\n", + " return\n", + " # Your code ends here" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/tests/test_library_sklearn.py b/tutorial/tests/test_library_sklearn.py new file mode 100644 index 00000000..586b38f3 --- /dev/null +++ b/tutorial/tests/test_library_sklearn.py @@ -0,0 +1,277 @@ +import numpy as np +import pytest +from sklearn import datasets, model_selection, preprocessing + + +def get_scaled_dataset(dataset_type: str): + if dataset_type == "classification": + data = datasets.load_breast_cancer() + else: + data = datasets.load_diabetes() + + features = data["data"] + targets = data["target"] + ( + features_train, + features_test, + targets_train, + targets_test, + ) = model_selection.train_test_split( + features, targets, test_size=0.20, random_state=42 + ) + ( + features_train, + features_test, + targets_train, + targets_test, + ) = model_selection.train_test_split( + features_train, targets_train, test_size=0.25, random_state=42 + ) + standard_scaler = preprocessing.StandardScaler() + standard_scaler.fit(features_train) + features_train_standardised = standard_scaler.transform(features_train) + features_test_standardised = standard_scaler.transform(features_test) + + return ( + features_train_standardised, + targets_train, + features_test_standardised, + targets_test, + ) + + +def reference_obtain_five_best_features(dataset): + from sklearn import feature_selection + + features, targets = dataset + select_kbest_model = feature_selection.SelectKBest( + score_func=feature_selection.mutual_info_classif, k=5 + ) + select_kbest_model.fit(features, targets) + return select_kbest_model.get_support(indices=True) + + +@pytest.mark.parametrize("dataset", [get_scaled_dataset("classification")]) +def test_obtain_five_best_features(dataset, function_to_test): + features_train, targets_train, _, _ = dataset + train_set = [features_train, targets_train] + assert np.array_equal( + reference_obtain_five_best_features(train_set), function_to_test(train_set) + ) + + +def reference_obtain_total_explained_variance_ratio(dataset): + from sklearn import decomposition + + features, _ = dataset + pca_model = decomposition.PCA(n_components=5, random_state=42) + pca_model.fit(features) + explained_variance_ratio_by_component = pca_model.explained_variance_ratio_ + total_explained_variance_ratio = sum(explained_variance_ratio_by_component) + return total_explained_variance_ratio + + +@pytest.mark.parametrize( + "dataset", [get_scaled_dataset("classification"), get_scaled_dataset("regression")] +) +def test_obtain_total_explained_variance_ratio(dataset, function_to_test): + features_train, targets_train, _, _ = dataset + train_set = [features_train, targets_train] + assert reference_obtain_total_explained_variance_ratio( + train_set + ) == 
function_to_test(train_set) + + +def reference_obtain_clustering_labels(dataset): + from sklearn import cluster + + features, _ = dataset + clustering_model = cluster.AgglomerativeClustering() + clustering_model.fit(features) + return clustering_model.labels_ + + +@pytest.mark.parametrize("dataset", [get_scaled_dataset("classification")]) +def test_obtain_clustering_labels(dataset, function_to_test): + features_train, targets_train, _, _ = dataset + train_set = [features_train, targets_train] + assert np.array_equal( + reference_obtain_clustering_labels(train_set), function_to_test(train_set) + ) + + +def reference_train_classifier_and_obtain_accuracy(train_set, test_set): + from sklearn import metrics, svm + + features_train, targets_train = train_set + features_test, targets_test = test_set + svm_model = svm.SVC(C=0.5, kernel="linear", random_state=42) + svm_model.fit(features_train, targets_train) + predicted_test = svm_model.predict(features_test) + accuracy_test = metrics.accuracy_score(targets_test, predicted_test) + return accuracy_test + + +@pytest.mark.parametrize("dataset", [get_scaled_dataset("classification")]) +def test_train_classifier_and_obtain_accuracy(dataset, function_to_test): + features_train, targets_train, features_test, targets_test = dataset + train_set = [features_train, targets_train] + test_set = [features_test, targets_test] + assert reference_train_classifier_and_obtain_accuracy( + train_set, test_set + ) == function_to_test(train_set, test_set) + + +def reference_train_regressor_and_obtain_rmse(train_set, test_set): + from sklearn import ensemble, metrics + + features_train, targets_train = train_set + features_test, targets_test = test_set + rfr_model = ensemble.RandomForestRegressor(n_estimators=32, random_state=42) + rfr_model.fit(features_train, targets_train) + predicted_test = rfr_model.predict(features_test) + rmse_test = metrics.mean_squared_error(targets_test, predicted_test, squared=False) + return rmse_test + + +@pytest.mark.parametrize("dataset", [get_scaled_dataset("regression")]) +def test_train_regressor_and_obtain_rmse(dataset, function_to_test): + features_train, targets_train, features_test, targets_test = dataset + train_set = [features_train, targets_train] + test_set = [features_test, targets_test] + assert reference_train_regressor_and_obtain_rmse( + train_set, test_set + ) == function_to_test(train_set, test_set) + + +def reference_build_regressor_and_obtain_rmse(): + from sklearn import ( + datasets, + ensemble, + feature_selection, + metrics, + model_selection, + pipeline, + preprocessing, + ) + + data = datasets.fetch_california_housing() + features = data["data"] + targets = data["target"] + data["feature_names"] + ( + features_train, + features_test, + targets_train, + targets_test, + ) = model_selection.train_test_split( + features, targets, test_size=0.3, random_state=42 + ) + + model_pipeline = pipeline.Pipeline( + [ + ("scaler", preprocessing.StandardScaler()), + ( + "feature_selector", + feature_selection.SelectKBest( + score_func=feature_selection.mutual_info_regression, k=3 + ), + ), + ( + "rfr_regressor", + ensemble.RandomForestRegressor(n_estimators=128, random_state=42), + ), + ] + ) + + model_pipeline.fit(features_train, targets_train) + + predicted_test = model_pipeline.predict(features_test) + + rmse_test = metrics.mean_squared_error(targets_test, predicted_test, squared=False) + + return rmse_test + + +def test_build_regressor_and_obtain_rmse(function_to_test): + assert reference_build_regressor_and_obtain_rmse() == 
function_to_test() + + +def reference_build_classifier_and_obtain_f1score(): + import numpy as np + from sklearn import ( + datasets, + ensemble, + feature_extraction, + feature_selection, + metrics, + model_selection, + pipeline, + ) + + data = datasets.fetch_20newsgroups() + texts = data["data"] + targets = data["target"] + ( + texts_train, + texts_test, + targets_train, + targets_test, + ) = model_selection.train_test_split(texts, targets, test_size=0.2, random_state=42) + + ( + texts_train, + texts_val, + targets_train, + targets_val, + ) = model_selection.train_test_split( + texts_train, targets_train, test_size=0.25, random_state=42 + ) + + features_pipeline = pipeline.Pipeline( + [ + ("feature_extraction", feature_extraction.text.TfidfVectorizer()), + ( + "feature_selector", + feature_selection.SelectKBest( + score_func=feature_selection.f_classif, k=100 + ), + ), + ] + ) + + features_train = features_pipeline.fit_transform(texts_train, targets_train) + features_train = features_train.toarray() + features_val = features_pipeline.transform(texts_val).toarray() + features_test = features_pipeline.transform(texts_test).toarray() + + range_nr_trees = [16, 32, 64, 128, 256] + validation_scores = [] + for nr_trees in range_nr_trees: + rfr_classifier = ensemble.RandomForestClassifier( + n_estimators=nr_trees, random_state=42 + ) + rfr_classifier.fit(features_train, targets_train) + predicted_val = rfr_classifier.predict(features_val) + f1score_val = metrics.f1_score(targets_val, predicted_val, average="weighted") + validation_scores.append(f1score_val) + + best_model_index = np.argmax(validation_scores) + + features_train = np.vstack((features_train, features_val)) + targets_train = np.hstack((targets_train, targets_val)) + + final_rfr_classifier = ensemble.RandomForestClassifier( + n_estimators=range_nr_trees[best_model_index], random_state=42 + ) + final_rfr_classifier.fit(features_train, targets_train) + + predicted_test = final_rfr_classifier.predict(features_test) + + f1score_test = metrics.f1_score(targets_test, predicted_test, average="weighted") + + return f1score_test + + +def test_build_classifier_and_obtain_f1score(function_to_test): + assert reference_build_classifier_and_obtain_f1score() == function_to_test()