From 50b496e6a63ac554f0e259c6dbfb2debfde4f8a8 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Thu, 25 Feb 2021 22:00:29 -0500 Subject: [PATCH 01/21] Ajout fonctionnel de techniques avec plusieurs datasets. --- deslib/multi_datasets.py | 318 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 318 insertions(+) create mode 100644 deslib/multi_datasets.py diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py new file mode 100644 index 0000000..150a1fe --- /dev/null +++ b/deslib/multi_datasets.py @@ -0,0 +1,318 @@ +# coding=utf-8 + +# Author: Rafael Menelau Oliveira e Cruz +# +# License: BSD 3 clause + +import copy +import numpy as np +from scipy.stats import mode +from sklearn.utils.validation import (check_is_fitted, check_array) + +from deslib.base import BaseDS +from deslib.dcs.base import BaseDCS +from deslib.des.base import BaseDES +from deslib.util.aggregation import (weighted_majority_voting_rule, + majority_voting_rule, + aggregate_proba_ensemble_weighted) +from deslib.util.instance_hardness import hardness_region_competence + +# Créer à partir de KNORA-U +class MultiDatasets(BaseDS): + def __init__(self, ds_classifier, pool_classifiers=None): + super(MultiDatasets, self).__init__(pool_classifiers) + self.ds_classifier = ds_classifier + + def fit(self, X, y): + """ + Parameters + ---------- + X : array of shape (n_datasets, n_samples, n_features) + The input data. + + y : array of shape (n_datasets, n_samples) + class labels of each example in X. 
+ """ + n_datasets = len(X) + self.ds_classifiers = [] + for i in range(n_datasets): + ds_classifier = copy.deepcopy(self.ds_classifier) + ds_classifier.pool_classifiers = self.pool_classifiers[i] + ds_classifier.fit(X[i], y[i]) + self.ds_classifiers.append(ds_classifier) + self._setup_label_encoder(y[0]) + + def predict(self, X): + merged_base_probabilities = [] + merged_base_predictions = [] + n_datasets = len(X) + for i in range(n_datasets): + base_probabilities, base_predictions = \ + self._get_base_proba_and_pred(self.ds_classifiers[i], X[i]) + merged_base_probabilities.append(base_probabilities) + merged_base_predictions.append(base_predictions) + + if merged_base_probabilities[0] is not None: + merged_base_probabilities = np.concatenate( + merged_base_probabilities, axis=1) + else: + merged_base_probabilities = None + merged_base_predictions = np.concatenate( + merged_base_predictions, axis=1) + + n_samples = len(X[0]) + predicted_labels = np.empty(n_samples, dtype=np.intp) + + all_agree_vector = BaseDS._all_classifier_agree(merged_base_predictions) + ind_all_agree = np.where(all_agree_vector)[0] + + # Since the predictions are always the same, get the predictions of the + # first base classifier. + if ind_all_agree.size: + predicted_labels[ind_all_agree] = merged_base_predictions[ + ind_all_agree, 0] + + # For the samples with disagreement, perform the dynamic selection + # steps. 
First step is to collect the samples with disagreement + # between base classifiers + ind_disagreement = np.where(~all_agree_vector)[0] + if ind_disagreement.size: + merged_left_base_predictions = [] + merged_competences = [] + for i in range(n_datasets): + ds_classifier = self.ds_classifiers[i] + X_DS = X[i][ind_disagreement, :] + base_probabilities, base_predictions = \ + self._get_base_proba_and_pred(ds_classifier, X[i]) + + # If the method is based on clustering and does not use IH there + # is no need to compute the Neighbors + if hasattr(self.ds_classifier, "clustering_") \ + and not ds_classifier.with_IH: + distances = neighbors = None + else: + # Then, we estimate the nearest neighbors for all samples + # that we need to call DS routines + distances, neighbors = \ + ds_classifier._get_region_competence(X_DS) + + if ds_classifier.with_IH: + raise ValueError("TODO: traiter avec tous les inputs.") + ind_ds_classifier, predicted_labels, neighbors, \ + distances = \ + self._calculate_hardness_level(ds_classifier, + ind_disagreement, + predicted_labels, + neighbors, + distances) + else: + # IH was not considered. So all samples with disagreement + # are passed down to the DS algorithm + ind_ds_classifier = np.arange(ind_disagreement.size) + + # At this stage the samples which all base classifiers agrees or + # that are associated with low hardness were already classified. + # The remaining samples are now passed down to the DS techniques + # for classification. + + # First check whether there are still samples to be classified. + if ind_ds_classifier.size: + # IF the DFP pruning is considered, calculate the DFP mask + # for all samples in X + DFP_mask = self._get_DFP_mask( + ds_classifier, ind_ds_classifier, neighbors) + + # Get the real indices_ of the samples that will be classified + # using a DS algorithm. 
+ ind_ds_original_matrix = ind_disagreement[ind_ds_classifier] + + if ds_classifier.needs_proba: + selected_probabilities = base_probabilities[ + ind_ds_original_matrix] + else: + selected_probabilities = None + + competences = self._get_competences( + ds_classifier, + X_DS[ind_ds_classifier], + base_predictions[ind_ds_original_matrix], + selected_probabilities, + neighbors=neighbors, + distances=distances, + DFP_mask=DFP_mask) + + merged_competences.append(competences) + merged_left_base_predictions.append(base_predictions[ + ind_ds_original_matrix]) + + merged_left_base_predictions = np.concatenate( + merged_left_base_predictions, axis=1) + merged_competences = np.concatenate(merged_competences, axis=1) + + if issubclass(type(self.ds_classifier), BaseDCS): + pred_ds = self._get_dcs_predicted_label(self.ds_classifier, + merged_left_base_predictions, merged_competences) + elif issubclass(type(self.ds_classifier), BaseDES): + pred_ds = self._get_des_predicted_label(self.ds_classifier, + merged_left_base_predictions, merged_competences) + + predicted_labels[ind_ds_original_matrix] = pred_ds + + return self.classes_.take(predicted_labels) + + def predict_proba(self, X): + raise ValueError("Méthode incomplète!") + + def _get_base_proba_and_pred(self, ds_classifier, X): + # Check if the DS model was trained + check_is_fitted(ds_classifier, + ["DSEL_processed_", "DSEL_data_", "DSEL_target_"]) + + # Check if X is a valid input + X = check_array(X) + ds_classifier._check_num_features(X) + + if self.needs_proba: + base_probabilities = ds_classifier._predict_proba_base(X) + base_predictions = base_probabilities.argmax(axis=2) + else: + base_probabilities = None + base_predictions = ds_classifier._predict_base(X) + + return base_probabilities, base_predictions + + def _calculate_hardness_level(self, ds_classifier, ind_disagreement, + predicted_labels, neighbors, distances): + """ + This function exists so parameters represent all datasets. 
+ This code has been copied, but "self" has been replaced by + "ds_classifier" because this function doesn't exist elsewhere. + """ + self = ds_classifier + # if IH is used, calculate the hardness level associated with + # each sample + hardness = hardness_region_competence(neighbors, + self.DSEL_target_, + self.safe_k) + + # Get the index associated with the easy and hard samples. + # Samples with low hardness are passed down to the knn + # classifier while samples with high hardness are passed down + # to the DS methods. So, here we split the samples that are + # passed to down to each stage by calculating their indices_. + easy_samples_mask = hardness < self.IH_rate + ind_knn_classifier = np.where(easy_samples_mask)[0] + ind_ds_classifier = np.where(~easy_samples_mask)[0] + + if ind_knn_classifier.size: + # all samples with low hardness should be classified by + # the knn method here: + # First get the class associated with each neighbor + y_neighbors = self.DSEL_target_[ + neighbors[ind_knn_classifier, :self.safe_k]] + + # Accessing which samples in the original matrix are + # associated with the low instance hardness indices_. 
This + # is important since the low hardness indices + # ind_knn_classifier was estimated based on a subset + # of samples + ind_knn_original_matrix = ind_disagreement[ind_knn_classifier] + prediction_knn, _ = mode(y_neighbors, axis=1) + predicted_labels[ + ind_knn_original_matrix] = prediction_knn.reshape(-1, ) + + # Remove from the neighbors and distance matrices the + # samples that were classified using the KNN + neighbors = np.delete(neighbors, ind_knn_classifier,axis=0) + distances = np.delete(distances, ind_knn_classifier,axis=0) + + return ind_ds_classifier, predicted_labels, neighbors, distances + + def _get_DFP_mask(self, ds_classifier, ind_ds_classifier, neighbors): + if ds_classifier.DFP: + DFP_mask = frienemy_pruning_preprocessed( + neighbors, + ds_classifier.DSEL_target_, + ds_classifier.DSEL_processed_) + else: + DFP_mask = np.ones( + (ind_ds_classifier.size, ds_classifier.n_classifiers_)) + + def _get_competences(self, ds_classifier, query, predictions, + probabilities=None, neighbors=None, distances=None, + DFP_mask=None): + if query.ndim < 2: + query = query.reshape(1, -1) + + if predictions.ndim < 2: + predictions = predictions.reshape(1, -1) + + if query.shape[0] != predictions.shape[0]: + raise ValueError( + 'The arrays query and predictions must have the same number' + ' of samples. query.shape is {}' + 'and predictions.shape is {}'.format(query.shape, + predictions.shape)) + + if self.needs_proba: + competences = ds_classifier.estimate_competence_from_proba( + query, + neighbors=neighbors, + distances=distances, + probabilities=probabilities) + else: + competences = ds_classifier.estimate_competence( + query, + neighbors=neighbors, + distances=distances, + predictions=predictions) + + if self.DFP: competences = competences * DFP_mask + + return competences + + def _get_dcs_predicted_label(self, ds_classifier, predictions, competences): + """ + This function exists so parameters represent all datasets. 
+ This code has been copied, but "self" has been replaced by + "ds_classifier" because this function doesn't exist elsewhere. + """ + self = ds_classifier + if self.selection_method != 'all': + # only one classifier is selected + clf_index = self.select(competences) + predicted_label = predictions[ + np.arange(predictions.shape[0]), clf_index] + else: + # Selected ensemble of classifiers is combined using Majority + # Voting + indices = self.select(competences) + votes = np.ma.MaskedArray(predictions, ~indices) + predicted_label = majority_voting_rule(votes) + + return predicted_label + + def _get_des_predicted_label(self, ds_classifier, predictions, competences): + """ + This function exists so parameters represent all datasets. + This code has been copied, but "self" has been replaced by + "ds_classifier" because this function doesn't exist elsewhere. + """ + self = ds_classifier + if self.mode == "selection": + # The selected_classifiers matrix is used as a mask to remove + # the predictions of certain base classifiers. + selected_classifiers = ds_classifier.select(competences) + votes = np.ma.MaskedArray(predictions, ~selected_classifiers) + predicted_label = majority_voting_rule(votes) + elif self.mode == "weighting": + votes = np.atleast_2d(predictions) + predicted_label = weighted_majority_voting_rule(votes, competences, + np.arange(ds_classifier.n_classes_)) + else: + selected_classifiers = ds_classifier.select(competences) + votes = np.ma.MaskedArray(predictions, ~selected_classifiers) + predicted_label = weighted_majority_voting_rule(votes, competences, + np.arange(ds_classifier.n_classes_)) + + return predicted_label From e555206f76776b67797bae0fa3efa34ed4b6e82c Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Fri, 26 Feb 2021 19:41:48 -0500 Subject: [PATCH 02/21] Ajout de l'oracle avec plusieurs datasets. 
--- deslib/multi_datasets.py | 43 +++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index 150a1fe..f034487 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -12,6 +12,7 @@ from deslib.base import BaseDS from deslib.dcs.base import BaseDCS from deslib.des.base import BaseDES +from deslib.static.oracle import Oracle from deslib.util.aggregation import (weighted_majority_voting_rule, majority_voting_rule, aggregate_proba_ensemble_weighted) @@ -42,16 +43,20 @@ class labels of each example in X. self.ds_classifiers.append(ds_classifier) self._setup_label_encoder(y[0]) - def predict(self, X): + def predict(self, X, y=None): + if issubclass(type(self.ds_classifier), Oracle): + return self._predict_oracle(X, y) + merged_base_probabilities = [] merged_base_predictions = [] n_datasets = len(X) + for i in range(n_datasets): base_probabilities, base_predictions = \ self._get_base_proba_and_pred(self.ds_classifiers[i], X[i]) merged_base_probabilities.append(base_probabilities) merged_base_predictions.append(base_predictions) - + if merged_base_probabilities[0] is not None: merged_base_probabilities = np.concatenate( merged_base_probabilities, axis=1) @@ -59,13 +64,13 @@ def predict(self, X): merged_base_probabilities = None merged_base_predictions = np.concatenate( merged_base_predictions, axis=1) - + n_samples = len(X[0]) predicted_labels = np.empty(n_samples, dtype=np.intp) - + all_agree_vector = BaseDS._all_classifier_agree(merged_base_predictions) ind_all_agree = np.where(all_agree_vector)[0] - + # Since the predictions are always the same, get the predictions of the # first base classifier. 
if ind_all_agree.size: @@ -160,6 +165,34 @@ def predict(self, X): return self.classes_.take(predicted_labels) + def _predict_oracle(self, X, y): + n_datasets = len(X) + predicted_labels = -np.ones(y.size, dtype=int) + + for sample_index in range(len(y)): + predictions = [] + + for i in range(n_datasets): + classifier = self.ds_classifiers[i] + X[i] = check_array(X[i]) + y = classifier.enc_.transform(y) + x_sample = X[i][sample_index] + y_sample = y[sample_index] + + for clf in classifier.pool_classifiers_: + predictions.append(clf.predict(x_sample.reshape(1, -1))[0]) + + for p in predictions: + # If one base classifier predicts the correct answer, + # consider as a correct prediction + if p == y_sample: + p = int(p) + predicted_labels[sample_index] = p + break + predicted_labels[sample_index] = p + + return self.classes_.take(predicted_labels) + def predict_proba(self, X): raise ValueError("Méthode incomplète!") From 0daac6af8252ef8a65831508693cfd7ef7eb2360 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Fri, 26 Feb 2021 23:13:43 -0500 Subject: [PATCH 03/21] =?UTF-8?q?Ajout=20de=20commentaires,=20ds=5Fclassif?= =?UTF-8?q?iers=20remplac=C3=A9s=20par=20self.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/multi_datasets.py | 91 ++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index f034487..7808f0d 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -20,12 +20,24 @@ # Créer à partir de KNORA-U class MultiDatasets(BaseDS): - def __init__(self, ds_classifier, pool_classifiers=None): + def __init__(self, ds_classifier, pool_classifiers): + """ + Parameters + ---------- + ds_classifier : classifier from the library + The DS model serves as a template for all the datasets. + + pool_classifiers : array of shape (n_datasets, n_classifiers) + Classifiers of each dataset. 
+ """ super(MultiDatasets, self).__init__(pool_classifiers) self.ds_classifier = ds_classifier def fit(self, X, y): - """ + """Prepare the DS models by setting the KNN algorithm and + pre-processing the information required to apply the DS + methods + Parameters ---------- X : array of shape (n_datasets, n_samples, n_features) @@ -36,17 +48,33 @@ class labels of each example in X. """ n_datasets = len(X) self.ds_classifiers = [] + for i in range(n_datasets): ds_classifier = copy.deepcopy(self.ds_classifier) ds_classifier.pool_classifiers = self.pool_classifiers[i] ds_classifier.fit(X[i], y[i]) self.ds_classifiers.append(ds_classifier) + self._setup_label_encoder(y[0]) + return self def predict(self, X, y=None): + """ + Parameters + ---------- + X : array of shape (n_datasets, n_samples, n_features) + The input data. + + y : array of shape (n_datasets, n_samples) + class labels of each example in X. + It's added as a parameter so Oracle can be used. + """ + # Oracle is not a DS model, so there is no need to execute + # the code below. if issubclass(type(self.ds_classifier), Oracle): return self._predict_oracle(X, y) + # TODO: move the code below in a function called _predict_ds(X). 
merged_base_probabilities = [] merged_base_predictions = [] n_datasets = len(X) @@ -149,20 +177,20 @@ def predict(self, X, y=None): merged_competences.append(competences) merged_left_base_predictions.append(base_predictions[ ind_ds_original_matrix]) - + merged_left_base_predictions = np.concatenate( merged_left_base_predictions, axis=1) merged_competences = np.concatenate(merged_competences, axis=1) - + if issubclass(type(self.ds_classifier), BaseDCS): pred_ds = self._get_dcs_predicted_label(self.ds_classifier, merged_left_base_predictions, merged_competences) elif issubclass(type(self.ds_classifier), BaseDES): pred_ds = self._get_des_predicted_label(self.ds_classifier, merged_left_base_predictions, merged_competences) - + predicted_labels[ind_ds_original_matrix] = pred_ds - + return self.classes_.take(predicted_labels) def _predict_oracle(self, X, y): @@ -197,31 +225,36 @@ def predict_proba(self, X): raise ValueError("Méthode incomplète!") def _get_base_proba_and_pred(self, ds_classifier, X): + """ + This code has been copied, but "self" has been replaced by + "ds_classifier" because this function doesn't exist elsewhere. + """ + self = ds_classifier # Check if the DS model was trained - check_is_fitted(ds_classifier, + check_is_fitted(self, ["DSEL_processed_", "DSEL_data_", "DSEL_target_"]) # Check if X is a valid input X = check_array(X) - ds_classifier._check_num_features(X) + self._check_num_features(X) if self.needs_proba: - base_probabilities = ds_classifier._predict_proba_base(X) + base_probabilities = self._predict_proba_base(X) base_predictions = base_probabilities.argmax(axis=2) else: base_probabilities = None - base_predictions = ds_classifier._predict_base(X) + base_predictions = self._predict_base(X) return base_probabilities, base_predictions def _calculate_hardness_level(self, ds_classifier, ind_disagreement, predicted_labels, neighbors, distances): """ - This function exists so parameters represent all datasets. 
This code has been copied, but "self" has been replaced by "ds_classifier" because this function doesn't exist elsewhere. """ self = ds_classifier + # if IH is used, calculate the hardness level associated with # each sample hardness = hardness_region_competence(neighbors, @@ -262,18 +295,30 @@ def _calculate_hardness_level(self, ds_classifier, ind_disagreement, return ind_ds_classifier, predicted_labels, neighbors, distances def _get_DFP_mask(self, ds_classifier, ind_ds_classifier, neighbors): - if ds_classifier.DFP: + """ + This code has been copied, but "self" has been replaced by + "ds_classifier" because this function doesn't exist elsewhere. + """ + self = ds_classifier + + if self.DFP: DFP_mask = frienemy_pruning_preprocessed( neighbors, - ds_classifier.DSEL_target_, - ds_classifier.DSEL_processed_) + self.DSEL_target_, + self.DSEL_processed_) else: DFP_mask = np.ones( - (ind_ds_classifier.size, ds_classifier.n_classifiers_)) + (ind_ds_classifier.size, self.n_classifiers_)) def _get_competences(self, ds_classifier, query, predictions, probabilities=None, neighbors=None, distances=None, DFP_mask=None): + """ + This code has been copied, but "self" has been replaced by + "ds_classifier" because this function doesn't exist elsewhere. 
+ """ + self = ds_classifier + if query.ndim < 2: query = query.reshape(1, -1) @@ -288,13 +333,13 @@ def _get_competences(self, ds_classifier, query, predictions, predictions.shape)) if self.needs_proba: - competences = ds_classifier.estimate_competence_from_proba( + competences = self.estimate_competence_from_proba( query, neighbors=neighbors, distances=distances, probabilities=probabilities) else: - competences = ds_classifier.estimate_competence( + competences = self.estimate_competence( query, neighbors=neighbors, distances=distances, @@ -306,11 +351,11 @@ def _get_competences(self, ds_classifier, query, predictions, def _get_dcs_predicted_label(self, ds_classifier, predictions, competences): """ - This function exists so parameters represent all datasets. This code has been copied, but "self" has been replaced by "ds_classifier" because this function doesn't exist elsewhere. """ self = ds_classifier + if self.selection_method != 'all': # only one classifier is selected clf_index = self.select(competences) @@ -327,25 +372,25 @@ def _get_dcs_predicted_label(self, ds_classifier, predictions, competences): def _get_des_predicted_label(self, ds_classifier, predictions, competences): """ - This function exists so parameters represent all datasets. This code has been copied, but "self" has been replaced by "ds_classifier" because this function doesn't exist elsewhere. """ self = ds_classifier + if self.mode == "selection": # The selected_classifiers matrix is used as a mask to remove # the predictions of certain base classifiers. 
- selected_classifiers = ds_classifier.select(competences) + selected_classifiers = self.select(competences) votes = np.ma.MaskedArray(predictions, ~selected_classifiers) predicted_label = majority_voting_rule(votes) elif self.mode == "weighting": votes = np.atleast_2d(predictions) predicted_label = weighted_majority_voting_rule(votes, competences, - np.arange(ds_classifier.n_classes_)) + np.arange(self.n_classes_)) else: - selected_classifiers = ds_classifier.select(competences) + selected_classifiers = self.select(competences) votes = np.ma.MaskedArray(predictions, ~selected_classifiers) predicted_label = weighted_majority_voting_rule(votes, competences, - np.arange(ds_classifier.n_classes_)) + np.arange(self.n_classes_)) return predicted_label From dc2d0500a814557f59ee8e5d54dca4354f3b2690 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Wed, 3 Mar 2021 22:45:30 -0500 Subject: [PATCH 04/21] Ajout de StackedClassifier. --- deslib/multi_datasets.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index 7808f0d..923703f 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -13,6 +13,7 @@ from deslib.dcs.base import BaseDCS from deslib.des.base import BaseDES from deslib.static.oracle import Oracle +from deslib.static.stacked import StackedClassifier from deslib.util.aggregation import (weighted_majority_voting_rule, majority_voting_rule, aggregate_proba_ensemble_weighted) @@ -73,8 +74,12 @@ class labels of each example in X. # the code below. if issubclass(type(self.ds_classifier), Oracle): return self._predict_oracle(X, y) + elif issubclass(type(self.ds_classifier), StackedClassifier): + return self._predict_stacked(X) + else: + return self._predict_ds(X) - # TODO: move the code below in a function called _predict_ds(X). 
+ def _predict_ds(self, X): merged_base_probabilities = [] merged_base_predictions = [] n_datasets = len(X) @@ -221,6 +226,24 @@ def _predict_oracle(self, X, y): return self.classes_.take(predicted_labels) + def _predict_stacked(self, X): + merged_base_preds = [] + n_datasets = len(X) + + for i in range(n_datasets): + classifier = self.ds_classifiers[i] + X[i] = check_array(X[i]) + check_is_fitted(classifier, "meta_classifier_") + base_preds = classifier._predict_proba_base(X[i]) + X_meta = classifier._connect_input(X[i], base_preds) + preds = classifier.meta_classifier_.predict_proba(X_meta) + merged_base_preds.append(preds) + + merged_base_preds = np.sum(merged_base_preds,0) + preds = np.argmax(merged_base_preds,axis=1) + + return self.classes_.take(preds) + def predict_proba(self, X): raise ValueError("Méthode incomplète!") From 795a19b2c195ef280107595eb19aa68dd1d61ed5 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Thu, 4 Mar 2021 15:39:57 -0500 Subject: [PATCH 05/21] Fix de knorau avec multi-datasets. --- deslib/multi_datasets.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index 923703f..379642b 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -56,7 +56,10 @@ class labels of each example in X. 
ds_classifier.fit(X[i], y[i]) self.ds_classifiers.append(ds_classifier) + one_classifier = self.ds_classifiers[0] + self.ds_classifier.n_classes_ = one_classifier.n_classes_ self._setup_label_encoder(y[0]) + return self def predict(self, X, y=None): From 742bffc7eba1e44a55dc67d4361557ba84ffbaad Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Thu, 11 Mar 2021 14:10:02 -0500 Subject: [PATCH 06/21] =?UTF-8?q?Distances=20ajout=C3=A9s=20dans=20qq=20te?= =?UTF-8?q?chniques,=20stats=20=C3=A0=201=20prim=20ajout=C3=A9es.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/base.py | 37 ++++++++++--- deslib/dcs/base.py | 3 +- deslib/dcs/ola.py | 3 +- deslib/des/base.py | 6 ++- deslib/des/knop.py | 4 +- deslib/des/knora_e.py | 4 +- deslib/des/knora_u.py | 4 +- deslib/util/stats.py | 122 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 170 insertions(+), 13 deletions(-) create mode 100644 deslib/util/stats.py diff --git a/deslib/base.py b/deslib/base.py index eeefef8..9358ac4 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -16,7 +16,7 @@ from sklearn.ensemble import BaseEnsemble, BaggingClassifier from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import LabelEncoder, normalize from sklearn.utils.validation import (check_X_y, check_is_fitted, check_array, check_random_state) @@ -24,6 +24,7 @@ from deslib.util import faiss_knn_wrapper from deslib.util.dfp import frienemy_pruning_preprocessed from deslib.util.instance_hardness import hardness_region_competence +from deslib.util.stats import stats class BaseDS(BaseEstimator, ClassifierMixin): @@ -40,8 +41,8 @@ class BaseDS(BaseEstimator, ClassifierMixin): @abstractmethod def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, needs_proba=False, - random_state=None, knn_classifier='knn', 
DSEL_perc=0.5, - knne=False, n_jobs=-1): + random_state=None, knn_classifier='knn', + knn_metric='minkowski', DSEL_perc=0.5, knne=False, n_jobs=-1): self.pool_classifiers = pool_classifiers self.k = k @@ -52,9 +53,11 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, self.needs_proba = needs_proba self.random_state = random_state self.knn_classifier = knn_classifier + self.knn_metric = knn_metric self.DSEL_perc = DSEL_perc self.knne = knne self.n_jobs = n_jobs + self.stats = stats() # Check optional dependency if knn_classifier == 'faiss' and not faiss_knn_wrapper.is_available(): @@ -202,6 +205,7 @@ class labels of each example in X. self """ self.random_state_ = check_random_state(self.random_state) + self.stats.true_labels = y # Check if the length of X and y are consistent. X, y = check_X_y(X, y) @@ -244,7 +248,7 @@ class labels of each example in X. # validate the value of k self._validate_k() - self._set_region_of_competence_algorithm() + self._set_region_of_competence_algorithm(X_dsel) self._fit_region_competence(X_dsel, y_dsel) # validate the IH @@ -315,6 +319,7 @@ def _fit_region_competence(self, X, y): class labels of each sample in X. """ + if self.knn_metric == 'cosine': X = normalize(X) self.roc_algorithm_.fit(X, y) def _set_dsel(self, X, y): @@ -337,17 +342,30 @@ class labels of each sample in X. 
self.n_samples_ = self.DSEL_target_.size self.DSEL_processed_, self.BKS_DSEL_ = self._preprocess_dsel() - def _set_region_of_competence_algorithm(self): + def _set_region_of_competence_algorithm(self, X): + + algorithm = "auto" + metric = 'minkowski' + metric_params = None + + if self.knn_metric == 'mahalanobis': + metric = 'mahalanobis' + metric_params = {'V': np.cov(X)} + algorithm = "brute" if self.knn_classifier is None or self.knn_classifier in ['knn', 'sklearn']: knn_class = functools.partial(KNeighborsClassifier, n_jobs=self.n_jobs, - algorithm="auto") + algorithm=algorithm, + metric=metric, + metric_params=metric_params) elif self.knn_classifier == 'faiss': knn_class = functools.partial( faiss_knn_wrapper.FaissKNNClassifier, - n_jobs=self.n_jobs, algorithm="brute") + n_jobs=self.n_jobs, algorithm="brute", + metric=self.knn_metric, + metric_params=metric_params) elif callable(self.knn_classifier): knn_class = self.knn_classifier else: @@ -427,6 +445,7 @@ def predict(self, X): base_probabilities = None base_predictions = self._predict_base(X) + self.stats.bases_labels = base_predictions all_agree_vector = BaseDS._all_classifier_agree(base_predictions) ind_all_agree = np.where(all_agree_vector)[0] @@ -435,6 +454,7 @@ def predict(self, X): if ind_all_agree.size: predicted_labels[ind_all_agree] = base_predictions[ ind_all_agree, 0] + self.stats.agree_ind = ind_all_agree # For the samples with disagreement, perform the dynamic selection # steps. 
First step is to collect the samples with disagreement @@ -534,6 +554,9 @@ def predict(self, X): distances=distances, DFP_mask=DFP_mask) predicted_labels[ind_ds_original_matrix] = pred_ds + self.stats.disagree_ind = ind_ds_original_matrix + + self.stats.predicted_labels = predicted_labels return self.classes_.take(predicted_labels) diff --git a/deslib/dcs/base.py b/deslib/dcs/base.py index 974efd7..3ef54e7 100644 --- a/deslib/dcs/base.py +++ b/deslib/dcs/base.py @@ -21,7 +21,7 @@ class BaseDCS(BaseDS): def __init__(self, pool_classifiers=None, k=7, DFP=False, safe_k=None, with_IH=False, IH_rate=0.30, selection_method='best', diff_thresh=0.1, random_state=None, knn_classifier='knn', - DSEL_perc=0.5, + knn_metric='minkowski', DSEL_perc=0.5, knne=False, n_jobs=-1): super(BaseDCS, self).__init__(pool_classifiers=pool_classifiers, k=k, @@ -29,6 +29,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, safe_k=None, IH_rate=IH_rate, random_state=random_state, knn_classifier=knn_classifier, + knn_metric=knn_metric, DSEL_perc=DSEL_perc, knne=knne, n_jobs=n_jobs) diff --git a/deslib/dcs/ola.py b/deslib/dcs/ola.py index 3ea0f35..b3d81b5 100644 --- a/deslib/dcs/ola.py +++ b/deslib/dcs/ola.py @@ -111,7 +111,7 @@ class :class:`FaissKNNClassifier` def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, selection_method='best', diff_thresh=0.1, random_state=None, knn_classifier='knn', - knne=False, DSEL_perc=0.5, n_jobs=-1): + knn_metric='minkowski', knne=False, DSEL_perc=0.5, n_jobs=-1): super(OLA, self).__init__(pool_classifiers=pool_classifiers, k=k, DFP=DFP, with_IH=with_IH, safe_k=safe_k, IH_rate=IH_rate, @@ -119,6 +119,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, diff_thresh=diff_thresh, random_state=random_state, knn_classifier=knn_classifier, + knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/des/base.py b/deslib/des/base.py index 1530e64..f518bbf 
100644 --- a/deslib/des/base.py +++ b/deslib/des/base.py @@ -21,7 +21,8 @@ class BaseDES(BaseDS): def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, mode='selection', needs_proba=False, random_state=None, - knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knn_metric='minkowski', knne=False, + DSEL_perc=0.5, n_jobs=-1): super(BaseDES, self).__init__(pool_classifiers=pool_classifiers, k=k, @@ -32,6 +33,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, needs_proba=needs_proba, random_state=random_state, knn_classifier=knn_classifier, + knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) self.mode = mode @@ -191,6 +193,8 @@ def classify_with_ds(self, query, predictions, probabilities=None, if self.DFP: competences = competences * DFP_mask + self.stats.competences = competences + if self.mode == "selection": # The selected_classifiers matrix is used as a mask to remove # the predictions of certain base classifiers. 
diff --git a/deslib/des/knop.py b/deslib/des/knop.py index adffb36..1cffc36 100644 --- a/deslib/des/knop.py +++ b/deslib/des/knop.py @@ -106,7 +106,8 @@ class :class:`FaissKNNClassifier` """ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, random_state=None, - knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knn_metric='minkowski', knne=False, + DSEL_perc=0.5, n_jobs=-1): super(KNOP, self).__init__(pool_classifiers, k, DFP=DFP, @@ -117,6 +118,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, needs_proba=True, random_state=random_state, knn_classifier=knn_classifier, + knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/des/knora_e.py b/deslib/des/knora_e.py index fa0298d..6619b03 100644 --- a/deslib/des/knora_e.py +++ b/deslib/des/knora_e.py @@ -99,7 +99,8 @@ class :class:`FaissKNNClassifier` def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, random_state=None, - knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knn_metric='minkowski', knne=False, + DSEL_perc=0.5, n_jobs=-1): super(KNORAE, self).__init__(pool_classifiers=pool_classifiers, k=k, @@ -109,6 +110,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, IH_rate=IH_rate, random_state=random_state, knn_classifier=knn_classifier, + knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/des/knora_u.py b/deslib/des/knora_u.py index e6cdc89..c4403fc 100644 --- a/deslib/des/knora_u.py +++ b/deslib/des/knora_u.py @@ -95,7 +95,8 @@ class :class:`FaissKNNClassifier` def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, random_state=None, - knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knn_metric='minkowski', knne=False, + DSEL_perc=0.5, 
n_jobs=-1): super(KNORAU, self).__init__(pool_classifiers, k, DFP=DFP, with_IH=with_IH, @@ -104,6 +105,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, mode='weighting', random_state=random_state, knn_classifier=knn_classifier, + knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/util/stats.py b/deslib/util/stats.py new file mode 100644 index 0000000..e9f3194 --- /dev/null +++ b/deslib/util/stats.py @@ -0,0 +1,122 @@ +import numpy as np + + +class stats(): + def __init__(self): + self.agree_ind = [] + self.disagree_ind = [] + self.true_labels = [] + self.bases_labels = [] + self.predicted_labels = [] + self.agree_labels = [] + self.competences = [] + + def log_stats(self): + n_queries = len(self.true_labels) + n_classes = len(np.unique(self.predicted_labels)) + n_bases = len(self.bases_labels[0]) + n_agree = len(self.agree_ind) + n_disagree = len(self.disagree_ind) + + n_right_clf_by_query, n_right_clf_ind = \ + self._get_n_right_clf_stats(n_classes) + + predicted_dis = self._get_distribution() + agree_dis = self._get_distribution(ind=self.agree_ind) + n_right_clf_dis = self._get_distribution(n_right_clf_by_query) + + agree_score = self._get_score(self.agree_ind) + disagree_score = self._get_score(self.disagree_ind) + + competences_mean, competences_mean_by_clf, n_even_max_competence = \ + self._get_competences_stats() + + lines = [] + lines.extend([ + "Queries:", + n_queries, + "Nb of right classifiers from 0 to "+str(n_bases)+":", + n_right_clf_dis, + "--- Agreements", + "Instances, ratio on queries:", + n_agree, + round(n_agree / n_queries, 3), + "Classes distribution, ratio on predictions:", + agree_dis, + np.round(agree_dis / predicted_dis, 3), + "Score, ratio on agreements:", + agree_score, + round(agree_score / n_agree, 3), + ]) + + for i,n_right_clf in enumerate(n_right_clf_dis): + score = self._get_score(n_right_clf_ind[i]) + lines.extend([ + "--- "+str(i)+" right classifiers", + 
"Instances, ratio on queries:", + n_right_clf_dis[i], + round(n_right_clf / n_queries, 3), + "Score, ratio on "+str(i)+" right clf:", + score, + round(score / n_right_clf_dis[i], 3), + ]) + + lines.extend([ + "--- Disagreements", + "Instances, ratio on queries:", + n_disagree, + round(n_disagree / n_queries, 3), + "Score, ratio on disagreements:", + disagree_score, + round(disagree_score / n_disagree, 3), + "--- Competences", + "Mean:", + round(competences_mean, 3), + "Mean by classifier:", + np.round(competences_mean_by_clf, 3), + "Even max competences times, ratio on disagreements:", + n_even_max_competence, + round(n_even_max_competence / n_disagree, 3), + ]) + + with open("log.txt",'w') as f: + for line in lines: + f.write(str(line)) + f.write("\n") + + def _get_distribution(self, labels=None, ind=None): + labels = self.predicted_labels if labels is None else labels + if ind is not None: labels = labels[ind] + _, counts = np.unique(labels, return_counts=True) + return counts + + def _get_n_right_clf_stats(self, n_classes): + n_right_clf_by_query = [] + n_right_clf_ind = [[] for i in range(n_classes)] + + for i,label in enumerate(self.true_labels): + row = self.bases_labels[i] + n_right_clf = np.count_nonzero(row == label) + n_right_clf_by_query.append(n_right_clf) + n_right_clf_ind[n_right_clf].append(i) + + return n_right_clf_by_query, n_right_clf_ind + + def _get_competences_stats(self): + competences_mean = np.mean(self.competences) + competences_mean_by_clf = np.mean(self.competences, axis=0) + n_even_max_competence = 0 + + for c in self.competences: + max_ = c[np.argmax(c)] + n_max = np.count_nonzero(c == max_) + if n_max > 1: n_even_max_competence += 1 + + return competences_mean, competences_mean_by_clf, n_even_max_competence + + def _get_score(self, ind): + true_labels = self.true_labels[ind] + labels = self.predicted_labels[ind] + matches = np.equal(true_labels, labels) + score = np.sum(matches) + return score From 
6835d89c25525f4f4064cdd72379dd0d391614fc Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Thu, 11 Mar 2021 18:07:12 -0500 Subject: [PATCH 07/21] =?UTF-8?q?Support=20des=20stats=20=C3=A0=20multidat?= =?UTF-8?q?asets,=20fix=20avec=20les=20stats.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/base.py | 11 +++++------ deslib/multi_datasets.py | 7 +++++++ deslib/util/stats.py | 8 ++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/deslib/base.py b/deslib/base.py index 9358ac4..041dbe5 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -24,7 +24,7 @@ from deslib.util import faiss_knn_wrapper from deslib.util.dfp import frienemy_pruning_preprocessed from deslib.util.instance_hardness import hardness_region_competence -from deslib.util.stats import stats +from deslib.util.stats import Stats class BaseDS(BaseEstimator, ClassifierMixin): @@ -57,7 +57,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, self.DSEL_perc = DSEL_perc self.knne = knne self.n_jobs = n_jobs - self.stats = stats() + self.stats = Stats() # Check optional dependency if knn_classifier == 'faiss' and not faiss_knn_wrapper.is_available(): @@ -205,7 +205,6 @@ class labels of each example in X. self """ self.random_state_ = check_random_state(self.random_state) - self.stats.true_labels = y # Check if the length of X and y are consistent. X, y = check_X_y(X, y) @@ -445,7 +444,6 @@ def predict(self, X): base_probabilities = None base_predictions = self._predict_base(X) - self.stats.bases_labels = base_predictions all_agree_vector = BaseDS._all_classifier_agree(base_predictions) ind_all_agree = np.where(all_agree_vector)[0] @@ -454,7 +452,6 @@ def predict(self, X): if ind_all_agree.size: predicted_labels[ind_all_agree] = base_predictions[ ind_all_agree, 0] - self.stats.agree_ind = ind_all_agree # For the samples with disagreement, perform the dynamic selection # steps. 
First step is to collect the samples with disagreement @@ -539,6 +536,7 @@ def predict(self, X): # Get the real indices_ of the samples that will be classified # using a DS algorithm. ind_ds_original_matrix = ind_disagreement[ind_ds_classifier] + self.stats.disagree_ind = ind_ds_original_matrix if self.needs_proba: selected_probabilities = base_probabilities[ @@ -554,8 +552,9 @@ def predict(self, X): distances=distances, DFP_mask=DFP_mask) predicted_labels[ind_ds_original_matrix] = pred_ds - self.stats.disagree_ind = ind_ds_original_matrix + self.stats.bases_labels = base_predictions + self.stats.agree_ind = ind_all_agree self.stats.predicted_labels = predicted_labels return self.classes_.take(predicted_labels) diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index 379642b..4502277 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -19,6 +19,7 @@ aggregate_proba_ensemble_weighted) from deslib.util.instance_hardness import hardness_region_competence + # Créer à partir de KNORA-U class MultiDatasets(BaseDS): def __init__(self, ds_classifier, pool_classifiers): @@ -166,6 +167,7 @@ def _predict_ds(self, X): # Get the real indices_ of the samples that will be classified # using a DS algorithm. 
ind_ds_original_matrix = ind_disagreement[ind_ds_classifier] + self.stats.disagree_ind = ind_ds_original_matrix if ds_classifier.needs_proba: selected_probabilities = base_probabilities[ @@ -189,6 +191,7 @@ def _predict_ds(self, X): merged_left_base_predictions = np.concatenate( merged_left_base_predictions, axis=1) merged_competences = np.concatenate(merged_competences, axis=1) + self.stats.competences = merged_competences if issubclass(type(self.ds_classifier), BaseDCS): pred_ds = self._get_dcs_predicted_label(self.ds_classifier, @@ -199,6 +202,10 @@ def _predict_ds(self, X): predicted_labels[ind_ds_original_matrix] = pred_ds + self.stats.bases_labels = merged_base_predictions + self.stats.agree_ind = ind_all_agree + self.stats.predicted_labels = predicted_labels + return self.classes_.take(predicted_labels) def _predict_oracle(self, X, y): diff --git a/deslib/util/stats.py b/deslib/util/stats.py index e9f3194..c8367da 100644 --- a/deslib/util/stats.py +++ b/deslib/util/stats.py @@ -1,7 +1,7 @@ import numpy as np -class stats(): +class Stats(): def __init__(self): self.agree_ind = [] self.disagree_ind = [] @@ -19,7 +19,7 @@ def log_stats(self): n_disagree = len(self.disagree_ind) n_right_clf_by_query, n_right_clf_ind = \ - self._get_n_right_clf_stats(n_classes) + self._get_n_right_clf_stats(n_bases) predicted_dis = self._get_distribution() agree_dis = self._get_distribution(ind=self.agree_ind) @@ -90,9 +90,9 @@ def _get_distribution(self, labels=None, ind=None): _, counts = np.unique(labels, return_counts=True) return counts - def _get_n_right_clf_stats(self, n_classes): + def _get_n_right_clf_stats(self, n_bases): n_right_clf_by_query = [] - n_right_clf_ind = [[] for i in range(n_classes)] + n_right_clf_ind = [[] for i in range(n_bases + 1)] for i,label in enumerate(self.true_labels): row = self.bases_labels[i] From 3a764e4453ccaf0284d9194ea259c2ef83b557f7 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Thu, 11 Mar 2021 21:39:36 -0500 Subject: [PATCH 
08/21] Segmentation du log. --- deslib/util/stats.py | 105 ++++++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/deslib/util/stats.py b/deslib/util/stats.py index c8367da..3ab86da 100644 --- a/deslib/util/stats.py +++ b/deslib/util/stats.py @@ -10,44 +10,66 @@ def __init__(self): self.predicted_labels = [] self.agree_labels = [] self.competences = [] + self.log_fname = "log.txt" def log_stats(self): - n_queries = len(self.true_labels) - n_classes = len(np.unique(self.predicted_labels)) - n_bases = len(self.bases_labels[0]) - n_agree = len(self.agree_ind) - n_disagree = len(self.disagree_ind) + self.n_queries = len(self.true_labels) + self.n_bases = len(self.bases_labels[0]) + self.n_disagree = len(self.disagree_ind) - n_right_clf_by_query, n_right_clf_ind = \ - self._get_n_right_clf_stats(n_bases) + with open(self.log_fname,'w') as f: + for line in self._get_all_lines(): + f.write(str(line)) + f.write("\n") - predicted_dis = self._get_distribution() + def _get_all_lines(self): + lines = [] + lines.extend(self._get_general_lines()) + lines.extend(self._get_agree_lines()) + lines.extend(self._get_n_right_clf_lines()) + lines.extend(self._get_disagree_lines()) + lines.extend(self._get_competences_lines()) + return lines + + def _get_general_lines(self): + lines = [ + "Queries:", + self.n_queries, + ] + + return lines + + def _get_agree_lines(self): + n_agree = len(self.agree_ind) agree_dis = self._get_distribution(ind=self.agree_ind) - n_right_clf_dis = self._get_distribution(n_right_clf_by_query) - agree_score = self._get_score(self.agree_ind) - disagree_score = self._get_score(self.disagree_ind) + predicted_dis = self._get_distribution() - competences_mean, competences_mean_by_clf, n_even_max_competence = \ - self._get_competences_stats() - - lines = [] - lines.extend([ - "Queries:", - n_queries, - "Nb of right classifiers from 0 to "+str(n_bases)+":", - n_right_clf_dis, + lines = [ "--- Agreements", "Instances, 
ratio on queries:", n_agree, - round(n_agree / n_queries, 3), + round(n_agree / self.n_queries, 3), "Classes distribution, ratio on predictions:", agree_dis, np.round(agree_dis / predicted_dis, 3), "Score, ratio on agreements:", agree_score, round(agree_score / n_agree, 3), - ]) + ] + + return lines + + def _get_n_right_clf_lines(self): + n_right_clf_by_query, n_right_clf_ind = \ + self._get_n_right_clf_stats() + n_right_clf_dis = self._get_distribution(n_right_clf_by_query) + + lines = [ + "--- Right classifiers:", + "Distribution:", + n_right_clf_dis, + ] for i,n_right_clf in enumerate(n_right_clf_dis): score = self._get_score(n_right_clf_ind[i]) @@ -55,20 +77,34 @@ def log_stats(self): "--- "+str(i)+" right classifiers", "Instances, ratio on queries:", n_right_clf_dis[i], - round(n_right_clf / n_queries, 3), + round(n_right_clf / self.n_queries, 3), "Score, ratio on "+str(i)+" right clf:", score, round(score / n_right_clf_dis[i], 3), ]) - lines.extend([ + return lines + + def _get_disagree_lines(self): + disagree_score = self._get_score(self.disagree_ind) + + lines = [ "--- Disagreements", "Instances, ratio on queries:", - n_disagree, - round(n_disagree / n_queries, 3), + self.n_disagree, + round(self.n_disagree / self.n_queries, 3), "Score, ratio on disagreements:", disagree_score, - round(disagree_score / n_disagree, 3), + round(disagree_score / self.n_disagree, 3), + ] + + return lines + + def _get_competences_lines(self): + competences_mean, competences_mean_by_clf, n_even_max_competence = \ + self._get_competences_stats() + + lines = [ "--- Competences", "Mean:", round(competences_mean, 3), @@ -76,13 +112,10 @@ def log_stats(self): np.round(competences_mean_by_clf, 3), "Even max competences times, ratio on disagreements:", n_even_max_competence, - round(n_even_max_competence / n_disagree, 3), - ]) - - with open("log.txt",'w') as f: - for line in lines: - f.write(str(line)) - f.write("\n") + round(n_even_max_competence / self.n_disagree, 3), + ] + + return 
lines def _get_distribution(self, labels=None, ind=None): labels = self.predicted_labels if labels is None else labels @@ -90,9 +123,9 @@ def _get_distribution(self, labels=None, ind=None): _, counts = np.unique(labels, return_counts=True) return counts - def _get_n_right_clf_stats(self, n_bases): + def _get_n_right_clf_stats(self): n_right_clf_by_query = [] - n_right_clf_ind = [[] for i in range(n_bases + 1)] + n_right_clf_ind = [[] for i in range(self.n_bases + 1)] for i,label in enumerate(self.true_labels): row = self.bases_labels[i] From 9a779a4d001c66ed45336f8dfacdc88caedaad01 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Mon, 15 Mar 2021 14:27:10 -0400 Subject: [PATCH 09/21] =?UTF-8?q?Ajout=20de=20logs=20multi-datasets,=20ges?= =?UTF-8?q?tion=20du=20cas=20o=C3=B9=20il=20y=20a=200=20d=C3=A9saccords,?= =?UTF-8?q?=20renommages.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/base.py | 5 +++- deslib/multi_datasets.py | 6 +++- deslib/util/stats.py | 63 +++++++++++++++++++++++++++++----------- 3 files changed, 55 insertions(+), 19 deletions(-) diff --git a/deslib/base.py b/deslib/base.py index 041dbe5..368c0e5 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -57,7 +57,7 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, self.DSEL_perc = DSEL_perc self.knne = knne self.n_jobs = n_jobs - self.stats = Stats() + self._set_stats() # Check optional dependency if knn_classifier == 'faiss' and not faiss_knn_wrapper.is_available(): @@ -65,6 +65,9 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, 'Using knn_classifier="faiss" requires that the FAISS library ' 'be installed.Please check the Installation Guide.') + def _set_stats(self): + self.stats = Stats() + @abstractmethod def select(self, competences): """Select the most competent classifier for diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index 4502277..82a58fc 100644 --- 
a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -18,7 +18,7 @@ majority_voting_rule, aggregate_proba_ensemble_weighted) from deslib.util.instance_hardness import hardness_region_competence - +from deslib.util.stats import MultiStats # Créer à partir de KNORA-U class MultiDatasets(BaseDS): @@ -35,6 +35,9 @@ def __init__(self, ds_classifier, pool_classifiers): super(MultiDatasets, self).__init__(pool_classifiers) self.ds_classifier = ds_classifier + def _set_stats(self): + self.stats = MultiStats() + def fit(self, X, y): """Prepare the DS models by setting the KNN algorithm and pre-processing the information required to apply the DS @@ -202,6 +205,7 @@ def _predict_ds(self, X): predicted_labels[ind_ds_original_matrix] = pred_ds + self.stats.n_datasets = n_datasets self.stats.bases_labels = merged_base_predictions self.stats.agree_ind = ind_all_agree self.stats.predicted_labels = predicted_labels diff --git a/deslib/util/stats.py b/deslib/util/stats.py index 3ab86da..267176d 100644 --- a/deslib/util/stats.py +++ b/deslib/util/stats.py @@ -27,8 +27,9 @@ def _get_all_lines(self): lines.extend(self._get_general_lines()) lines.extend(self._get_agree_lines()) lines.extend(self._get_n_right_clf_lines()) - lines.extend(self._get_disagree_lines()) - lines.extend(self._get_competences_lines()) + if self.n_disagree > 0: + lines.extend(self._get_disagree_lines()) + lines.extend(self._get_competences_lines()) return lines def _get_general_lines(self): @@ -50,7 +51,7 @@ def _get_agree_lines(self): "Instances, ratio on queries:", n_agree, round(n_agree / self.n_queries, 3), - "Classes distribution, ratio on predictions:", + "Distribution, ratio on predictions:", agree_dis, np.round(agree_dis / predicted_dis, 3), "Score, ratio on agreements:", @@ -64,25 +65,19 @@ def _get_n_right_clf_lines(self): n_right_clf_by_query, n_right_clf_ind = \ self._get_n_right_clf_stats() n_right_clf_dis = self._get_distribution(n_right_clf_by_query) + scores = 
[self._get_score(n_right_clf_ind[i]) \ + for i in range(len(n_right_clf_dis))] lines = [ "--- Right classifiers:", - "Distribution:", + "Distribution, ratio on queries:", n_right_clf_dis, + np.round(n_right_clf_dis / self.n_queries, 3), + "Scores, ratio on N right clf", + scores, + np.round(scores / n_right_clf_dis, 3), ] - for i,n_right_clf in enumerate(n_right_clf_dis): - score = self._get_score(n_right_clf_ind[i]) - lines.extend([ - "--- "+str(i)+" right classifiers", - "Instances, ratio on queries:", - n_right_clf_dis[i], - round(n_right_clf / self.n_queries, 3), - "Score, ratio on "+str(i)+" right clf:", - score, - round(score / n_right_clf_dis[i], 3), - ]) - return lines def _get_disagree_lines(self): @@ -110,7 +105,7 @@ def _get_competences_lines(self): round(competences_mean, 3), "Mean by classifier:", np.round(competences_mean_by_clf, 3), - "Even max competences times, ratio on disagreements:", + "Even max competences times, \nratio on disagreements:", n_even_max_competence, round(n_even_max_competence / self.n_disagree, 3), ] @@ -153,3 +148,37 @@ def _get_score(self, ind): matches = np.equal(true_labels, labels) score = np.sum(matches) return score + + +class MultiStats(Stats): + def __init__(self): + super().__init__() + self.n_datasets = 1 + + def _get_all_lines(self): + lines = super()._get_all_lines() + lines.extend(self._get_multistats_lines()) + return lines + + def _get_competences_lines(self): + competences_mean, competences_mean_by_clf, n_even_max_competence = \ + self._get_competences_stats() + + means = competences_mean_by_clf.reshape(self.n_datasets, -1) + competences_mean_by_dataset = np.mean(means, axis=1) + + lines = super()._get_competences_lines() + lines.extend([ + "Mean by dataset:", + np.round(competences_mean_by_dataset, 3), + ]) + + return lines + + def _get_multistats_lines(self): + lines = [ + "--- Multidatasets", + self.n_datasets + ] + + return lines From bdccfc277459a65c24010bf2f2479ced678890fd Mon Sep 17 00:00:00 2001 From: 
Pierre-Marc Thibault Date: Sun, 21 Mar 2021 21:13:37 -0400 Subject: [PATCH 10/21] Retrait du choix de distances avec fknn. --- deslib/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/deslib/base.py b/deslib/base.py index 368c0e5..1575528 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -365,9 +365,7 @@ def _set_region_of_competence_algorithm(self, X): elif self.knn_classifier == 'faiss': knn_class = functools.partial( faiss_knn_wrapper.FaissKNNClassifier, - n_jobs=self.n_jobs, algorithm="brute", - metric=self.knn_metric, - metric_params=metric_params) + n_jobs=self.n_jobs, algorithm="brute") elif callable(self.knn_classifier): knn_class = self.knn_classifier else: From 58502dfd0c14d839d4da2e1231408c4286523f5c Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Sun, 28 Mar 2021 21:03:34 -0400 Subject: [PATCH 11/21] =?UTF-8?q?Ajout=20de=20KNOP=20fusionn=C3=A9=20dans?= =?UTF-8?q?=20le=20profil.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/des/multi_knop.py | 256 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 deslib/des/multi_knop.py diff --git a/deslib/des/multi_knop.py b/deslib/des/multi_knop.py new file mode 100644 index 0000000..c1dac6d --- /dev/null +++ b/deslib/des/multi_knop.py @@ -0,0 +1,256 @@ +# coding=utf-8 + +# Author: Rafael Menelau Oliveira e Cruz +# +# License: BSD 3 clause + +import numpy as np + +from deslib.des.base import BaseDS +from deslib.des.knop import KNOP +from sklearn.utils.validation import (check_X_y, check_is_fitted, check_array, + check_random_state) + + +class MultiKNOP(KNOP): + def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, + safe_k=None, IH_rate=0.30, random_state=None, voting='hard', + knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): + + super(KNOP, self).__init__(None, k, + DFP=DFP, + with_IH=with_IH, + safe_k=safe_k, + IH_rate=IH_rate, + needs_proba=True, + 
random_state=random_state, + knn_classifier=knn_classifier, + knne=knne, + DSEL_perc=DSEL_perc, + n_jobs=n_jobs) + self.ds_classifiers = [] + for i in range(len(pool_classifiers)): + knop = KNOP(pool_classifiers[i], k, + DFP=DFP, + with_IH=with_IH, + safe_k=safe_k, + IH_rate=IH_rate, + random_state=random_state, + knn_classifier=knn_classifier, + knne=knne, + DSEL_perc=DSEL_perc, + n_jobs=n_jobs) + self.ds_classifiers.append(knop) + + + """Multi k-Nearest Output Profiles (MultiKNOP). + """ + def fit(self, X, y): + """Train the DS model by setting the KNN algorithm and + pre-process the information required to apply the DS + methods. In this case, the scores of the base classifiers for + the dynamic selection dataset (DSEL) are pre-calculated to + transform each sample in DSEL into an output profile. + + Parameters + ---------- + X : array of shape n_datasets, n_samples, n_features) + Data used to fit the model. + + y : array of shape (n_datasets, n_samples) + class labels of each example in X. + + Returns + ------- + self + """ + if len(X) == 1 or len(y) == 1: + raise ValueError("Error. MultiKNOP does not accept one dataset!") + """ + for predict()... + for i in range(len(y)-1): + if np.array_equal(y[0],y[i+1]): + raise ValueError( + "Error. All datasets queries must match exactly!") + """ + + datasets_dsel_scores = [] + datasets_DSEL_processed_ = [] + # Process each dataset + for i in range(len(X)): + self.ds_classifiers[i].fit(X[i], y[i]) + if self.ds_classifiers[i].n_classes_ == 1: + raise ValueError( + "Error. 
MultiKNOP does not accept one class datasets!") + self.ds_classifiers[i]._check_predict_proba() + self.ds_classifiers[i].dsel_scores_ = \ + self.ds_classifiers[i]._preprocess_dsel_scores() + datasets_dsel_scores.append(self.ds_classifiers[i].dsel_scores_) + datasets_DSEL_processed_.append( + self.ds_classifiers[i].DSEL_processed_) + + self.dsel_scores_ = np.concatenate(datasets_dsel_scores, axis=1) + self.DSEL_processed_ = np.concatenate(datasets_DSEL_processed_, axis=1) + self.n_classifiers_ = self.dsel_scores_.shape[1] + + # Reassignment + self.DSEL_target_ = self.ds_classifiers[0].DSEL_target_ + self.n_samples_ = self.ds_classifiers[0].n_samples_ + self.n_classes_ = self.ds_classifiers[0].n_classes_ + self.knn_class_ = self.ds_classifiers[0].knn_class_ + self.k_ = self.ds_classifiers[0].k_ + self.classes_ = self.ds_classifiers[0].classes_ + + # Reshape DSEL_scores as a 2-D array for nearest neighbor calculations + dsel_output_profiles = self.dsel_scores_.reshape(self.n_samples_, + self.n_classifiers_ * + self.n_classes_) + + self._fit_OP(dsel_output_profiles, self.DSEL_target_, self.k_) + + return self + + def _fit_OP(self, X_op, y_op, k): + """ Fit the set of output profiles. + + Parameters + ---------- + X_op : array of shape (n_samples, n_features) + Output profiles of the training data. n_features is equals + to (n_classifiers x n_classes). + + y_op : array of shape (n_samples) + Class labels of each sample in X_op. + + k : int + Number of output profiles used in the region of competence + estimation. + + """ + self.op_knn_ = self.knn_class_(k) + + if self.n_classes_ == 2: + # Get only the scores for one class since they are complementary + X_temp = X_op[:, ::2] + self.op_knn_.fit(X_temp, y_op) + else: + self.op_knn_.fit(X_op, y_op) + + def _get_similar_out_profiles(self, probabilities): + """Get the most similar output profiles of the query sample. 
+ + Parameters + ---------- + probabilities : array of shape (n_samples, n_classifiers, n_classes) + predictions of each base classifier for all samples. + + Returns + ------- + dists : list of shape = [n_samples, k] + The distances between the query and each sample in the region + of competence. The vector is ordered in an ascending fashion. + + idx : list of shape = [n_samples, k] + Indices of the instances belonging to the region of competence of + the given query sample. + """ + + if self.n_classes_ == 2: + # Get only the scores for one class since they are complementary + query_op = probabilities[:, :, 0] + else: + query_op = probabilities.reshape((probabilities.shape[0], + self.n_classifiers_ * + self.n_classes_)) + + dists, idx = self.op_knn_.kneighbors(query_op, n_neighbors=self.k_, + return_distance=True) + return dists, np.atleast_2d(idx) + + def predict(self, X): + """Predict the class label for each sample in X. + Parameters + ---------- + X : array of shape (n_samples, n_features) + The input data. + Returns + ------- + predicted_labels : array of shape (n_samples) + Predicted class label for each sample in X. + """ + # Check if the DS model was trained + #check_is_fitted(self, + # ["DSEL_processed_", "DSEL_data_", "DSEL_target_"]) + + # Check if X is a valid input + #for i in range(len(X)): + # X[i] = check_array(X[i]) + # self._check_num_features(X[i]) + + n_samples = X[0].shape[0] + predicted_labels = np.empty(n_samples, dtype=np.intp) + + base_probabilities = [] + for i in range(len(X)): + base_probabilities.append( + self.ds_classifiers[i]._predict_proba_base(X[i])) + base_probabilities = np.concatenate(base_probabilities,axis=1) + base_predictions = base_probabilities.argmax(axis=2) + + all_agree_vector = BaseDS._all_classifier_agree(base_predictions) + ind_all_agree = np.where(all_agree_vector)[0] + + # Since the predictions are always the same, get the predictions of the + # first base classifier. 
+ if ind_all_agree.size: + predicted_labels[ind_all_agree] = base_predictions[ + ind_all_agree, 0] + + # For the samples with disagreement, perform the dynamic selection + # steps. First step is to collect the samples with disagreement + # between base classifiers + ind_disagreement = np.where(~all_agree_vector)[0] + if ind_disagreement.size: + + X_DS = X[0][ind_disagreement, :] + + # Then, we estimate the nearest neighbors for all samples that + # we need to call DS routines + distances, neighbors = None, None + #distances, neighbors = self._get_region_competence(X_DS) + + # IH was not considered. So all samples with disagreement are + # passed down to the DS algorithm + ind_ds_classifier = np.arange(ind_disagreement.size) + + # At this stage the samples which all base classifiers agrees or + # that are associated with low hardness were already classified. + # The remaining samples are now passed down to the DS techniques + # for classification. + + # First check whether there are still samples to be classified. + if ind_ds_classifier.size: + + DFP_mask = np.ones( + (ind_ds_classifier.size, self.n_classifiers_)) + + # Get the real indices_ of the samples that will be classified + # using a DS algorithm. + ind_ds_original_matrix = ind_disagreement[ind_ds_classifier] + + if self.needs_proba or self.voting == 'soft': + selected_probabilities = base_probabilities[ + ind_ds_original_matrix] + else: + selected_probabilities = None + + pred_ds = self.classify_with_ds(X_DS[ind_ds_classifier], + base_predictions[ + ind_ds_original_matrix], + selected_probabilities, + neighbors=neighbors, + distances=distances, + DFP_mask=DFP_mask) + predicted_labels[ind_ds_original_matrix] = pred_ds + + return self.classes_.take(predicted_labels) From 6d3ba91f980802f0faa9b266cd841d8b556110e2 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Thu, 24 Jun 2021 00:13:20 -0400 Subject: [PATCH 12/21] Fix: quand il y a 0 accord pour une classe. 
--- deslib/util/stats.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deslib/util/stats.py b/deslib/util/stats.py index 267176d..fdcb0e3 100644 --- a/deslib/util/stats.py +++ b/deslib/util/stats.py @@ -114,9 +114,13 @@ def _get_competences_lines(self): def _get_distribution(self, labels=None, ind=None): labels = self.predicted_labels if labels is None else labels + max_label = max(labels) if ind is not None: labels = labels[ind] - _, counts = np.unique(labels, return_counts=True) - return counts + unique_labels, unique_counts = np.unique(labels, return_counts=True) + distribution = np.full(max_label+1,0) + for i,l in enumerate(unique_labels): + distribution[l] = unique_counts[i] + return distribution def _get_n_right_clf_stats(self): n_right_clf_by_query = [] From 7d06fe919047ef94f90cbd950fcfcd458d45a652 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Thu, 1 Jul 2021 23:21:14 -0400 Subject: [PATCH 13/21] =?UTF-8?q?Patch:=20permet=20d'avoir=200=20ou=201=20?= =?UTF-8?q?base=20par=20mod=C3=A8le.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/deslib/base.py b/deslib/base.py index 1575528..72a8653 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -826,9 +826,10 @@ def _validate_pool(self): ValueError If the pool of classifiers is empty. """ - if self.n_classifiers_ <= 1: - raise ValueError("n_classifiers must be greater than one, " - "got {}.".format(self.n_classifiers_)) + # PATCH: allow 0 or 1 base for the multidatasets model. 
+ #if self.n_classifiers_ <= 1: + # raise ValueError("n_classifiers must be greater than one, " + # "got {}.".format(self.n_classifiers_)) def _check_num_features(self, X): """ Verify if the number of features (n_features) of X is equals to From 1390e67f597c8a44cc646889893c2eec9bb6c6c8 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Fri, 2 Jul 2021 09:38:46 -0400 Subject: [PATCH 14/21] PATCH: 1 base permise pour static --- deslib/static/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deslib/static/base.py b/deslib/static/base.py index e72547e..993b909 100644 --- a/deslib/static/base.py +++ b/deslib/static/base.py @@ -134,6 +134,6 @@ def _validate_pool(self): ValueError If the pool of classifiers is empty or just a single model. """ - if self.n_classifiers_ <= 1: - raise ValueError("n_classifiers must be greater than one, " - "got {}.".format(len(self.pool_classifiers))) + #if self.n_classifiers_ <= 1: + # raise ValueError("n_classifiers must be greater than one, " + # "got {}.".format(len(self.pool_classifiers))) From 2905fb713684c6f8ec17bea356b8702dccd36dca Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Sun, 1 Aug 2021 14:18:11 -0400 Subject: [PATCH 15/21] =?UTF-8?q?Ajout=20des=20tats=20de=20la=20fiabilit?= =?UTF-8?q?=C3=A9=20de=20comp=C3=A9tence.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/base.py | 2 + deslib/multi_datasets.py | 2 + deslib/util/stats.py | 90 +++++++++++++++++++++++++++++++++------- 3 files changed, 80 insertions(+), 14 deletions(-) diff --git a/deslib/base.py b/deslib/base.py index 72a8653..4f20326 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -458,6 +458,7 @@ def predict(self, X): # steps. 
First step is to collect the samples with disagreement # between base classifiers ind_disagreement = np.where(~all_agree_vector)[0] + ind_disagreement = np.asarray(range(len(predicted_labels))) if ind_disagreement.size: X_DS = X[ind_disagreement, :] @@ -557,6 +558,7 @@ def predict(self, X): self.stats.bases_labels = base_predictions self.stats.agree_ind = ind_all_agree self.stats.predicted_labels = predicted_labels + self.stats.k = self.k return self.classes_.take(predicted_labels) diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index 82a58fc..09e8624 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -121,6 +121,7 @@ def _predict_ds(self, X): # steps. First step is to collect the samples with disagreement # between base classifiers ind_disagreement = np.where(~all_agree_vector)[0] + ind_disagreement = np.asarray(range(len(merged_base_predictions))) if ind_disagreement.size: merged_left_base_predictions = [] merged_competences = [] @@ -209,6 +210,7 @@ def _predict_ds(self, X): self.stats.bases_labels = merged_base_predictions self.stats.agree_ind = ind_all_agree self.stats.predicted_labels = predicted_labels + self.stats.k = self.k return self.classes_.take(predicted_labels) diff --git a/deslib/util/stats.py b/deslib/util/stats.py index fdcb0e3..604e24b 100644 --- a/deslib/util/stats.py +++ b/deslib/util/stats.py @@ -30,6 +30,7 @@ def _get_all_lines(self): if self.n_disagree > 0: lines.extend(self._get_disagree_lines()) lines.extend(self._get_competences_lines()) + lines.extend(self._get_competences_reliability_lines()) return lines def _get_general_lines(self): @@ -96,18 +97,77 @@ def _get_disagree_lines(self): return lines def _get_competences_lines(self): - competences_mean, competences_mean_by_clf, n_even_max_competence = \ + mean, mean_by_clf, var, var_by_clf, n_even_max = \ self._get_competences_stats() lines = [ "--- Competences", "Mean:", - round(competences_mean, 3), + round(mean, 3), "Mean by classifier:", - 
np.round(competences_mean_by_clf, 3), + np.round(mean_by_clf, 3), + "Var:", + round(var, 3), + "Var by classifier:", + np.round(var_by_clf, 3), "Even max competences times, \nratio on disagreements:", - n_even_max_competence, - round(n_even_max_competence / self.n_disagree, 3), + n_even_max, + round(n_even_max / self.n_disagree, 3), + ] + + return lines + + def _get_competences_reliability_lines(self): + true_labels_by_base = np.tile(self.true_labels,(self.n_bases,1)) + correct_bases_bln_array = np.equal( + self.bases_labels.T,true_labels_by_base) + n_queries = self.bases_labels.shape[0] + n_correct_labels_by_base = np.sum(correct_bases_bln_array,axis=1) + acc_by_base = np.round(n_correct_labels_by_base/n_queries,3) + + comp = self.competences/self.k + n_incorrect_labels_by_base = \ + len(self.predicted_labels)-n_correct_labels_by_base + + correct_comp = comp.T*correct_bases_bln_array + correct_comp_by_base = np.sum(correct_comp,axis=1) + correct_comp_by_base /= n_correct_labels_by_base + mean_correct_comp_by_base = np.round(correct_comp_by_base,3) + mean = mean_correct_comp_by_base + mean = np.repeat(mean,n_queries).reshape(self.n_bases,-1) + correct_comp_by_base = np.sum((correct_comp-mean)**2,axis=1) + correct_comp_by_base /= n_correct_labels_by_base + std_correct_comp_by_base = np.round(np.sqrt(correct_comp_by_base),3) + + incorrect_comp = comp.T*~correct_bases_bln_array + incorrect_comp_by_base = np.sum(incorrect_comp,axis=1) + incorrect_comp_by_base /= n_incorrect_labels_by_base + mean_incorrect_comp_by_base = np.round(incorrect_comp_by_base,3) + mean = mean_incorrect_comp_by_base + mean = np.repeat(mean,n_queries).reshape(self.n_bases,-1) + incorrect_comp_by_base = np.sum((incorrect_comp-mean)**2,axis=1) + incorrect_comp_by_base /= n_incorrect_labels_by_base + std_incorrect_comp_by_base = np.round( + np.sqrt(incorrect_comp_by_base),3) + + lines = [ + "--- Competence reliability", + "Acc:", + round(np.mean(acc_by_base),3), + "(by base):", + acc_by_base, + 
"Competence mean & std when well clasified:", + round(np.mean(mean_correct_comp_by_base),3), + round(np.mean(std_correct_comp_by_base),3), + "(by base):", + mean_correct_comp_by_base, + std_correct_comp_by_base, + "Competence mean & std when not well clasified:", + round(np.mean(mean_incorrect_comp_by_base),3), + round(np.mean(std_incorrect_comp_by_base),3), + "(by base):", + mean_incorrect_comp_by_base, + std_incorrect_comp_by_base, ] return lines @@ -135,16 +195,18 @@ def _get_n_right_clf_stats(self): return n_right_clf_by_query, n_right_clf_ind def _get_competences_stats(self): - competences_mean = np.mean(self.competences) - competences_mean_by_clf = np.mean(self.competences, axis=0) - n_even_max_competence = 0 + mean = np.mean(self.competences) + var = np.var(self.competences) + mean_by_clf = np.mean(self.competences, axis=0) + var_by_clf = np.var(self.competences, axis=0) + n_even_max = 0 for c in self.competences: max_ = c[np.argmax(c)] n_max = np.count_nonzero(c == max_) - if n_max > 1: n_even_max_competence += 1 + if n_max > 1: n_even_max += 1 - return competences_mean, competences_mean_by_clf, n_even_max_competence + return mean, mean_by_clf, var, var_by_clf, n_even_max def _get_score(self, ind): true_labels = self.true_labels[ind] @@ -165,16 +227,16 @@ def _get_all_lines(self): return lines def _get_competences_lines(self): - competences_mean, competences_mean_by_clf, n_even_max_competence = \ + _, mean_by_clf, _, var_by_clf, _ = \ self._get_competences_stats() - means = competences_mean_by_clf.reshape(self.n_datasets, -1) - competences_mean_by_dataset = np.mean(means, axis=1) + means = mean_by_clf.reshape(self.n_datasets, -1) + mean_by_dataset = np.mean(means, axis=1) lines = super()._get_competences_lines() lines.extend([ "Mean by dataset:", - np.round(competences_mean_by_dataset, 3), + np.round(mean_by_dataset, 3), ]) return lines From d40452ab6066a1ed11576938a6e17945304dcc71 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Wed, 1 Sep 2021 
22:33:33 -0400 Subject: [PATCH 16/21] =?UTF-8?q?Lignes=20de=20comp=C3=A9tence=20avec=20do?= =?UTF-8?q?nn=C3=A9es=20rendues=20en=20pourcentage.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deslib/util/stats.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/deslib/util/stats.py b/deslib/util/stats.py index 604e24b..d48f6ab 100644 --- a/deslib/util/stats.py +++ b/deslib/util/stats.py @@ -195,13 +195,14 @@ def _get_n_right_clf_stats(self): return n_right_clf_by_query, n_right_clf_ind def _get_competences_stats(self): - mean = np.mean(self.competences) - var = np.var(self.competences) - mean_by_clf = np.mean(self.competences, axis=0) - var_by_clf = np.var(self.competences, axis=0) + comp = self.competences/self.k + mean = np.mean(comp) + var = np.var(comp) + mean_by_clf = np.mean(comp, axis=0) + var_by_clf = np.var(comp, axis=0) n_even_max = 0 - for c in self.competences: + for c in comp: max_ = c[np.argmax(c)] n_max = np.count_nonzero(c == max_) if n_max > 1: n_even_max += 1 From 6b91780e8e6f36ff5a00b73de6e6dd7d13c305bd Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Mon, 3 Jan 2022 09:19:11 -0500 Subject: [PATCH 17/21] Sanitizing. 
--- deslib/util/stats.py | 67 ++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/deslib/util/stats.py b/deslib/util/stats.py index d48f6ab..7e87e21 100644 --- a/deslib/util/stats.py +++ b/deslib/util/stats.py @@ -13,10 +13,16 @@ def __init__(self): self.log_fname = "log.txt" def log_stats(self): - self.n_queries = len(self.true_labels) + bln_mat = np.equal(self.true_labels,self.predicted_labels) + self.wrong_true_labels = self.true_labels[~bln_mat] + self.wrong_bases_labels = self.bases_labels[~bln_mat] + self.wrong_predicted_labels = self.predicted_labels[~bln_mat] + self.wrong_competences = self.competences[~bln_mat] + + self.n_queries = len(self.wrong_true_labels) self.n_bases = len(self.bases_labels[0]) self.n_disagree = len(self.disagree_ind) - + with open(self.log_fname,'w') as f: for line in self._get_all_lines(): f.write(str(line)) @@ -44,7 +50,8 @@ def _get_general_lines(self): def _get_agree_lines(self): n_agree = len(self.agree_ind) agree_dis = self._get_distribution(ind=self.agree_ind) - agree_score = self._get_score(self.agree_ind) + agree_score = self._get_score( + self.agree_ind, self.true_labels, self.predicted_labels) predicted_dis = self._get_distribution() lines = [ @@ -64,9 +71,12 @@ def _get_agree_lines(self): def _get_n_right_clf_lines(self): n_right_clf_by_query, n_right_clf_ind = \ - self._get_n_right_clf_stats() + self._get_n_right_clf_stats(self.wrong_true_labels,self.wrong_bases_labels) n_right_clf_dis = self._get_distribution(n_right_clf_by_query) - scores = [self._get_score(n_right_clf_ind[i]) \ + scores = [self._get_score( + n_right_clf_ind[i], + self.wrong_true_labels, + self.wrong_predicted_labels) \ for i in range(len(n_right_clf_dis))] lines = [ @@ -82,8 +92,8 @@ def _get_n_right_clf_lines(self): return lines def _get_disagree_lines(self): - disagree_score = self._get_score(self.disagree_ind) - + disagree_score = self._get_score( + self.disagree_ind, self.true_labels, 
self.predicted_labels) lines = [ "--- Disagreements", "Instances, ratio on queries:", @@ -98,7 +108,7 @@ def _get_disagree_lines(self): def _get_competences_lines(self): mean, mean_by_clf, var, var_by_clf, n_even_max = \ - self._get_competences_stats() + self._get_competences_stats(self.wrong_competences) lines = [ "--- Competences", @@ -182,37 +192,35 @@ def _get_distribution(self, labels=None, ind=None): distribution[l] = unique_counts[i] return distribution - def _get_n_right_clf_stats(self): + def _get_n_right_clf_stats(self,true_labels,bases_labels): n_right_clf_by_query = [] n_right_clf_ind = [[] for i in range(self.n_bases + 1)] - for i,label in enumerate(self.true_labels): - row = self.bases_labels[i] + for i,label in enumerate(true_labels): + row = bases_labels[i] n_right_clf = np.count_nonzero(row == label) n_right_clf_by_query.append(n_right_clf) n_right_clf_ind[n_right_clf].append(i) return n_right_clf_by_query, n_right_clf_ind - def _get_competences_stats(self): - comp = self.competences/self.k - mean = np.mean(comp) - var = np.var(comp) - mean_by_clf = np.mean(comp, axis=0) - var_by_clf = np.var(comp, axis=0) + def _get_competences_stats(self, competences): + competences = competences/self.k + mean = np.mean(competences) + var = np.var(competences) + mean_by_clf = np.mean(competences, axis=0) + var_by_clf = np.var(competences, axis=0) n_even_max = 0 - for c in comp: + for c in competences: max_ = c[np.argmax(c)] n_max = np.count_nonzero(c == max_) if n_max > 1: n_even_max += 1 return mean, mean_by_clf, var, var_by_clf, n_even_max - def _get_score(self, ind): - true_labels = self.true_labels[ind] - labels = self.predicted_labels[ind] - matches = np.equal(true_labels, labels) + def _get_score(self, ind, true_labels, predicted_labels): + matches = np.equal(true_labels[ind], predicted_labels[ind]) score = np.sum(matches) return score @@ -227,21 +235,6 @@ def _get_all_lines(self): lines.extend(self._get_multistats_lines()) return lines - def 
_get_competences_lines(self): - _, mean_by_clf, _, var_by_clf, _ = \ - self._get_competences_stats() - - means = mean_by_clf.reshape(self.n_datasets, -1) - mean_by_dataset = np.mean(means, axis=1) - - lines = super()._get_competences_lines() - lines.extend([ - "Mean by dataset:", - np.round(mean_by_dataset, 3), - ]) - - return lines - def _get_multistats_lines(self): lines = [ "--- Multidatasets", From 9fd1de4642aa114db54662ee4440ef787b332ca9 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Fri, 14 Jan 2022 16:44:39 -0500 Subject: [PATCH 18/21] Retrait du code sur les stats de la SD. --- deslib/base.py | 11 -- deslib/des/base.py | 2 - deslib/multi_datasets.py | 12 -- deslib/util/stats.py | 244 --------------------------------------- 4 files changed, 269 deletions(-) delete mode 100644 deslib/util/stats.py diff --git a/deslib/base.py b/deslib/base.py index 4f20326..e892659 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -24,7 +24,6 @@ from deslib.util import faiss_knn_wrapper from deslib.util.dfp import frienemy_pruning_preprocessed from deslib.util.instance_hardness import hardness_region_competence -from deslib.util.stats import Stats class BaseDS(BaseEstimator, ClassifierMixin): @@ -57,7 +56,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, self.DSEL_perc = DSEL_perc self.knne = knne self.n_jobs = n_jobs - self._set_stats() # Check optional dependency if knn_classifier == 'faiss' and not faiss_knn_wrapper.is_available(): @@ -65,9 +63,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, 'Using knn_classifier="faiss" requires that the FAISS library ' 'be installed.Please check the Installation Guide.') - def _set_stats(self): - self.stats = Stats() - @abstractmethod def select(self, competences): """Select the most competent classifier for @@ -538,7 +533,6 @@ def predict(self, X): # Get the real indices_ of the samples that will be classified # using a DS algorithm. 
ind_ds_original_matrix = ind_disagreement[ind_ds_classifier] - self.stats.disagree_ind = ind_ds_original_matrix if self.needs_proba: selected_probabilities = base_probabilities[ @@ -555,11 +549,6 @@ def predict(self, X): DFP_mask=DFP_mask) predicted_labels[ind_ds_original_matrix] = pred_ds - self.stats.bases_labels = base_predictions - self.stats.agree_ind = ind_all_agree - self.stats.predicted_labels = predicted_labels - self.stats.k = self.k - return self.classes_.take(predicted_labels) def predict_proba(self, X): diff --git a/deslib/des/base.py b/deslib/des/base.py index f518bbf..a749155 100644 --- a/deslib/des/base.py +++ b/deslib/des/base.py @@ -193,8 +193,6 @@ def classify_with_ds(self, query, predictions, probabilities=None, if self.DFP: competences = competences * DFP_mask - self.stats.competences = competences - if self.mode == "selection": # The selected_classifiers matrix is used as a mask to remove # the predictions of certain base classifiers. diff --git a/deslib/multi_datasets.py b/deslib/multi_datasets.py index 09e8624..2ca0b2e 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -18,7 +18,6 @@ majority_voting_rule, aggregate_proba_ensemble_weighted) from deslib.util.instance_hardness import hardness_region_competence -from deslib.util.stats import MultiStats # Créer à partir de KNORA-U class MultiDatasets(BaseDS): @@ -35,9 +34,6 @@ def __init__(self, ds_classifier, pool_classifiers): super(MultiDatasets, self).__init__(pool_classifiers) self.ds_classifier = ds_classifier - def _set_stats(self): - self.stats = MultiStats() - def fit(self, X, y): """Prepare the DS models by setting the KNN algorithm and pre-processing the information required to apply the DS @@ -171,7 +167,6 @@ def _predict_ds(self, X): # Get the real indices_ of the samples that will be classified # using a DS algorithm. 
ind_ds_original_matrix = ind_disagreement[ind_ds_classifier] - self.stats.disagree_ind = ind_ds_original_matrix if ds_classifier.needs_proba: selected_probabilities = base_probabilities[ @@ -195,7 +190,6 @@ def _predict_ds(self, X): merged_left_base_predictions = np.concatenate( merged_left_base_predictions, axis=1) merged_competences = np.concatenate(merged_competences, axis=1) - self.stats.competences = merged_competences if issubclass(type(self.ds_classifier), BaseDCS): pred_ds = self._get_dcs_predicted_label(self.ds_classifier, @@ -206,12 +200,6 @@ def _predict_ds(self, X): predicted_labels[ind_ds_original_matrix] = pred_ds - self.stats.n_datasets = n_datasets - self.stats.bases_labels = merged_base_predictions - self.stats.agree_ind = ind_all_agree - self.stats.predicted_labels = predicted_labels - self.stats.k = self.k - return self.classes_.take(predicted_labels) def _predict_oracle(self, X, y): diff --git a/deslib/util/stats.py b/deslib/util/stats.py deleted file mode 100644 index 7e87e21..0000000 --- a/deslib/util/stats.py +++ /dev/null @@ -1,244 +0,0 @@ -import numpy as np - - -class Stats(): - def __init__(self): - self.agree_ind = [] - self.disagree_ind = [] - self.true_labels = [] - self.bases_labels = [] - self.predicted_labels = [] - self.agree_labels = [] - self.competences = [] - self.log_fname = "log.txt" - - def log_stats(self): - bln_mat = np.equal(self.true_labels,self.predicted_labels) - self.wrong_true_labels = self.true_labels[~bln_mat] - self.wrong_bases_labels = self.bases_labels[~bln_mat] - self.wrong_predicted_labels = self.predicted_labels[~bln_mat] - self.wrong_competences = self.competences[~bln_mat] - - self.n_queries = len(self.wrong_true_labels) - self.n_bases = len(self.bases_labels[0]) - self.n_disagree = len(self.disagree_ind) - - with open(self.log_fname,'w') as f: - for line in self._get_all_lines(): - f.write(str(line)) - f.write("\n") - - def _get_all_lines(self): - lines = [] - lines.extend(self._get_general_lines()) - 
lines.extend(self._get_agree_lines()) - lines.extend(self._get_n_right_clf_lines()) - if self.n_disagree > 0: - lines.extend(self._get_disagree_lines()) - lines.extend(self._get_competences_lines()) - lines.extend(self._get_competences_reliability_lines()) - return lines - - def _get_general_lines(self): - lines = [ - "Queries:", - self.n_queries, - ] - - return lines - - def _get_agree_lines(self): - n_agree = len(self.agree_ind) - agree_dis = self._get_distribution(ind=self.agree_ind) - agree_score = self._get_score( - self.agree_ind, self.true_labels, self.predicted_labels) - predicted_dis = self._get_distribution() - - lines = [ - "--- Agreements", - "Instances, ratio on queries:", - n_agree, - round(n_agree / self.n_queries, 3), - "Distribution, ratio on predictions:", - agree_dis, - np.round(agree_dis / predicted_dis, 3), - "Score, ratio on agreements:", - agree_score, - round(agree_score / n_agree, 3), - ] - - return lines - - def _get_n_right_clf_lines(self): - n_right_clf_by_query, n_right_clf_ind = \ - self._get_n_right_clf_stats(self.wrong_true_labels,self.wrong_bases_labels) - n_right_clf_dis = self._get_distribution(n_right_clf_by_query) - scores = [self._get_score( - n_right_clf_ind[i], - self.wrong_true_labels, - self.wrong_predicted_labels) \ - for i in range(len(n_right_clf_dis))] - - lines = [ - "--- Right classifiers:", - "Distribution, ratio on queries:", - n_right_clf_dis, - np.round(n_right_clf_dis / self.n_queries, 3), - "Scores, ratio on N right clf", - scores, - np.round(scores / n_right_clf_dis, 3), - ] - - return lines - - def _get_disagree_lines(self): - disagree_score = self._get_score( - self.disagree_ind, self.true_labels, self.predicted_labels) - lines = [ - "--- Disagreements", - "Instances, ratio on queries:", - self.n_disagree, - round(self.n_disagree / self.n_queries, 3), - "Score, ratio on disagreements:", - disagree_score, - round(disagree_score / self.n_disagree, 3), - ] - - return lines - - def _get_competences_lines(self): - 
mean, mean_by_clf, var, var_by_clf, n_even_max = \ - self._get_competences_stats(self.wrong_competences) - - lines = [ - "--- Competences", - "Mean:", - round(mean, 3), - "Mean by classifier:", - np.round(mean_by_clf, 3), - "Var:", - round(var, 3), - "Var by classifier:", - np.round(var_by_clf, 3), - "Even max competences times, \nratio on disagreements:", - n_even_max, - round(n_even_max / self.n_disagree, 3), - ] - - return lines - - def _get_competences_reliability_lines(self): - true_labels_by_base = np.tile(self.true_labels,(self.n_bases,1)) - correct_bases_bln_array = np.equal( - self.bases_labels.T,true_labels_by_base) - n_queries = self.bases_labels.shape[0] - n_correct_labels_by_base = np.sum(correct_bases_bln_array,axis=1) - acc_by_base = np.round(n_correct_labels_by_base/n_queries,3) - - comp = self.competences/self.k - n_incorrect_labels_by_base = \ - len(self.predicted_labels)-n_correct_labels_by_base - - correct_comp = comp.T*correct_bases_bln_array - correct_comp_by_base = np.sum(correct_comp,axis=1) - correct_comp_by_base /= n_correct_labels_by_base - mean_correct_comp_by_base = np.round(correct_comp_by_base,3) - mean = mean_correct_comp_by_base - mean = np.repeat(mean,n_queries).reshape(self.n_bases,-1) - correct_comp_by_base = np.sum((correct_comp-mean)**2,axis=1) - correct_comp_by_base /= n_correct_labels_by_base - std_correct_comp_by_base = np.round(np.sqrt(correct_comp_by_base),3) - - incorrect_comp = comp.T*~correct_bases_bln_array - incorrect_comp_by_base = np.sum(incorrect_comp,axis=1) - incorrect_comp_by_base /= n_incorrect_labels_by_base - mean_incorrect_comp_by_base = np.round(incorrect_comp_by_base,3) - mean = mean_incorrect_comp_by_base - mean = np.repeat(mean,n_queries).reshape(self.n_bases,-1) - incorrect_comp_by_base = np.sum((incorrect_comp-mean)**2,axis=1) - incorrect_comp_by_base /= n_incorrect_labels_by_base - std_incorrect_comp_by_base = np.round( - np.sqrt(incorrect_comp_by_base),3) - - lines = [ - "--- Competence reliability", 
- "Acc:", - round(np.mean(acc_by_base),3), - "(by base):", - acc_by_base, - "Competence mean & std when well clasified:", - round(np.mean(mean_correct_comp_by_base),3), - round(np.mean(std_correct_comp_by_base),3), - "(by base):", - mean_correct_comp_by_base, - std_correct_comp_by_base, - "Competence mean & std when not well clasified:", - round(np.mean(mean_incorrect_comp_by_base),3), - round(np.mean(std_incorrect_comp_by_base),3), - "(by base):", - mean_incorrect_comp_by_base, - std_incorrect_comp_by_base, - ] - - return lines - - def _get_distribution(self, labels=None, ind=None): - labels = self.predicted_labels if labels is None else labels - max_label = max(labels) - if ind is not None: labels = labels[ind] - unique_labels, unique_counts = np.unique(labels, return_counts=True) - distribution = np.full(max_label+1,0) - for i,l in enumerate(unique_labels): - distribution[l] = unique_counts[i] - return distribution - - def _get_n_right_clf_stats(self,true_labels,bases_labels): - n_right_clf_by_query = [] - n_right_clf_ind = [[] for i in range(self.n_bases + 1)] - - for i,label in enumerate(true_labels): - row = bases_labels[i] - n_right_clf = np.count_nonzero(row == label) - n_right_clf_by_query.append(n_right_clf) - n_right_clf_ind[n_right_clf].append(i) - - return n_right_clf_by_query, n_right_clf_ind - - def _get_competences_stats(self, competences): - competences = competences/self.k - mean = np.mean(competences) - var = np.var(competences) - mean_by_clf = np.mean(competences, axis=0) - var_by_clf = np.var(competences, axis=0) - n_even_max = 0 - - for c in competences: - max_ = c[np.argmax(c)] - n_max = np.count_nonzero(c == max_) - if n_max > 1: n_even_max += 1 - - return mean, mean_by_clf, var, var_by_clf, n_even_max - - def _get_score(self, ind, true_labels, predicted_labels): - matches = np.equal(true_labels[ind], predicted_labels[ind]) - score = np.sum(matches) - return score - - -class MultiStats(Stats): - def __init__(self): - super().__init__() - 
self.n_datasets = 1 - - def _get_all_lines(self): - lines = super()._get_all_lines() - lines.extend(self._get_multistats_lines()) - return lines - - def _get_multistats_lines(self): - lines = [ - "--- Multidatasets", - self.n_datasets - ] - - return lines From a71dcd7dbc0796227e895e5df0b81ecb615f5c6b Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Fri, 14 Jan 2022 16:56:59 -0500 Subject: [PATCH 19/21] Retrait de multiknop. --- deslib/des/multi_knop.py | 256 --------------------------------------- 1 file changed, 256 deletions(-) delete mode 100644 deslib/des/multi_knop.py diff --git a/deslib/des/multi_knop.py b/deslib/des/multi_knop.py deleted file mode 100644 index c1dac6d..0000000 --- a/deslib/des/multi_knop.py +++ /dev/null @@ -1,256 +0,0 @@ -# coding=utf-8 - -# Author: Rafael Menelau Oliveira e Cruz -# -# License: BSD 3 clause - -import numpy as np - -from deslib.des.base import BaseDS -from deslib.des.knop import KNOP -from sklearn.utils.validation import (check_X_y, check_is_fitted, check_array, - check_random_state) - - -class MultiKNOP(KNOP): - def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, - safe_k=None, IH_rate=0.30, random_state=None, voting='hard', - knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): - - super(KNOP, self).__init__(None, k, - DFP=DFP, - with_IH=with_IH, - safe_k=safe_k, - IH_rate=IH_rate, - needs_proba=True, - random_state=random_state, - knn_classifier=knn_classifier, - knne=knne, - DSEL_perc=DSEL_perc, - n_jobs=n_jobs) - self.ds_classifiers = [] - for i in range(len(pool_classifiers)): - knop = KNOP(pool_classifiers[i], k, - DFP=DFP, - with_IH=with_IH, - safe_k=safe_k, - IH_rate=IH_rate, - random_state=random_state, - knn_classifier=knn_classifier, - knne=knne, - DSEL_perc=DSEL_perc, - n_jobs=n_jobs) - self.ds_classifiers.append(knop) - - - """Multi k-Nearest Output Profiles (MultiKNOP). 
- """ - def fit(self, X, y): - """Train the DS model by setting the KNN algorithm and - pre-process the information required to apply the DS - methods. In this case, the scores of the base classifiers for - the dynamic selection dataset (DSEL) are pre-calculated to - transform each sample in DSEL into an output profile. - - Parameters - ---------- - X : array of shape n_datasets, n_samples, n_features) - Data used to fit the model. - - y : array of shape (n_datasets, n_samples) - class labels of each example in X. - - Returns - ------- - self - """ - if len(X) == 1 or len(y) == 1: - raise ValueError("Error. MultiKNOP does not accept one dataset!") - """ - for predict()... - for i in range(len(y)-1): - if np.array_equal(y[0],y[i+1]): - raise ValueError( - "Error. All datasets queries must match exactly!") - """ - - datasets_dsel_scores = [] - datasets_DSEL_processed_ = [] - # Process each dataset - for i in range(len(X)): - self.ds_classifiers[i].fit(X[i], y[i]) - if self.ds_classifiers[i].n_classes_ == 1: - raise ValueError( - "Error. 
MultiKNOP does not accept one class datasets!") - self.ds_classifiers[i]._check_predict_proba() - self.ds_classifiers[i].dsel_scores_ = \ - self.ds_classifiers[i]._preprocess_dsel_scores() - datasets_dsel_scores.append(self.ds_classifiers[i].dsel_scores_) - datasets_DSEL_processed_.append( - self.ds_classifiers[i].DSEL_processed_) - - self.dsel_scores_ = np.concatenate(datasets_dsel_scores, axis=1) - self.DSEL_processed_ = np.concatenate(datasets_DSEL_processed_, axis=1) - self.n_classifiers_ = self.dsel_scores_.shape[1] - - # Reassignment - self.DSEL_target_ = self.ds_classifiers[0].DSEL_target_ - self.n_samples_ = self.ds_classifiers[0].n_samples_ - self.n_classes_ = self.ds_classifiers[0].n_classes_ - self.knn_class_ = self.ds_classifiers[0].knn_class_ - self.k_ = self.ds_classifiers[0].k_ - self.classes_ = self.ds_classifiers[0].classes_ - - # Reshape DSEL_scores as a 2-D array for nearest neighbor calculations - dsel_output_profiles = self.dsel_scores_.reshape(self.n_samples_, - self.n_classifiers_ * - self.n_classes_) - - self._fit_OP(dsel_output_profiles, self.DSEL_target_, self.k_) - - return self - - def _fit_OP(self, X_op, y_op, k): - """ Fit the set of output profiles. - - Parameters - ---------- - X_op : array of shape (n_samples, n_features) - Output profiles of the training data. n_features is equals - to (n_classifiers x n_classes). - - y_op : array of shape (n_samples) - Class labels of each sample in X_op. - - k : int - Number of output profiles used in the region of competence - estimation. - - """ - self.op_knn_ = self.knn_class_(k) - - if self.n_classes_ == 2: - # Get only the scores for one class since they are complementary - X_temp = X_op[:, ::2] - self.op_knn_.fit(X_temp, y_op) - else: - self.op_knn_.fit(X_op, y_op) - - def _get_similar_out_profiles(self, probabilities): - """Get the most similar output profiles of the query sample. 
- - Parameters - ---------- - probabilities : array of shape (n_samples, n_classifiers, n_classes) - predictions of each base classifier for all samples. - - Returns - ------- - dists : list of shape = [n_samples, k] - The distances between the query and each sample in the region - of competence. The vector is ordered in an ascending fashion. - - idx : list of shape = [n_samples, k] - Indices of the instances belonging to the region of competence of - the given query sample. - """ - - if self.n_classes_ == 2: - # Get only the scores for one class since they are complementary - query_op = probabilities[:, :, 0] - else: - query_op = probabilities.reshape((probabilities.shape[0], - self.n_classifiers_ * - self.n_classes_)) - - dists, idx = self.op_knn_.kneighbors(query_op, n_neighbors=self.k_, - return_distance=True) - return dists, np.atleast_2d(idx) - - def predict(self, X): - """Predict the class label for each sample in X. - Parameters - ---------- - X : array of shape (n_samples, n_features) - The input data. - Returns - ------- - predicted_labels : array of shape (n_samples) - Predicted class label for each sample in X. - """ - # Check if the DS model was trained - #check_is_fitted(self, - # ["DSEL_processed_", "DSEL_data_", "DSEL_target_"]) - - # Check if X is a valid input - #for i in range(len(X)): - # X[i] = check_array(X[i]) - # self._check_num_features(X[i]) - - n_samples = X[0].shape[0] - predicted_labels = np.empty(n_samples, dtype=np.intp) - - base_probabilities = [] - for i in range(len(X)): - base_probabilities.append( - self.ds_classifiers[i]._predict_proba_base(X[i])) - base_probabilities = np.concatenate(base_probabilities,axis=1) - base_predictions = base_probabilities.argmax(axis=2) - - all_agree_vector = BaseDS._all_classifier_agree(base_predictions) - ind_all_agree = np.where(all_agree_vector)[0] - - # Since the predictions are always the same, get the predictions of the - # first base classifier. 
- if ind_all_agree.size: - predicted_labels[ind_all_agree] = base_predictions[ - ind_all_agree, 0] - - # For the samples with disagreement, perform the dynamic selection - # steps. First step is to collect the samples with disagreement - # between base classifiers - ind_disagreement = np.where(~all_agree_vector)[0] - if ind_disagreement.size: - - X_DS = X[0][ind_disagreement, :] - - # Then, we estimate the nearest neighbors for all samples that - # we need to call DS routines - distances, neighbors = None, None - #distances, neighbors = self._get_region_competence(X_DS) - - # IH was not considered. So all samples with disagreement are - # passed down to the DS algorithm - ind_ds_classifier = np.arange(ind_disagreement.size) - - # At this stage the samples which all base classifiers agrees or - # that are associated with low hardness were already classified. - # The remaining samples are now passed down to the DS techniques - # for classification. - - # First check whether there are still samples to be classified. - if ind_ds_classifier.size: - - DFP_mask = np.ones( - (ind_ds_classifier.size, self.n_classifiers_)) - - # Get the real indices_ of the samples that will be classified - # using a DS algorithm. - ind_ds_original_matrix = ind_disagreement[ind_ds_classifier] - - if self.needs_proba or self.voting == 'soft': - selected_probabilities = base_probabilities[ - ind_ds_original_matrix] - else: - selected_probabilities = None - - pred_ds = self.classify_with_ds(X_DS[ind_ds_classifier], - base_predictions[ - ind_ds_original_matrix], - selected_probabilities, - neighbors=neighbors, - distances=distances, - DFP_mask=DFP_mask) - predicted_labels[ind_ds_original_matrix] = pred_ds - - return self.classes_.take(predicted_labels) From 2e36290f67b42963572bc732ba431928182e9843 Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Wed, 23 Feb 2022 22:17:29 -0500 Subject: [PATCH 20/21] Sanitazing. 
--- deslib/base.py | 26 ++++++-------------------- deslib/dcs/base.py | 3 +-- deslib/dcs/ola.py | 3 +-- deslib/des/base.py | 4 +--- deslib/des/knop.py | 4 +--- deslib/des/knora_e.py | 4 +--- deslib/des/knora_u.py | 4 +--- deslib/multi_datasets.py | 4 ++-- 8 files changed, 14 insertions(+), 38 deletions(-) diff --git a/deslib/base.py b/deslib/base.py index e892659..e8f51cf 100644 --- a/deslib/base.py +++ b/deslib/base.py @@ -16,7 +16,7 @@ from sklearn.ensemble import BaseEnsemble, BaggingClassifier from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -from sklearn.preprocessing import LabelEncoder, normalize +from sklearn.preprocessing import LabelEncoder from sklearn.utils.validation import (check_X_y, check_is_fitted, check_array, check_random_state) @@ -40,8 +40,8 @@ class BaseDS(BaseEstimator, ClassifierMixin): @abstractmethod def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, needs_proba=False, - random_state=None, knn_classifier='knn', - knn_metric='minkowski', DSEL_perc=0.5, knne=False, n_jobs=-1): + random_state=None, knn_classifier='knn', DSEL_perc=0.5, + knne=False, n_jobs=-1): self.pool_classifiers = pool_classifiers self.k = k @@ -52,7 +52,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, self.needs_proba = needs_proba self.random_state = random_state self.knn_classifier = knn_classifier - self.knn_metric = knn_metric self.DSEL_perc = DSEL_perc self.knne = knne self.n_jobs = n_jobs @@ -245,7 +244,7 @@ class labels of each example in X. # validate the value of k self._validate_k() - self._set_region_of_competence_algorithm(X_dsel) + self._set_region_of_competence_algorithm() self._fit_region_competence(X_dsel, y_dsel) # validate the IH @@ -316,7 +315,6 @@ def _fit_region_competence(self, X, y): class labels of each sample in X. 
""" - if self.knn_metric == 'cosine': X = normalize(X) self.roc_algorithm_.fit(X, y) def _set_dsel(self, X, y): @@ -339,24 +337,13 @@ class labels of each sample in X. self.n_samples_ = self.DSEL_target_.size self.DSEL_processed_, self.BKS_DSEL_ = self._preprocess_dsel() - def _set_region_of_competence_algorithm(self, X): - - algorithm = "auto" - metric = 'minkowski' - metric_params = None - - if self.knn_metric == 'mahalanobis': - metric = 'mahalanobis' - metric_params = {'V': np.cov(X)} - algorithm = "brute" + def _set_region_of_competence_algorithm(self): if self.knn_classifier is None or self.knn_classifier in ['knn', 'sklearn']: knn_class = functools.partial(KNeighborsClassifier, n_jobs=self.n_jobs, - algorithm=algorithm, - metric=metric, - metric_params=metric_params) + algorithm="auto") elif self.knn_classifier == 'faiss': knn_class = functools.partial( faiss_knn_wrapper.FaissKNNClassifier, @@ -453,7 +440,6 @@ def predict(self, X): # steps. First step is to collect the samples with disagreement # between base classifiers ind_disagreement = np.where(~all_agree_vector)[0] - ind_disagreement = np.asarray(range(len(predicted_labels))) if ind_disagreement.size: X_DS = X[ind_disagreement, :] diff --git a/deslib/dcs/base.py b/deslib/dcs/base.py index 3ef54e7..974efd7 100644 --- a/deslib/dcs/base.py +++ b/deslib/dcs/base.py @@ -21,7 +21,7 @@ class BaseDCS(BaseDS): def __init__(self, pool_classifiers=None, k=7, DFP=False, safe_k=None, with_IH=False, IH_rate=0.30, selection_method='best', diff_thresh=0.1, random_state=None, knn_classifier='knn', - knn_metric='minkowski', DSEL_perc=0.5, + DSEL_perc=0.5, knne=False, n_jobs=-1): super(BaseDCS, self).__init__(pool_classifiers=pool_classifiers, k=k, @@ -29,7 +29,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, safe_k=None, IH_rate=IH_rate, random_state=random_state, knn_classifier=knn_classifier, - knn_metric=knn_metric, DSEL_perc=DSEL_perc, knne=knne, n_jobs=n_jobs) diff --git a/deslib/dcs/ola.py 
b/deslib/dcs/ola.py index b3d81b5..3ea0f35 100644 --- a/deslib/dcs/ola.py +++ b/deslib/dcs/ola.py @@ -111,7 +111,7 @@ class :class:`FaissKNNClassifier` def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, selection_method='best', diff_thresh=0.1, random_state=None, knn_classifier='knn', - knn_metric='minkowski', knne=False, DSEL_perc=0.5, n_jobs=-1): + knne=False, DSEL_perc=0.5, n_jobs=-1): super(OLA, self).__init__(pool_classifiers=pool_classifiers, k=k, DFP=DFP, with_IH=with_IH, safe_k=safe_k, IH_rate=IH_rate, @@ -119,7 +119,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, diff_thresh=diff_thresh, random_state=random_state, knn_classifier=knn_classifier, - knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/des/base.py b/deslib/des/base.py index a749155..1530e64 100644 --- a/deslib/des/base.py +++ b/deslib/des/base.py @@ -21,8 +21,7 @@ class BaseDES(BaseDS): def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, mode='selection', needs_proba=False, random_state=None, - knn_classifier='knn', knn_metric='minkowski', knne=False, - DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): super(BaseDES, self).__init__(pool_classifiers=pool_classifiers, k=k, @@ -33,7 +32,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, needs_proba=needs_proba, random_state=random_state, knn_classifier=knn_classifier, - knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) self.mode = mode diff --git a/deslib/des/knop.py b/deslib/des/knop.py index 1cffc36..adffb36 100644 --- a/deslib/des/knop.py +++ b/deslib/des/knop.py @@ -106,8 +106,7 @@ class :class:`FaissKNNClassifier` """ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, random_state=None, - knn_classifier='knn', knn_metric='minkowski', knne=False, - 
DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): super(KNOP, self).__init__(pool_classifiers, k, DFP=DFP, @@ -118,7 +117,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, needs_proba=True, random_state=random_state, knn_classifier=knn_classifier, - knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/des/knora_e.py b/deslib/des/knora_e.py index 6619b03..fa0298d 100644 --- a/deslib/des/knora_e.py +++ b/deslib/des/knora_e.py @@ -99,8 +99,7 @@ class :class:`FaissKNNClassifier` def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, random_state=None, - knn_classifier='knn', knn_metric='minkowski', knne=False, - DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): super(KNORAE, self).__init__(pool_classifiers=pool_classifiers, k=k, @@ -110,7 +109,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, IH_rate=IH_rate, random_state=random_state, knn_classifier=knn_classifier, - knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/des/knora_u.py b/deslib/des/knora_u.py index c4403fc..e6cdc89 100644 --- a/deslib/des/knora_u.py +++ b/deslib/des/knora_u.py @@ -95,8 +95,7 @@ class :class:`FaissKNNClassifier` def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, safe_k=None, IH_rate=0.30, random_state=None, - knn_classifier='knn', knn_metric='minkowski', knne=False, - DSEL_perc=0.5, n_jobs=-1): + knn_classifier='knn', knne=False, DSEL_perc=0.5, n_jobs=-1): super(KNORAU, self).__init__(pool_classifiers, k, DFP=DFP, with_IH=with_IH, @@ -105,7 +104,6 @@ def __init__(self, pool_classifiers=None, k=7, DFP=False, with_IH=False, mode='weighting', random_state=random_state, knn_classifier=knn_classifier, - knn_metric=knn_metric, knne=knne, DSEL_perc=DSEL_perc, n_jobs=n_jobs) diff --git a/deslib/multi_datasets.py 
b/deslib/multi_datasets.py index 2ca0b2e..4ef4188 100644 --- a/deslib/multi_datasets.py +++ b/deslib/multi_datasets.py @@ -105,7 +105,7 @@ def _predict_ds(self, X): predicted_labels = np.empty(n_samples, dtype=np.intp) all_agree_vector = BaseDS._all_classifier_agree(merged_base_predictions) - ind_all_agree = np.where(all_agree_vector)[0] + ind_all_agree = np.where(all_agree_vector)[0] # Since the predictions are always the same, get the predictions of the # first base classifier. @@ -337,7 +337,7 @@ def _get_DFP_mask(self, ds_classifier, ind_ds_classifier, neighbors): DFP_mask = np.ones( (ind_ds_classifier.size, self.n_classifiers_)) - def _get_competences(self, ds_classifier, query, predictions, + def _get_competences(self, ds_classifier, query, predictions, probabilities=None, neighbors=None, distances=None, DFP_mask=None): """ From 871abef20368c71db9dd846e377d30e86cbf2cfe Mon Sep 17 00:00:00 2001 From: Pierre-Marc Thibault Date: Mon, 16 May 2022 23:40:41 -0400 Subject: [PATCH 21/21] Multidatasets tests. 
---
 deslib/tests/test_multidatasets.py | 129 +++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 deslib/tests/test_multidatasets.py

diff --git a/deslib/tests/test_multidatasets.py b/deslib/tests/test_multidatasets.py
new file mode 100644
index 0000000..83ec0a1
--- /dev/null
+++ b/deslib/tests/test_multidatasets.py
@@ -0,0 +1,129 @@
+import numpy as np
+import pytest
+import math
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+from sklearn.metrics import precision_recall_fscore_support as prf
+from sklearn.calibration import CalibratedClassifierCV as CC
+from sklearn.linear_model import Perceptron
+from sklearn.naive_bayes import GaussianNB as GNB
+from deslib.base import BaseDS
+from deslib.multi_datasets import MultiDatasets
+# Static techniques
+from deslib.static.oracle import Oracle
+from deslib.static.stacked import StackedClassifier
+# DCS techniques
+from deslib.dcs.a_posteriori import APosteriori
+from deslib.dcs.a_priori import APriori
+from deslib.dcs.lca import LCA
+from deslib.dcs.mcb import MCB
+from deslib.dcs.mla import MLA
+from deslib.dcs.ola import OLA
+from deslib.dcs.rank import Rank
+from deslib.des import DESKL
+# DES techniques
+from deslib.des.des_knn import DESKNN
+from deslib.des.des_p import DESP
+from deslib.des.knop import KNOP
+from deslib.des.knora_e import KNORAE
+from deslib.des.knora_u import KNORAU
+from deslib.des.meta_des import METADES
+
+
+# ----- Integration tests -----
+
+def _prepare_view(seed, class_weights):
+    """Build one synthetic two-class view and a Gaussian NB fitted on it.
+
+    One ``RandomState(seed)`` is consumed, in this order, by the data
+    generation, the train/test split and the train/DSEL split. The scores
+    hard-coded in ``test`` were presumably recorded with this sequence
+    (TODO confirm), so keep the order of the calls unchanged.
+
+    Returns
+    -------
+    tuple
+        ``(X_dsel, y_dsel, X_test, y_test, classifier)``.
+    """
+    rng = np.random.RandomState(seed)
+
+    # Generate the data, then hold out half of it for testing.
+    X, y = make_classification(n_classes=2, n_samples=1000,
+                               weights=class_weights, random_state=rng)
+    X_fit, X_test, y_fit, y_test = train_test_split(
+        X, y, test_size=0.5, random_state=rng)
+
+    # Standardise with statistics taken from the non-test half only.
+    scaler = StandardScaler()
+    X_fit = scaler.fit_transform(X_fit)
+    X_test = scaler.transform(X_test)
+
+    # Split that half again: one part trains the classifier, the other is
+    # returned as the DSEL set.
+    X_fit, X_dsel, y_fit, y_dsel = train_test_split(
+        X_fit, y_fit, test_size=0.5, random_state=rng)
+
+    classifier = GNB()
+    classifier.fit(X_fit, y_fit)
+    return X_dsel, y_dsel, X_test, y_test, classifier
+
+
+def setup_classifiers():
+    """Create two independent dataset views and one classifier pool per view.
+
+    Returns
+    -------
+    tuple
+        ``(X_dsel, y_dsel, X_test, y_test, X_dsel2, y_dsel2, X_test2,
+        y_test2, pool_classifiers)``. ``pool_classifiers`` is an array with
+        one row per view; each row holds that view's fitted classifier twice.
+    """
+    X_dsel, y_dsel, X_test, y_test, gnb1 = _prepare_view(
+        123456, [0.2, 0.8])
+    X_dsel2, y_dsel2, X_test2, y_test2, gnb2 = _prepare_view(
+        654321, [0.3, 0.7])
+    pool_classifiers = np.asarray([[gnb1, gnb1], [gnb2, gnb2]])
+    return (X_dsel, y_dsel, X_test, y_test,
+            X_dsel2, y_dsel2, X_test2, y_test2, pool_classifiers)
+
+
+@pytest.mark.parametrize('params', [
+    [Oracle(), 0.962],
+    [StackedClassifier(), 0.85],
+    [KNORAU(), 0.764],
+    [KNORAE(), 0.772],
+    [DESP(), 0.666],
+    [OLA(), 0.830],
+    [LCA(), 0.814],
+    [MLA(), 0.810],
+    [MCB(random_state=0), 0.806],
+    [APriori(random_state=0), 0.796],
+    [Rank(), 0.824],
+    [APosteriori(random_state=0), 0.782],
+    [METADES(), 0.690],
+    [KNOP(), 0.792],
+    [DESKL(), 0.680]
+])
+def test(params):
+    """Fit MultiDatasets around one technique and check its test score.
+
+    ``params`` is ``[technique, expected]``; ``expected`` is compared with
+    the micro-averaged precision of the predictions against the first
+    view's test labels.
+    """
+    ds_technique, expected_score = params
+    (X_dsel, y_dsel, X_test, y_test,
+     X_dsel2, y_dsel2, X_test2, _, pool_classifiers) = setup_classifiers()
+
+    model = MultiDatasets(ds_technique, pool_classifiers)
+    model.fit(np.asarray([X_dsel, X_dsel2]), np.asarray([y_dsel, y_dsel2]))
+
+    # NOTE(review): the labels are passed to predict as well, presumably for
+    # techniques such as Oracle -- confirm against MultiDatasets.predict.
+    y_pred = model.predict(np.asarray([X_test, X_test2]), y_test)
+
+    micro_precision = prf(y_test, y_pred, average='micro')[0]
+    assert math.isclose(micro_precision, expected_score)