You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am trying to implement a custom transformer for multilabel SMOTE in my pipeline. However, I constantly run into:
"All intermediate steps of the chain should be estimators that implement fit and transform or fit_resample (but not both) or be a string 'passthrough' 'MLSMOTE_resampler(n_samples=100)' (type <class 'upsampling_multilabel_data_with_mlsmote.MLSMOTE_resampler'>) doesn't). Here's my code for the class
import numpy as np
import pandas as pd
import random
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
def get_tail_label(df: pd.DataFrame, ql=[0.05, 1.]) -> list:
"""
Find the underrepresented targets.
Underrepresented targets are those which are observed less than the median occurence.
Targets beyond a quantile limit are filtered.
"""
irlbl = df.sum(axis=0)
irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & ((irlbl < irlbl.quantile(ql[1])))] # Filtering
irlbl = irlbl.max() / irlbl
threshold_irlbl = irlbl.median()
tail_label = irlbl[irlbl > threshold_irlbl].index.tolist()
return tail_label
def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.05, 1.]):
"""
return
X_sub: pandas.DataFrame, the feature vector minority dataframe
y_sub: pandas.DataFrame, the target vector minority dataframe
"""
tail_labels = get_tail_label(y, ql=ql)
index = y[y[tail_labels].any(axis=1)].index.tolist()
X_sub = X[X.index.isin(index)].reset_index(drop=True)
y_sub = y[y.index.isin(index)].reset_index(drop=True)
return X_sub, y_sub
def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
"""
Give index of 10 nearest neighbor of all the instance
args
X: np.array, array whose nearest neighbor has to find
return
indices: list of list, index of 5 NN of each element in X
"""
nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='brute').fit(X)
euclidean, indices = nbs.kneighbors(X)
return indices
def MLSMOTE(X, y, n_sample, neigh=5):
"""
Give the augmented data using MLSMOTE algorithm
args
X: pandas.DataFrame, input vector DataFrame
y: pandas.DataFrame, feature vector dataframe
n_sample: int, number of newly generated sample
return
new_X: pandas.DataFrame, augmented feature vector data
target: pandas.DataFrame, augmented target vector data
"""
indices2 = nearest_neighbour(X, neigh=5)
n = len(indices2)
new_X = np.zeros((n_sample, X.shape[1]))
target = np.zeros((n_sample, y.shape[1]))
for i in range(n_sample):
reference = random.randint(0, n - 1)
neighbor = random.choice(list(indices2[reference, 1:]))
all_point = indices2[reference]
nn_df = y[y.index.isin(all_point)]
ser = nn_df.sum(axis=0, skipna=True)
target[i] = np.array([1 if val > 0 else 0 for val in ser])
ratio = random.random()
gap = X.loc[reference, :] - X.loc[neighbor, :]
new_X[i] = np.array(X.loc[reference, :] + ratio * gap)
new_X = pd.DataFrame(new_X, columns=X.columns)
target = pd.DataFrame(target, columns=y.columns)
return new_X, target
class MLSMOTE_resampler(BaseEstimator, TransformerMixin):
def __init__(self, n_samples=500, neigh=5):
self.n_samples = n_samples
self.neigh = neigh
def _fit_resample(self, X, y):
X = pd.DataFrame.sparse.from_spmatrix(X)
X_sub, y_sub = get_minority_samples(X, y)
X_res, y_res = MLSMOTE(X_sub, y_sub)
X_con = pd.concat(X, X_res, ignore_index=True)
y_con = pd.concat(y, y_res, ignore_index=True)
return X_con, y_con
This is my pipeline:
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from ml_pipeline_preparation tokenize
from upsampling_multilabel_data_with_mlsmote import MLSMOTE
pipe = Pipeline([
('vect', CountVectorizer(tokenizer=tokenize)),
('tfidf', TfidfTransformer()),
('mlsmote', sampler()),
('clf', MultiOutputClassifier(RandomForestClassifier()))
])
The text was updated successfully, but these errors were encountered:
@solegalli I tried it with and without still with the same result. The reason why I tried it with was just to follow the SMOTE class of imbalanced-learn.
I am trying to implement a custom transformer for multilabel SMOTE in my pipeline. However, I constantly run into:
"All intermediate steps of the chain should be estimators that implement fit and transform or fit_resample (but not both) or be a string 'passthrough' 'MLSMOTE_resampler(n_samples=100)' (type <class 'upsampling_multilabel_data_with_mlsmote.MLSMOTE_resampler'>) doesn't). Here's my code for the class
This is my pipeline:
The text was updated successfully, but these errors were encountered: