-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
48 lines (42 loc) · 2.76 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Columns holding a Yes/No flag that is encoded as 1/0.
_BINARY_COLUMNS = ['SeniorCitizen', 'Dependents', 'PhoneService', 'PaperlessBilling']

# Raw columns retained from the input frame when option == "Batch".
_BATCH_INPUT_COLUMNS = [
    'SeniorCitizen', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges',
]

# Exact column set, and column order, of the frame returned by preprocess().
_MODEL_COLUMNS = [
    'SeniorCitizen', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges',
    'TotalCharges', 'MultipleLines_No_phone_service', 'MultipleLines_Yes', 'InternetService_Fiber_optic',
    'InternetService_No', 'OnlineSecurity_No_internet_service', 'OnlineSecurity_Yes',
    'OnlineBackup_No_internet_service', 'TechSupport_No_internet_service', 'TechSupport_Yes',
    'StreamingTV_No_internet_service', 'StreamingTV_Yes', 'StreamingMovies_No_internet_service',
    'StreamingMovies_Yes', 'Contract_One_year', 'Contract_Two_year', 'PaymentMethod_Electronic_check',
]

# Numeric columns rescaled to the [0, 1] range.
_SCALED_COLUMNS = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 'Yes'/'No' become 1/0; flags that already arrive as 1/0 pass through unchanged
# instead of being turned into NaN. Any other value still maps to NaN.
_YES_NO_CODES = {'Yes': 1, 'No': 0, 1: 1, 0: 0}


def preprocess(df: pd.DataFrame, option: str) -> pd.DataFrame:
    """Encode and scale a customer frame into the fixed model column layout.

    Steps, in order:

    1. Encode the Yes/No flag columns (``SeniorCitizen``, ``Dependents``,
       ``PhoneService``, ``PaperlessBilling``) as 1/0.
    2. One-hot encode the remaining categorical columns with
       ``pd.get_dummies`` and reindex to the fixed output column list;
       expected columns that are absent are filled with 0 and any other
       columns are dropped.
    3. Min-max scale ``tenure``, ``MonthlyCharges`` and ``TotalCharges``.

    Args:
        df: Input frame. It is not modified; the function works on a copy.
        option: ``"Online"`` processes the frame as given. ``"Batch"`` first
            narrows the frame to the 16 raw feature columns, so unrelated
            columns are never one-hot encoded.

    Returns:
        A new DataFrame with exactly the 23 model columns.

    Raises:
        ValueError: If ``option`` is neither ``"Online"`` nor ``"Batch"``.
        KeyError: If a required input column is missing from ``df``.
    """
    # Validate the mode before doing any work, so an unsupported option can
    # never yield a half-processed frame.
    if option == "Online":
        frame = df.copy()
    elif option == "Batch":
        frame = df[_BATCH_INPUT_COLUMNS].copy()
    else:
        raise ValueError(
            f"Incorrect operational options: {option!r} (expected 'Online' or 'Batch')"
        )

    # Encode binary categorical features on the private copy, so calling
    # preprocess() again on the same input frame gives the same result.
    frame[_BINARY_COLUMNS] = frame[_BINARY_COLUMNS].apply(
        lambda flags: flags.map(_YES_NO_CODES)
    )

    # Encode the categorical features that have more than two categories.
    # NOTE(review): get_dummies names columns "<column>_<value>", so a name such
    # as 'InternetService_Fiber_optic' is only populated when the raw value is
    # literally 'Fiber_optic'; otherwise reindex fills it with 0. Confirm the
    # callers supply underscore-style category values.
    frame = pd.get_dummies(frame).reindex(columns=_MODEL_COLUMNS, fill_value=0)

    # Feature scaling. MinMaxScaler scales each column independently, so one
    # call over the three columns matches three single-column calls.
    # NOTE(review): the scaler is fit on the incoming data itself, so the
    # output depends on the rows supplied (a single-row frame scales to 0.0 in
    # every scaled column). Confirm this matches how the model was trained.
    frame[_SCALED_COLUMNS] = MinMaxScaler().fit_transform(frame[_SCALED_COLUMNS])
    return frame