forked from eyra/fertility-prediction-challenge
-
Notifications
You must be signed in to change notification settings - Fork 1
/
submission.py
144 lines (114 loc) · 7.4 KB
/
submission.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
This is an example script to generate the outcome variable given the input dataset.
This script should be modified to prepare your own submission that predicts
the outcome for the benchmark challenge by changing the clean_df and predict_outcomes function.
The predict_outcomes function takes a Pandas data frame. The return value must
be a data frame with two columns: nomem_encr and prediction. The nomem_encr column
should contain the nomem_encr column from the input data frame. The prediction
column should contain the predicted outcome for each nomem_encr. The prediction
should be 0 (no child) or 1 (having a child).
clean_df should be used to clean (preprocess) the data.
run.py can be used to test your submission.
"""
# List your libraries and modules here. Don't forget to update environment.yml!
from training import clean_df
import pandas as pd
import numpy as np
import joblib
# Columns handed to the model, in the order they are selected. The first entry
# is the respondent identifier; every other entry is median-imputed.
_MODEL_VARS = [
    "nomem_encr", 'gender_bg', 'age_bg',
    'partnership_status', 'domestic_situation', 'lenght_partnership',
    'satisf_partnership',
    'age_of_female', 'age_of_male', 'hh_net_income', 'fertility_intentions',
    'parity', 'high_edu_level', 'child_soon', 'n_children_in_hh',
    'fert_int_index_5y', 'hh_income_sd_5y',
    'stability_hh_5y', 'personal_income_2020', 'dutch', 'non_wstrn_1gen',
    'non_wstrn_2gen', 'wstrn_1gen', 'wstrn_2gen', 'sted', 'cost-free',
    'rental', 'self-owned', 'weight', 'height', 'bmi', 'len_partnership',
    'satis_partnership', 'irregular_work', 'ends_meet', 'religion_they_1.0',
    'religion_they_2.0', 'religion_they_3.0', 'religion_they_4.0',
    'religion_they_5.0', 'religion_they_6.0', 'religion_they_7.0',
    'religion_they_8.0', 'religion_they_9.0', 'religion_they_10.0',
    'religion_they_11.0', 'religion_they_12.0', 'religion_they_13.0',
    'religion_they_14.0', 'religious', 'religion_you_1.0',
    'religion_you_2.0', 'religion_you_3.0', 'religion_you_4.0',
    'religion_you_5.0', 'religion_you_6.0', 'religion_you_7.0',
    'religion_you_8.0', 'religion_you_9.0', 'religion_you_10.0',
    'religion_you_11.0', 'religion_you_12.0', 'religion_you_13.0',
    'religion_you_14.0', 'freq_see_father', 'freq_see_mother', 'life_satis',
    'satis_relationship', 'satis_family_life', 'satis_house',
    'satis_financial', 'satis_contacts', 'perc_health', 'long_disease',
    'hinder', 'nettocat_clean',
]

# Subset of _MODEL_VARS that is cast to int and then to pandas "category".
_CATEGORICAL_VARS = [
    "gender_bg", "partnership_status", "domestic_situation",
    "satisf_partnership", "fertility_intentions", "high_edu_level",
    "n_children_in_hh",
]


def data_prepartion(X_var, outcome=None):
    """Build the model input frame (and optionally the target) from ``X_var``.

    Steps, in order:

    1. Keep only the model columns; any model column absent from ``X_var``
       is created and filled with 0.
    2. Replace missing values in every column except ``nomem_encr`` with that
       column's median.
    3. Cast the categorical columns to ``int`` and then to ``category``.

    The input frame is never modified; all work happens on a copy.

    Parameters:
    X_var (pd.DataFrame): Frame holding (a subset of) the model columns.
    outcome (pd.DataFrame, optional): Frame with ``nomem_encr`` and
        ``new_child`` columns. When given, a target frame is returned too.

    Returns:
    pd.DataFrame: ``X`` when ``outcome`` is not a DataFrame.
    tuple: ``(X, y)`` when ``outcome`` is a DataFrame; ``y`` has the single
        integer column ``new_child`` and keeps the row order and index of
        ``outcome``.

    Raises:
    ValueError: from the ``int`` cast if a categorical column still contains
        NaN after imputation (its median was NaN), or if ``new_child``
        contains NaN.
    """
    # Copy only the columns we need so the caller's frame is left untouched.
    present = [col for col in _MODEL_VARS if col in X_var.columns]
    X = X_var[present].copy()

    # Add missing columns and fill them with zeros.
    for col in _MODEL_VARS:
        if col not in X.columns:
            X[col] = 0

    # Fix the column order expected downstream (this yields a new frame).
    X = X[_MODEL_VARS]

    # Median imputation for everything but the identifier. fillna with a
    # Series fills column-by-column and skips columns not in the Series.
    medians = X[_MODEL_VARS[1:]].median()
    X = X.fillna(medians)

    # X = X.dropna(how='any')
    X = X.astype(dict.fromkeys(_CATEGORICAL_VARS, "int"))
    X = X.astype(dict.fromkeys(_CATEGORICAL_VARS, "category"))

    if isinstance(outcome, pd.DataFrame):
        # NOTE(review): y follows the row order of `outcome`, and X is not
        # filtered to the ids found in `outcome`, so X and y are only aligned
        # row-by-row if the caller guarantees that - confirm in training.py.
        in_features = outcome["nomem_encr"].isin(X["nomem_encr"])
        y = outcome.loc[in_features, ["new_child"]].astype("int")
        return X, y

    return X
def predict_outcomes(df, background_df=None, model_path="model.joblib"):
    """Produce one prediction per respondent with the stored model.

    The raw frame is passed through ``clean_df`` and ``data_prepartion``,
    the model stored at ``model_path`` is applied to every resulting column
    except ``nomem_encr``, and the identifiers are returned next to the
    model's output. The challenge expects each prediction to be 0 (no child
    during 2021-2023) or 1 (had a child).

    If ``nomem_encr`` is absent from ``df`` a message is printed and
    processing continues.

    Parameters:
    df (pd.DataFrame): The input dataframe for which predictions are to be made.
    background_df (pd.DataFrame): The background dataframe, forwarded to ``clean_df``.
    model_path (str): The path to the saved model file (which is the output of training.py).

    Returns:
    pd.DataFrame: Two columns, ``nomem_encr`` and ``prediction``.
    """
    id_col = "nomem_encr"
    if not (id_col in df.columns):
        print("The identifier variable 'nomem_encr' should be in the dataset")

    # Restore the fitted estimator from disk.
    estimator = joblib.load(model_path)

    # Clean the fake / holdout data, then shape it into the model's columns.
    cleaned = clean_df(df, background_df)
    features = data_prepartion(cleaned)

    # The identifier is not a model feature, so it is dropped before predicting.
    model_input = features.drop(columns="nomem_encr")
    predicted = estimator.predict(model_input)

    # Pair every identifier with its prediction; nothing else is returned.
    result = pd.DataFrame(
        {"nomem_encr": features["nomem_encr"], "prediction": predicted}
    )
    return result