# Libraries
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import LinearRegression, LogisticRegression
from category_encoders import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pickle
import psycopg2
host = 'rajje.db.elephantsql.com'
user = 'jukofzsj'
password = 'fARhsJ6hz7j7knj2CWAt30JHSjMypo8o'
database = 'jukofzsj'
connection = psycopg2.connect(
    host=host,
    user=user,
    password=password,
    database=database
)
cur = connection.cursor()
# Fetch the data
cur.execute("SELECT * FROM heart_2020;")
data = cur.fetchall()
# Fetch the column names
cur.execute("SELECT column_name FROM information_schema.columns WHERE table_name='heart_2020';")
columns = [col[0] for col in cur.fetchall()]
# Convert to a DataFrame
df = pd.DataFrame(data, columns=columns)
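# Added note (not in the original): once the rows and column names are in
# memory, the cursor and connection can be closed; nothing below touches the
# database again.
cur.close()
connection.close()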
# Check the class balance of the target.
df['heartdisease'].value_counts(normalize=True)
# Remove duplicate rows.
df.drop_duplicates(keep='first', inplace=True)
# There are no missing values.
df.isna().sum()
df.dropna(inplace=True)
# Remove outliers from the numeric features using the IQR rule.
def outlier_iqr(data):
    q1, q3 = data.quantile([0.25, 0.75])
    iqr = q3 - q1
    return q3 + (iqr * 1.5)

bmi_upper = outlier_iqr(df['bmi'])
ph_upper = outlier_iqr(df['physicalhealth'])
mh_upper = outlier_iqr(df['mentalhealth'])
sl_upper = outlier_iqr(df['sleeptime'])
# Keep only the rows that fall below every upper bound.
df = df[(df['bmi'] < bmi_upper) & (df['physicalhealth'] < ph_upper) &
        (df['mentalhealth'] < mh_upper) & (df['sleeptime'] < sl_upper)]
# Encode some ordinal categorical features as numbers.
# Age
age_encoding = {'65-69':67, '60-64':62, '70-74':72, '55-59':57, '50-54':52, '80 or older':80, '75-79':77,
'45-49':47, '18-24' :20, '40-44':42, '35-39':37, '30-34':32, '25-29':27}
df['agecategory'] = df['agecategory'].replace(age_encoding)
df['agecategory'] = df['agecategory'].astype('float')
# Overall health status
genh_encoding = {'Poor':1, 'Fair':2, 'Good':3, 'Very good':4, 'Excellent':5}
df['genhealth'] = df['genhealth'].replace(genh_encoding)
df['genhealth'] = df['genhealth'].astype('float')
### Modeling
# First, split the data into train, validation, and test sets.
# The target values are strings, so convert them to numeric.
df['heartdisease'] = df['heartdisease'].replace({'No':0, 'Yes':1})
y = df['heartdisease']
X = df.drop('heartdisease', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, stratify=y_train)
# scaling
numeric_feats = X_train.dtypes[X_train.dtypes != "object"].index
scaler = StandardScaler()
X_train[numeric_feats] = scaler.fit_transform(X_train[numeric_feats])
X_val[numeric_feats] = scaler.transform(X_val[numeric_feats])
X_test[numeric_feats] = scaler.transform(X_test[numeric_feats])
# One-Hot encoding
ohe = OneHotEncoder()
X_train_ohe = ohe.fit_transform(X_train)
X_val_ohe = ohe.transform(X_val)
X_test_ohe = ohe.transform(X_test)
#### Logistic Regression
logistic = LogisticRegression(class_weight='balanced')
logistic.fit(X_train_ohe, y_train)
y_val_pred = logistic.predict(X_val_ohe)
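# Added sketch (not in the original): a quick look at how the balanced
# logistic regression does on the validation set. Accuracy alone is
# misleading on an imbalanced target, so recall and F1 are printed as well;
# sklearn.metrics is assumed to be available alongside the imports above.
from sklearn.metrics import accuracy_score, recall_score, f1_score
print('validation accuracy:', accuracy_score(y_val, y_val_pred))
print('validation recall  :', recall_score(y_val, y_val_pred))
print('validation F1      :', f1_score(y_val, y_val_pred))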
#-----------------------
## Pick one new data sample and run it through the trained model
# X_test = [[4000]]
# y_pred = ensemble.predict(X_test)
# print(f'The estimated price of a house with {X_test[0][0]} sqft GrLivArea is ${int(y_pred)}.')
# Example output
# The estimated price of a house with 4000 sqft GrLivArea is $447090.
#-----------
# with open('model.pkl','wb') as pickle_file:
# pickle.dump(logistic, pickle_file)
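# Added sketch (not in the original): reloading the pickled model and scoring
# one new sample. The sample has to pass through the same fitted scaler and
# one-hot encoder as the training data; here an already-encoded row from
# X_test_ohe stands in for new input, and 'model.pkl' is the filename assumed
# by the commented-out dump above.
# with open('model.pkl', 'rb') as pickle_file:
#     loaded_model = pickle.load(pickle_file)
# sample = X_test_ohe.iloc[[0]]
# proba = loaded_model.predict_proba(sample)[0, 1]
# print(f'Predicted probability of heart disease: {proba:.3f}')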