-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_colinearity.py
60 lines (51 loc) · 1.96 KB
/
remove_colinearity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import argparse
import random
# Command-line interface. All four options are parsed at import time because
# parse_args() runs at module level.
parser = argparse.ArgumentParser(
    description='This program takes in the .codedx file generated by code_data.py, removes multicolinearity among features',
)
parser.add_argument(
    '-i', '--incodedx',
    type=str,
    required=True,
    help="input codedx files",
)
parser.add_argument(
    '-o', '--outcodedx',
    type=str,
    required=True,
    help="output codedx files",
)
# Kept as a string here; it is interpreted as a row count or a fraction later.
parser.add_argument(
    '-s', '--sampling',
    type=str,
    required=False,
    help="sampling approach: if integer and >1: number of rows; if float and <=1: fraction; default:1000",
)
parser.add_argument(
    '-m', '--model',
    type=int,
    required=True,
    help="model: 0:VIF, 1:TOP, 2:PCA",
)
args = parser.parse_args()
def calculate_vif(df: pd.DataFrame, thresh: float = 5) -> pd.DataFrame:
    """Drop every column of *df* whose variance inflation factor exceeds *thresh*.

    An intercept column is added with ``add_constant`` before the VIFs are
    computed, and the intercept's own VIF is discarded. The VIFs are computed
    once on the full set of columns; every column above the threshold is then
    removed together (there is no re-computation after each removal).

    The offending features and their VIFs are printed, followed by one
    ``Dropping: <name>`` line per removed column.

    Args:
        df: Feature table. Presumably all columns are numeric, since the raw
            values are passed to ``variance_inflation_factor`` -- TODO confirm
            against callers.
        thresh: Columns with a VIF strictly greater than this value are dropped.

    Returns:
        A new DataFrame without the dropped columns; *df* is not modified.
    """
    design = add_constant(df)
    # Pull the array out once instead of once per column.
    values = design.values
    vif_df = pd.Series(
        [variance_inflation_factor(values, i) for i in range(design.shape[1])],
        index=design.columns,
        name='VIF',
    ).to_frame()
    vif_df = vif_df.sort_values(by='VIF', ascending=False)

    # add_constant() leaves the frame untouched when it already contains a
    # constant column, in which case there is no 'const' row to remove.
    vif_df = vif_df.drop('const', errors='ignore')
    vif_df = vif_df[vif_df['VIF'] > thresh]

    print('Features above VIF threshold:\n')
    print(vif_df)

    col_to_drop = list(vif_df.index)
    for name in col_to_drop:
        print('Dropping: {}'.format(name))
    return df.drop(columns=col_to_drop)
# --- Script body: load the table, draw a random row sample, run the VIF filter ---

# The -i value is a path stem; the ".codedx" extension is appended here.
codedxfile = args.incodedx + ".codedx"
if not os.path.isfile(codedxfile):
    print("Cannot find the .codedx file!")
    # Non-zero status so a calling shell or pipeline can detect the failure.
    raise SystemExit(1)

codedx = pd.read_csv(codedxfile)
n = len(codedx.index)
print(n)

# Default sample size; -s overrides it with either a row count (> 1) or a
# fraction of the rows (0 < s <= 1).
samn = 1000
if args.sampling:
    r = float(args.sampling)
    if r > 1:
        samn = int(r)
    elif r > 0:
        samn = int(n * r)
    else:
        print("samping option should be >0")
        raise SystemExit(1)

# random.sample() raises ValueError when asked for more items than the
# population holds, so cap the request (default or user-supplied) at n rows.
samn = min(samn, n)

idx = range(n)
selidx = random.sample(idx, samn)
seldat = codedx.iloc[selidx, :]

# NOTE(review): the reduced frame returned by calculate_vif() is discarded, and
# neither args.outcodedx nor args.model is read anywhere in the visible code --
# confirm what should be written to the output file.
calculate_vif(seldat)