-
Notifications
You must be signed in to change notification settings - Fork 5
/
dataset.py
79 lines (64 loc) · 2.19 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import numpy as np
import pandas as pd
x = input("Enter filename")
data = pd.read_csv(x)
tissue_col = data.Tissue.tolist()
set_types = list(set(data['Tissue'].tolist()))
tot_cat1 = tissue_col.count(set_types[0])
print("HEllo", tot_cat1)
tot_cat2 = len(tissue_col) - tot_cat1
print("Hello", tot_cat2)
df = data.drop('Tissue', axis=1)
train_list, validation_list, test_list = [], [], []
train_label, validation_label, test_label = [], [], []
count_other, count_lung = 0, 0
for ind,row in df.iterrows():
if data['Tissue'][ind] == 'Other':
if count_other <= 0.6*tot_cat1:
train_list.append(np.array(row))
train_label.append(np.array([0.0, 1.0]))
count_other+=1
elif count_other > 0.6*tot_cat1 and count_other <= 0.8*tot_cat1:
validation_list.append(np.array(row))
validation_label.append(np.array([0.0, 1.0]))
count_other+=1
else:
test_list.append(np.array(row))
test_label.append(np.array([0.0, 1.0]))
else:
if count_lung <= 0.6*tot_cat2:
train_list.append(np.array(row))
train_label.append(np.array([1.0, 0.0]))
count_lung+=1
elif count_lung > 0.6*tot_cat2 and count_lung <= 0.8*tot_cat2:
validation_list.append(np.array(np.array(row)))
validation_label.append(np.array([1.0, 0.0]))
count_lung+=1
else:
test_list.append(np.array(row))
test_label.append(np.array([1.0, 0.0]))
train_list = np.array(train_list).astype(float)
train_label = np.array(train_label).astype(float)
validation_list = np.array(validation_list).astype(float)
validation_label = np.array(validation_label).astype(float)
test_list = np.array(test_list).astype(float)
test_label = np.array(test_label).astype(float)
# print(train_list)
# example = {'field1':[1,2,3,4,5,6,7], 'field2':['a','b','c','d','e','f','g'], 'label':[0,1,0,1,1,0,0]}
# data_frame = pd.DataFrame(data=example)
# data_frame.to_csv('file.csv')
# data = pd.read_csv("OVA_Lung.arff")
# count = 0
# for ind,row in data.iterrows():
# if count <= 5:
# print(row.toList())
# count += 1
# #
# 1 - I need dictionary
# so convert the arff to dict format
# so that row1 values are key and corresponding is column
# 2 - divide data into 3 csv
# numpy array
# train labels, validation, test labels - 6 lists
# else if 0.6, 0.2, 0.2
# #