# build_classifier_dataset.py
__author__ = 'Matias'
import json
import logging

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from irmodels.D2Vmodel import D2Vmodel
from textanalysis.texts import CaseReportLibrary
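
# Builds feature matrices for up to 500 labeled case reports: bag-of-words,
# TF-IDF, and word/bigram TF-IDF representations, plus Doc2Vec document
# vectors at several dimensionalities. Each matrix is written to
# classification/data/ alongside a JSON file mapping document ids to labels.
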
class LabeledCases(object):
    """Streams case reports that have a class label in classification/data/class.txt."""

    def __init__(self, return_object=False):
        self.cases = CaseReportLibrary()
        self.labels = {}
        self.return_object = return_object
        # class.txt holds one "<pmcid> <class>" pair per line.
        with open("classification/data/class.txt", 'r') as infile:
            lines = infile.read().split("\n")[:-1]
        for line in lines:
            pmcid, cls = line.split(" ")
            self.labels[pmcid] = cls
    def __iter__(self):
        streamed = 0
        for case in self.cases:
            pmcid = case.get_pmcid()
            if pmcid not in self.labels:
                continue
            if self.return_object:
                yield pmcid, self.labels[pmcid], case.get_text()
            else:
                yield case.get_text()
            streamed += 1
            if streamed % 50 == 0:
                logging.info("Streamed %s labeled case reports", streamed)
            # Stop once every labeled case has been streamed, capped at 500.
            if streamed >= len(self.labels) or streamed >= 500:
                break
    def __len__(self):
        # Note: this is the size of the whole library, not the labeled subset.
        return len(self.cases)

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # fit_transform below both fits and transforms, so the vectorizers need no
    # separate fitting pass over the corpus first.
    vectorizer = CountVectorizer(min_df=1, max_features=5000)
    tfidf_vectorizer = TfidfVectorizer(min_df=1, max_features=5000)
    ngram_vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 2), max_features=15000)

    # Collect the id, class label, and raw text of every labeled case report.
    ids = []
    docs = []
    labels = []
    for pmcid, cls, text in LabeledCases(return_object=True):
        ids.append(pmcid)
        docs.append(text)
        labels.append(cls)
    # Doc2Vec dimensionalities to export; the 40-dimensional "super" variant
    # is trained for more epochs below.
    docvecs_counts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    superdocvecs_counts = [40]
    bow = vectorizer.fit_transform(docs).toarray()
    tfidf = tfidf_vectorizer.fit_transform(docs).toarray()
    ngram = ngram_vectorizer.fit_transform(docs).toarray()

    # Persist the labels as JSON and each dense feature matrix as plain text.
    with open('classification/data/labels_500.json', 'w') as outfile:
        json.dump({'ids': ids, 'labels': labels}, outfile)
    np.savetxt('classification/data/bow_500.txt', bow)
    np.savetxt('classification/data/tfidf_500.txt', tfidf)
    np.savetxt('classification/data/ngram_500.txt', ngram)
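
    # Each D2Vmodel keys one vector per document under the tag
    # 'DOCID-CS<pmcid>'; the loops below assemble those vectors into a
    # (num_documents x dimension) matrix for each model size.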
    for docvecs_count in docvecs_counts:
        d2v = D2Vmodel(
            None,
            name="DOCID",
            dataset_name="CASEREPORT",
            epochs=2,
            dimension=docvecs_count)
        docvecs = np.zeros(shape=(len(ids), docvecs_count))
        for idx, pmcid in enumerate(ids):
            docvecs[idx, :] = d2v.inner_model['DOCID-CS' + str(pmcid)]
        docvecs_filepath = "classification/data/docvecs%s_500.txt" % (docvecs_count,)
        np.savetxt(docvecs_filepath, docvecs)
    # The "super" document vectors come from a longer-trained (6-epoch) model.
    for superdocvecs_count in superdocvecs_counts:
        d2v = D2Vmodel(
            None,
            name="DOCID",
            dataset_name="CASEREPORT",
            epochs=6,
            dimension=superdocvecs_count)
        superdocvecs = np.zeros(shape=(len(ids), superdocvecs_count))
        for idx, pmcid in enumerate(ids):
            superdocvecs[idx, :] = d2v.inner_model['DOCID-CS' + str(pmcid)]
        superdocvecs_filepath = "classification/data/superdocvecs%s_500.txt" % (superdocvecs_count,)
        np.savetxt(superdocvecs_filepath, superdocvecs)