n2c2_ner.py
# -*- coding: utf-8 -*-
"""MIMIC_NER.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1vSCfx6dHw9wtO6qQTw7icJsYSYbYYuBL
"""
import gc
import os
import re

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from torch.utils.data import Dataset
# progress bars
from tqdm import tqdm
# transformers
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          pipeline)

from a2_spanbert import load_csv
from config import MODEL_PATH
from util import save_pickle
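
# NOTE (assumption): stopwords and sent_tokenize rely on the NLTK data
# packages "stopwords" and "punkt"; if they are missing, download them once:
#   import nltk; nltk.download("stopwords"); nltk.download("punkt")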
# NLP preprocessing resources
eng_stopwords = set(stopwords.words('english'))
# matches MIMIC-style de-identification placeholders such as "[** hello **]"
unused_patt = re.compile(r"\[\*\*[^*]+\*\*\]")

def clean_sent(text):
    """Lowercase, strip de-identification placeholders, drop English stopwords."""
    no_patt_text = unused_patt.sub(" ", text.lower())
    clean_toks = [tok for tok in no_patt_text.split()
                  if tok not in eng_stopwords]
    return ' '.join(clean_toks)
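
# Example (illustrative input):
#   clean_sent("Pt admitted on [**2101-4-2**] with fever")
#   -> "pt admitted fever"   (placeholder stripped, stopwords removed, lowercased)
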
def load_ner(use_gpu=True, model_path=MODEL_PATH):
    """Load the token-classification model and wrap it in an HF NER pipeline."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    # device=0 -> first GPU, device=-1 -> CPU (the pipeline default)
    return pipeline("ner", model=model, tokenizer=tokenizer,
                    device=0 if use_gpu else -1)
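
# NOTE (optional, standard transformers API): the "ner" pipeline emits one
# prediction dict per word piece; passing aggregation_strategy="simple" to
# pipeline() would instead merge the pieces into whole entity spans.
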
def stream_raw_sentences(text):
    """Split a cleaned note into sentences to feed to HF's models; along with
    batching + GPU, this should yield a significant speedup."""
    return [sentence[:512] for sentence in sent_tokenize(clean_sent(text))]
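
# NOTE: the [:512] slice above truncates by characters, a rough guard against
# the usual 512-token limit of BERT-style encoders (an assumption about the
# model behind MODEL_PATH); the tokenizer's own truncation is the hard limit.
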
class TextDataset(Dataset):
    def __init__(self, df):
        self.df = df
        # .tolist() so that the positional indices produced by enumerate()
        # stay valid even when the DataFrame index is not 0..n-1
        self.texts = self.df['raw'].tolist()
        self.ids = self.df['sample_id'].tolist()

    def ready(self):
        self.all_sentences = [(i, stream_raw_sentences(text))
                              for i, text in enumerate(self.texts)]
        self.all_sentences_flat = [
            (i, sentence, j)
            for i, sentences in self.all_sentences
            for j, sentence in enumerate(sentences)]
        # sort by sentence length so each batch holds similarly sized inputs
        self.all_sentences_flat.sort(key=lambda x: len(x[1]))
        self.indices_mapping = [(el[0], el[2])
                                for el in self.all_sentences_flat]
        self.row_id_mappings = [(self.ids[i], j)
                                for i, j in self.indices_mapping]
        self.inputs = [el[1] for el in self.all_sentences_flat]

    def __getitem__(self, index):
        return self.inputs[index]

    def __len__(self):
        return len(self.inputs)
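
# Because ready() builds row_id_mappings from the same length-sorted list as
# self.inputs, the two stay index-aligned: prediction i can later be traced
# back to its note via row_id_mappings[i] == (sample_id, sentence index).
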
def fit(ner, dataset):
    """Run NER over every sentence in the dataset and pickle the outputs."""
    dataset.ready()
    dirname = "data/output-n2c2/"
    os.makedirs(dirname, exist_ok=True)
    save_pickle(dataset.row_id_mappings, "id_mappings.pickle", dir=dirname)
    preds = [batch for batch in tqdm(ner(dataset, num_workers=8, batch_size=64),
                                     desc="Predicting N2C2", total=len(dataset))]
    save_pickle(preds, "preds.pickle", dir=dirname)
    gc.collect()
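
# Illustrative: since preds.pickle and id_mappings.pickle are index-aligned
# (one entry per sentence), downstream code can reattach predictions to notes
# with something like:
#   pairs = list(zip(row_id_mappings, preds))  # [((sample_id, j), entities), ...]
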
if __name__ == "__main__":
    data = load_csv()
    dataset = TextDataset(data)
    ner = load_ner(use_gpu=True)
    fit(ner, dataset)