-
Notifications
You must be signed in to change notification settings - Fork 5
/
get_iu_xray.py
84 lines (67 loc) · 2.9 KB
/
get_iu_xray.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import xml.etree.ElementTree as ET
# Local directories that receive the IU X-Ray PNG images and XML reports.
images_path = './IU-XRay/images'
reports_path = './IU-XRay/reports'

# Create each directory on its own, so that an already-existing images
# directory cannot prevent the reports directory from being created. Only the
# "already exists" case is tolerated; any other OS error (e.g. permissions)
# propagates instead of being silently swallowed.
_some_path_existed = False
for _directory in (images_path, reports_path):
    try:
        os.makedirs(_directory)
    except FileExistsError:
        _some_path_existed = True
if _some_path_existed:
    print("path already exists")

# Shell commands, executed in order. They rely on `wget`, `tar`, `mv` and `rm`
# being available on the PATH.
_commands = [
    # download PNG images
    "wget -P {}/ https://openi.nlm.nih.gov/imgs/collections/NLMCXR_png.tgz".format(images_path),
    # download reports
    "wget -P {}/ https://openi.nlm.nih.gov/imgs/collections/NLMCXR_reports.tgz".format(reports_path),
    # unzip both archives in place
    "tar -xzf {}/NLMCXR_png.tgz -C {}/".format(images_path, images_path),
    "tar -xzf {}/NLMCXR_reports.tgz -C {}/".format(reports_path, reports_path),
    # move the XML reports out of the extracted sub-directory, then drop it
    "mv {}/ecgen-radiology/*.xml {}/".format(reports_path, reports_path),
    "rm -rf {}/ecgen-radiology".format(reports_path),
    # remove the downloaded archives
    "rm {}/NLMCXR_png.tgz".format(images_path),
    "rm {}/NLMCXR_reports.tgz".format(reports_path),
]
for _command in _commands:
    _status = os.system(_command)
    # Keep going on failure (best effort, as before), but make it visible
    # instead of ignoring the exit status.
    if _status != 0:
        print("command failed with status {}: {}".format(_status, _command))
# Parse every report found in `reports_path` and pair each of its images with
# a caption built from the report's IMPRESSION and FINDINGS sections.
reports = sorted(os.listdir(reports_path))

# Reports that were skipped or only partially usable, grouped by reason.
reports_with_no_image = []
reports_with_empty_sections = []
reports_with_no_impression = []
reports_with_no_findings = []

# "<image id>.png" -> caption
images_captions = {}
# report file name -> list of "<image id>.png"
reports_with_images = {}
# report file name -> caption
text_of_reports = {}

for report in reports:
    tree = ET.parse(os.path.join(reports_path, report))
    root = tree.getroot()

    # find the images of the report; if there aren't any, ignore the report
    images = root.findall("parentImage")
    if not images:
        reports_with_no_image.append(report)
        continue

    # Reset for every report: otherwise a report lacking one of the labelled
    # sections would silently reuse the text of the previous report (or raise
    # NameError if it happened to be the first one processed).
    findings = None
    impression = None

    # A path lookup yields None when any intermediate element is missing,
    # instead of failing with AttributeError on a chained .find().
    abstract = root.find("MedlineCitation/Article/Abstract")
    sections = abstract.findall("AbstractText") if abstract is not None else []

    # find impression and findings sections
    for section in sections:
        label = section.get("Label")
        if label == "FINDINGS":
            findings = section.text
        elif label == "IMPRESSION":
            impression = section.text

    if impression is None and findings is None:
        reports_with_empty_sections.append(report)
        continue

    if impression is None:
        reports_with_no_impression.append(report)
        caption = findings
    elif findings is None:
        reports_with_no_findings.append(report)
        caption = impression
    else:
        caption = impression + " " + findings

    # every image of the report shares the report's caption
    img_ids = [image.get("id") + ".png" for image in images]
    for img_id in img_ids:
        images_captions[img_id] = caption

    reports_with_images[report] = img_ids
    text_of_reports[report] = caption
# Summarise how many reports fell into each skipped/partial category, followed
# by the total number of image-caption pairs that were collected.
for _bucket, _description in (
    (reports_with_no_image, "reports with no associated image"),
    (reports_with_empty_sections, "reports with empty Impression and Findings sections"),
    (reports_with_no_impression, "reports with no Impression section"),
    (reports_with_no_findings, "reports with no Findings section"),
):
    print("Found", len(_bucket), _description)
print("Collected", len(images_captions), "image-caption pairs")