-
Notifications
You must be signed in to change notification settings - Fork 1
/
cluster_extraction.py
152 lines (122 loc) · 5.65 KB
/
cluster_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import csv
import sumy
import re
from nltk import ngrams
from nltk.corpus import stopwords
def group_by_clusters():
    '''Group courses by cluster and write the per-cluster listing files.

    Reads the tab-separated file ``single_clustered_courses.txt`` and builds,
    for every cluster, the list of course names and the list of
    stop-word-filtered description tokens.  It then writes:

    * ``./clusters/all.txt`` and ``./clusters/<cluster>.txt`` -- the cluster
      name followed by its tab-indented, sorted course names;
    * ``./tokenized_phrases/<cluster>.txt`` -- the ``str()`` of the set of all
      1-, 2- and 3-grams found in the cluster's descriptions.

    The output directories are not created here; they must already exist.

    Returns:
        dict mapping cluster name -> list of course names (file order).
    '''
    clusters = {}
    cluster_descriptions = {}
    # Column layout: Course Dept, Course #, Course Name, Prerequisites, Units,
    # Tools, Description, Course Cluster, Foundational, Applied, Meta
    with open('single_clustered_courses.txt', 'r', newline='') as readfile:
        # Build the stop-word lookup once; evaluating stopwords.words() for
        # every single token repeats the same work and scans a list each time.
        stop_words = set(stopwords.words('english'))
        for col in csv.reader(readfile, delimiter='\t'):
            # Blank or truncated rows have no cluster column to read.
            if len(col) < 8:
                continue
            cluster = col[7].replace("/", " & ")  # "/" cannot be used in a file name
            # Rows whose cluster cell is the literal text "nan" are ignored
            # (presumably a stringified missing value -- TODO confirm).
            if cluster == "nan":
                continue
            course_name = col[0] + " " + col[1]
            description = [word for word in col[6].split() if word not in stop_words]
            clusters.setdefault(cluster, []).append(course_name)
            cluster_descriptions.setdefault(cluster, []).append(description)

    # Write the course listing and the tokenized descriptions of each cluster.
    with open('./clusters/all' + ".txt", "w") as writeall:
        for cluster_course in sorted(clusters.keys()):
            with open('./clusters/' + cluster_course + ".txt", "w") as writefile:
                writefile.write(cluster_course + "\n")
                writeall.write(cluster_course + "\n")
                for course in sorted(clusters[cluster_course]):
                    writefile.write("\t" + course + "\n")
                    writeall.write("\t" + course + "\n")
            with open("./tokenized_phrases/" + cluster_course + ".txt", "w") as write_description_file:
                all_tokenized_phrases = []
                for description in sorted(cluster_descriptions[cluster_course]):
                    for size in (1, 2, 3):
                        all_tokenized_phrases.extend(ngrams(description, size))
                # NOTE: the element order of a set's str() is not stable
                # between runs; the format is kept as-is for existing readers.
                write_description_file.write(str(set(all_tokenized_phrases)))
    return clusters
def extract_prereq_info():
    '''Extract prerequisite information from ``courses.tsv``.

    Column 3 of every row holds a comma-separated list of prerequisite
    strings; the course itself is named ``col[0] + " " + col[1]``.

    After reading, any prerequisite key that contains another key as a
    substring (e.g. ``"CS 61B or equivalent"`` vs ``"CS 61B"``) is merged
    into the shorter key and then removed.

    Returns:
        dict mapping prerequisite string -> list of course names that list it.
    '''
    courses = {}
    with open('courses.tsv', 'r', newline='') as csvfile:
        for col in csv.reader(csvfile, delimiter='\t'):
            # Blank or truncated rows have no prerequisite column.
            if len(col) < 4:
                continue
            course_name = col[0] + " " + col[1]
            for raw_prereq in col[3].split(","):
                # "A, B" must yield "B", not " B".
                prereq = raw_prereq.strip()
                # An empty key would be a substring of every other key and the
                # merge pass below would fold the whole table into it.
                if not prereq:
                    continue
                followers = courses.setdefault(prereq, [])
                if course_name not in followers:
                    followers.append(course_name)

    # Remove duplicates: fold every longer key into the shorter key it contains.
    # Only the value lists are mutated while iterating, never the key set.
    remove_courses = set()
    for prereq in courses:
        for prereq_string in courses:
            if prereq_string != prereq and prereq in prereq_string:
                for ele in courses[prereq_string]:
                    if ele not in courses[prereq]:
                        courses[prereq].append(ele)
                remove_courses.add(prereq_string)
    for course in remove_courses:
        courses.pop(course, None)
    return courses
# Build the lookup tables: cluster -> courses and prerequisite -> follow-ups.
CLUSTERS_TO_COURSES = group_by_clusters()
PREREQS_TO_COURSES = extract_prereq_info()

# Invert PREREQS_TO_COURSES so each course maps to the prerequisites naming it.
COURSES_TO_PREREQS = {}
for prereq, follow_ups in PREREQS_TO_COURSES.items():
    for follow_up in follow_ups:
        COURSES_TO_PREREQS.setdefault(follow_up, []).append(prereq)
def is_prereq(course_name):
    '''Return the PREREQS_TO_COURSES key that refers to ``course_name``.

    The course is reduced to an abbreviation made of the text inside the
    first pair of parentheses plus the last whitespace-separated token
    (``"Computer Science (COMPSCI) 61A"`` -> ``"COMPSCI 61A"``).  A name
    without parentheses is used as its own abbreviation.

    The abbreviation has to appear in the key as a whole token sequence, so
    ``"COMPSCI 1"`` does not match the key ``"COMPSCI 10"``.

    Returns:
        The first matching key, or ``""`` when no key matches (also for an
        empty or blank ``course_name``).
    '''
    tokens = course_name.split()
    if not tokens:
        return ""
    if "(" in course_name:
        prereq_abbr = course_name.split("(")[1].split(")")[0] + " " + tokens[-1]
    else:
        prereq_abbr = " ".join(tokens)
    # Plain substring matching would treat "COMPSCI 1" as part of "COMPSCI 10";
    # require a non-word character (or the string edge) on both sides.
    pattern = re.compile(r"(?<!\w)" + re.escape(prereq_abbr) + r"(?!\w)")
    for prereq in PREREQS_TO_COURSES:
        if pattern.search(prereq):
            return prereq
    return ""
def get_follow_up_courses(course_lst, depth=0):
    '''Render ``course_lst`` and everything that follows from it as an outline.

    Each course is written on its own line, indented by ``depth`` tab
    characters.  When ``is_prereq`` finds a PREREQS_TO_COURSES key for the
    course, the courses stored under that key are rendered directly below
    it, one level deeper.

    Args:
        course_lst: iterable of course names to render at this level.
        depth: indentation level (number of leading tabs) for this level.

    Returns:
        The outline as a single string; ``""`` for an empty ``course_lst``.

    Note:
        There is no cycle detection: circular prerequisite data recurses
        until Python's recursion limit is reached.
    '''
    indent = "\t" * depth
    pieces = []
    for course_name in course_lst:
        pieces.append(indent + course_name + "\n")
        # Look the key up once and reuse it for the recursive expansion.
        prereq_key = is_prereq(course_name)
        if prereq_key:
            pieces.append(get_follow_up_courses(PREREQS_TO_COURSES[prereq_key], depth + 1))
    return "".join(pieces)
def has_prereq(course_name):
    '''Return the prerequisite list recorded for ``course_name``.

    COURSES_TO_PREREQS is keyed by full course names, so the name is looked
    up directly first.  If that fails, the course is matched by abbreviation
    (text inside the first parentheses plus the last token, e.g.
    ``"COMPSCI 61A"``) so that differently spelled department names still
    resolve to the same course.

    Returns:
        The stored list of prerequisite strings (as stored, including a
        ``"None"`` placeholder entry if that is what the table holds), or
        ``[]`` when the course has no entry.
    '''
    def _abbreviate(name):
        # "Computer Science (COMPSCI) 61A" -> "COMPSCI 61A"
        parts = name.split()
        if not parts:
            return ""
        if "(" in name:
            return name.split("(")[1].split(")")[0] + " " + parts[-1]
        return " ".join(parts)

    if course_name in COURSES_TO_PREREQS:
        return COURSES_TO_PREREQS[course_name]
    prereq_abbr = _abbreviate(course_name)
    if not prereq_abbr:
        return []
    for course, prereqs in COURSES_TO_PREREQS.items():
        if _abbreviate(course) == prereq_abbr:
            return prereqs
    return []
def get_prereq_courses(course_name):
    '''Return the prerequisites of ``course_name`` formatted for the path files.

    Starts from the sorted direct prerequisites in COURSES_TO_PREREQS.  Each
    prerequisite string that occurs inside a COURSES_TO_PREREQS key pulls in
    that course's own prerequisites as well, repeatedly, so the result also
    covers indirect prerequisites.  Every course is expanded at most once,
    which keeps circular data from looping forever.

    Returns:
        ``""`` when the course has no entry or its only prerequisite is the
        ``"None"`` placeholder; otherwise ``str(list_of_prereqs) + "\\n\\t"``.
    '''
    if course_name not in COURSES_TO_PREREQS:
        return ""
    prereqs = sorted(COURSES_TO_PREREQS[course_name])
    expanded = {course_name}
    position = 0
    # Worklist over a growing list: entries appended below are visited too.
    while position < len(prereqs):
        prereq = prereqs[position]
        position += 1
        # A blank string is contained in every key, and "None" is only a
        # placeholder, so neither may trigger an expansion.
        if not prereq.strip() or prereq == "None":
            continue
        for course, course_prereqs in COURSES_TO_PREREQS.items():
            if prereq in course and course not in expanded:
                expanded.add(course)
                for inherited in sorted(course_prereqs):
                    if inherited != "None" and inherited not in prereqs:
                        prereqs.append(inherited)
    return "" if prereqs == ["None"] else str(prereqs) + "\n\t"
# Write the prerequisite / follow-up outline of every cluster: one file per
# cluster under ./cluster_paths/ plus the combined ./cluster_paths/all.txt
# (the directory must already exist).
with open("./cluster_paths/all.txt", "w") as write_all_paths:
    for cluster in CLUSTERS_TO_COURSES:
        write_all_paths.write("\n" + cluster + "\n\n")
        sections = []
        for course in sorted(CLUSTERS_TO_COURSES[cluster]):
            # Compute the prerequisite text once; it is both written and printed.
            prereq_text = get_prereq_courses(course)
            sections.append(prereq_text + get_follow_up_courses([course]))
            print(prereq_text)
        cluster_write_string = "".join(sections)
        with open("./cluster_paths/" + cluster + ".txt", "w") as write_paths:
            write_paths.write(cluster_write_string)
        write_all_paths.write(cluster_write_string)