-
Notifications
You must be signed in to change notification settings - Fork 0
/
string_matching.py
158 lines (138 loc) · 4.97 KB
/
string_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import argparse
import pandas as pd
from outlines_quiz import json_load_results, get_imagepaths
# Default cutoff for the ID edit distance: used as the default `id_d_cutoff`
# in get_matches and parse_llm_pipe, where rows with id_distance <= cutoff
# are treated as candidate matches.
D_CUTOFF = 3
def levenshteinDistance(s1, s2):
    """Return the case-insensitive Levenshtein distance between two strings.

    Characters are compared after ``str.casefold()``, so "A" and "a" count
    as equal. Only the previous and the current row of the edit-distance
    table are kept, and the shorter string is laid out along the row.

    Adapted from https://stackoverflow.com/a/32558749/10119867
    """
    # Put the shorter string along the row so each row stays small.
    if len(s1) <= len(s2):
        short, long_ = s1, s2
    else:
        short, long_ = s2, s1

    # Row 0: cost of turning the empty prefix into each prefix of `short`.
    previous = list(range(len(short) + 1))
    for row_number, long_char in enumerate(long_, start=1):
        current = [row_number]
        for col, short_char in enumerate(short):
            # Case-insensitive comparison, following
            # https://stackoverflow.com/a/31599276/10119867
            if short_char.casefold() != long_char.casefold():
                # Cheapest of substitution, deletion and insertion, plus one.
                cheapest = min(previous[col], previous[col + 1], current[-1])
                current.append(cheapest + 1)
            else:
                # Matching characters carry the diagonal value over unchanged.
                current.append(previous[col])
        previous = current
    return previous[-1]
def get_llm_ids_and_fullnames(results_filename, pages=(1, 3), folder="imgs/q11"):
    """Collect the LLM-extracted university ID and full name per page image.

    Args:
        results_filename: Path handed to ``json_load_results``. The loaded
            object is indexed as ``results[filename][0][field]``.
        pages: Page numbers whose images are selected. Defaults to ``(1, 3)``.
        folder: Folder handed to ``get_imagepaths``. Defaults to
            ``"imgs/q11"``.

    Returns:
        A DataFrame with the columns ``filename``, ``llm_id`` and
        ``llm_fullname``, one row per distinct image path.

    Raises:
        KeyError: Presumably when a selected filename, or one of the two
            fields, is missing from the results (plain indexing is used).
    """
    results = json_load_results(results_filename)

    # Use a non-capturing alternation instead of a character class: inside
    # "[...]" the "|" is a literal character, so "[1|3]" would also accept
    # "page-|" and could never express a multi-digit page number. The dot
    # before "png" is escaped so that it only matches a literal ".".
    page_alternatives = "|".join(str(page) for page in pages)
    pattern = r"doc-\d+-page-(?:" + page_alternatives + r")-[A-Z0-9]+\.png"
    print(f"Pattern: {pattern}")
    filenames = get_imagepaths(folder, pattern)

    # dict.fromkeys keeps the first occurrence of each path, in order, so
    # every image yields exactly one row.
    rows = [
        {
            "filename": filename,
            "llm_id": results[filename][0]["university_id"],
            "llm_fullname": results[filename][0]["student_full_name"],
        }
        for filename in dict.fromkeys(filenames)
    ]
    return pd.DataFrame(rows, columns=["filename", "llm_id", "llm_fullname"])
def _last_name(full_name):
    """Return the last whitespace-separated token of *full_name*, or ""."""
    # Missing values (None, or NaN for an empty CSV cell) are not strings,
    # and a blank string has no tokens; both yield an empty last name.
    if not isinstance(full_name, str):
        return ""
    tokens = full_name.split()
    return tokens[-1] if tokens else ""


def get_llm_distances(df_llm, doc_info_filename, ids_filename, pages=(1, 3)):
    """Pair every LLM result with every reference student and score the pair.

    Args:
        df_llm: DataFrame with the columns ``filename``, ``llm_id`` and
            ``llm_fullname``.
        doc_info_filename: CSV file with at least the columns ``filename``
            and ``page``.
        ids_filename: CSV file with at least the columns ``student_id`` and
            ``student_full_name``. A ``doc`` column is dropped if present.
        pages: Page numbers kept from the document info. Defaults to
            ``(1, 3)``.

    Returns:
        The cross join of the filtered LLM rows with the reference rows,
        extended by ``id_distance`` and ``lastname_distance`` (both
        Levenshtein distances).
    """
    df_filenames = pd.read_csv(doc_info_filename)
    # TODO: fix filepaths
    df_filenames["filename"] = "imgs/q11/" + df_filenames["filename"]
    df_filenames = df_filenames[df_filenames["page"].isin(pages)]
    df_llm = df_llm.merge(df_filenames, on="filename")

    df_test = pd.read_csv(ids_filename)
    # TODO: doc column should be removed from ids_filename file
    # errors="ignore" keeps this working once that column is gone.
    df_test = df_test.drop(columns=["doc"], errors="ignore")
    df_test = df_llm.merge(df_test, how="cross")

    # compare IDs
    df_test["llm_id"] = df_test["llm_id"].astype(str)
    df_test["student_id"] = df_test["student_id"].astype(str)
    # Iterating the two columns directly avoids a row-wise apply and also
    # works when the cross join is empty.
    df_test["id_distance"] = [
        levenshteinDistance(llm_id, student_id)
        for llm_id, student_id in zip(df_test["llm_id"], df_test["student_id"])
    ]

    # compare last names
    df_test["lastname_distance"] = [
        levenshteinDistance(_last_name(llm_name), _last_name(student_name))
        for llm_name, student_name in zip(
            df_test["llm_fullname"], df_test["student_full_name"]
        )
    ]

    # Both counts are relative to the number of (LLM row, student) pairs.
    n_id_correct = df_test["id_distance"].eq(0).sum()
    n_lastname_correct = df_test["lastname_distance"].eq(0).sum()
    print(f"Number of perfect ID matches: {n_id_correct}/{len(df_test)}")
    print(f"Number of perfect last name matches: {n_lastname_correct}/{len(df_test)}")
    return df_test
def get_matches(df_test, id_d_cutoff=D_CUTOFF):
    """Reduce the scored pairs to one row per (doc, student_id) candidate.

    Args:
        df_test: DataFrame with the columns ``doc``, ``student_id``,
            ``filename``, ``student_full_name``, ``id_distance`` and
            ``lastname_distance``.
        id_d_cutoff: Largest ID distance still accepted as a candidate.

    Returns:
        A DataFrame indexed by ``(doc, student_id)`` holding the first
        ``filename``, ``student_id`` and ``student_full_name`` of each group,
        the minimum ``id_distance`` and ``lastname_distance``, and a boolean
        ``found`` column. Note that ``filename`` is the first one in the
        group, not necessarily the row that produced a minimum distance.
    """
    # Keep pairs with a close ID or an exact last-name match.
    is_candidate = (df_test["id_distance"] <= id_d_cutoff) | (
        df_test["lastname_distance"] == 0
    )
    df_matching = (
        df_test[is_candidate]
        .groupby(["doc", "student_id"])
        .agg(
            {
                "filename": "first",
                "student_id": "first",
                "student_full_name": "first",
                "id_distance": "min",
                "lastname_distance": "min",
            }
        )
    )

    # A student counts as found on an exact ID, or on a close ID combined
    # with an exact last name. Column-wise boolean operations replace the
    # row-wise apply and give a boolean column even for an empty result.
    exact_id = df_matching["id_distance"] == 0
    close_id = df_matching["id_distance"] <= id_d_cutoff
    exact_lastname = df_matching["lastname_distance"] == 0
    df_matching["found"] = exact_id | (close_id & exact_lastname)
    return df_matching
def parse_llm_pipe(
    results_filename,
    doc_info_filename,
    ids_filename,
    store_filename,
    id_d_cutoff=D_CUTOFF,
):
    """Run the matching pipeline end to end and save the result as CSV.

    The steps are: load the LLM IDs and names, score them against the
    reference students, reduce the scores to matches, then write the matches
    to ``store_filename``.

    Args:
        results_filename: Passed to ``get_llm_ids_and_fullnames``.
        doc_info_filename: Passed to ``get_llm_distances``.
        ids_filename: Passed to ``get_llm_distances``.
        store_filename: Destination of the CSV output.
        id_d_cutoff: Passed to ``get_matches``.

    Returns:
        The DataFrame returned by ``get_matches``.
    """
    llm_frame = get_llm_ids_and_fullnames(results_filename)
    scored_pairs = get_llm_distances(llm_frame, doc_info_filename, ids_filename)
    matches = get_matches(scored_pairs, id_d_cutoff)

    # NOTE(review): index=False leaves the frame's index out of the file;
    # confirm nothing needed downstream lives only in the index.
    matches.to_csv(store_filename, index=False)
    return matches
if __name__ == "__main__":
    # One (flag, default, help) entry per option, listed in the order in
    # which the values are handed to parse_llm_pipe.
    cli_options = (
        (
            "--results_fname",
            "tests/output/qwen2-VL-2B-results.json",
            "Filename of the results JSON file",
        ),
        (
            "--doc_info_fname",
            "imgs/q11/doc_info.csv",
            "Filename of the document info CSV file",
        ),
        (
            "--ids_fname",
            "tests/data/test_ids.csv",
            "Filename of the test IDs CSV file",
        ),
        (
            "--store_fname",
            "tests/output/qwen2-VL-2B-matching_results.csv",
            "Filename to store the matching results",
        ),
    )

    parser = argparse.ArgumentParser()
    for flag, default_value, help_text in cli_options:
        parser.add_argument(flag, type=str, default=default_value, help=help_text)
    args = parser.parse_args()

    # Run the pipeline with the default ID cutoff and show the result.
    df_matching = parse_llm_pipe(
        results_filename=args.results_fname,
        doc_info_filename=args.doc_info_fname,
        ids_filename=args.ids_fname,
        store_filename=args.store_fname,
    )
    print(df_matching)