import os
import pandas as pd
import argparse
from datasets import load_dataset
from utils import (
    extract_objects,
    load_llm_pipe,
    get_answers,
)


def flatten_data(df):
    """Expand each (caption, object list) row into parallel flat sequences
    of (caption, object) pairs."""
    caps_flat, objs_flat = [], []
    for cap, objs in zip(df.gt_caption, df.generated_objs):
        for obj in objs:
            caps_flat.append(cap)
            objs_flat.append(obj)
    return tuple(caps_flat), tuple(objs_flat)
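
# Illustrative example (values are hypothetical): for a single row with
# gt_caption "a dog on the beach" and generated_objs ["dog", "ball"],
# flatten_data returns caps_flat = ("a dog on the beach", "a dog on the beach")
# and objs_flat = ("dog", "ball"), so the LLM can be queried once per
# (caption, object) pair; unflatten_responses below reverses this grouping.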


def unflatten_responses(responses_flat, df):
    """Regroup the flat list of LLM responses into one list per caption,
    following the same order produced by flatten_data."""
    responses_unflat = []
    i = 0
    for objs in df.generated_objs:
        cur_responses = []
        for obj in objs:
            cur_responses.append(responses_flat[i])
            i += 1
        responses_unflat.append(cur_responses)
    assert len(responses_unflat) == len(df.generated_objs)
    return responses_unflat


def apply_ignore_words(responses_flat, objs_flat):
    """Mark responses for generic depiction words (e.g. 'painting', 'photo')
    as 'ignore' so they are excluded from the hallucination score."""
    ignore_words = ['painting', 'drawing', 'photo', 'picture', 'portrait', 'photograph']
    for i, obj in enumerate(objs_flat):
        if obj in ignore_words:
            responses_flat[i] = 'ignore'
    return responses_flat


def get_llm_responses(df, llm_pipe):
    """Query the LLM once per (caption, object) pair, dump the raw responses
    to responses.csv, and return them grouped per caption."""
    caps_flat, objs_flat = flatten_data(df)
    responses_flat = get_answers(caps_flat, objs_flat, llm_pipe)
    responses_flat = apply_ignore_words(responses_flat, objs_flat)
    responses = unflatten_responses(responses_flat, df)
    pd.DataFrame({"gt_caption": df.gt_caption,
                  "generated_caption": df.generated_caption,
                  "generated_objs": df.generated_objs,
                  "llm_responses": responses}).to_csv("responses.csv")
    return responses


def get_och_score(llm_responses):
    """Compute the OpenCHAIR score: the fraction of 'no' answers among all
    'yes'/'no' answers ('ignore' responses are not counted)."""
    responses = []
    for resp_per_cap in llm_responses:
        responses.extend(resp_per_cap)
    data = pd.Series(responses).str.lower().str.strip()
    counts = data.value_counts().to_dict()
    return counts['no'] / (counts['yes'] + counts['no'])
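
# Illustrative example (hypothetical responses): with
# llm_responses = [['yes', 'no'], ['yes', 'ignore']], the counts become
# {'yes': 2, 'no': 1, 'ignore': 1}, and the score is 1 / (2 + 1) ≈ 0.33.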


def eval(args):
    print("Loading Dataset\n")
    # och_dataset = load_dataset("moranyanuka/OpenCHAIR", cache_dir=args.cache_dir)['test']
    och_dataset = load_dataset("Hagarsh/OpenCHAIR_verb_exp_2", cache_dir=args.cache_dir)['test']
    df = pd.read_csv(args.generations_file_path)
    df['gt_caption'] = och_dataset['text']
    # Map each word to its mean concreteness rating (Brysbaert et al.).
    word_conc = pd.read_excel(args.concreteness_dataset_path)[['Word', 'Conc.M']].set_index("Word").to_dict()['Conc.M']

    print("\nLoading LLM\n")
    llm_pipe = load_llm_pipe(args)

    print("\nExtracting Generated Objects (Per Image)\n")
    df['generated_objs'] = extract_objects(captions=df.generated_caption.tolist(), conc_df=word_conc, llm_pipe=llm_pipe)

    print("\nGetting LLM Responses\n")
    llm_responses = get_llm_responses(df, llm_pipe)
    OpenCHAIR_score = get_och_score(llm_responses)

    print("\nOpenCHAIR Score: \n")
    print(OpenCHAIR_score)


if __name__ == "__main__":
    """
    Evaluates the hallucination ratio (the OpenCHAIR score) on a given dataset.
    The argument "--generations-file-path" points to the CSV with the captions
    generated by the VLM, i.e., the 'predicted' captions (produced by the
    script 'generate_captions_from_image_objs.py').
    """
    # One-time setup: python -m spacy download en_core_web_sm
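    # Example invocation (illustrative; the values match the defaults below):
    # python 7_1_llm_compare_captions.py \
    #     --generations-file-path ./captions_from_image_verbs.csv \
    #     --llm-ckpt mistralai/Mistral-7B-Instruct-v0.2 --device cuda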
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parser = argparse.ArgumentParser()
    parser.add_argument("--llm-ckpt", type=str, default="mistralai/Mistral-7B-Instruct-v0.2")
    parser.add_argument("--concreteness-dataset-path", type=str,
                        default="Concreteness_ratings_Brysbaert_et_al_BRM.xlsx")
    parser.add_argument("--device", type=str, default='cuda')
    parser.add_argument("--cache-dir", type=str, default=None)
    parser.add_argument("--generations-file-path", type=str, default="./captions_from_image_verbs.csv")
    parser.add_argument("--batch-size", type=int, default=32)
    args = parser.parse_args()
    eval(args)