Showing 15 changed files with 267 additions and 42 deletions.
**`README.md`**

<div align="center">
<p align="center">
<img src="assets/img/web_teaser.png" width=500px/>
</p>
<h1 align="center">
Affective Visual Dialog: A Large-Scale Benchmark for Emotional Reasoning Based on Visually Grounded Conversations
</h1>

[![arXiv](https://img.shields.io/badge/📚%20arXiv-grey?logoColor=white&logoWidth=20)](https://arxiv.org/abs/2308.16349)
[![Download (coming soon)](https://img.shields.io/badge/📦%20Download-grey?logoColor=white&logoWidth=20)](#)
[![Website](https://img.shields.io/badge/🌐%20Website-green?logoColor=white&logoWidth=20)](https://affective-visual-dialog.github.io/)

</div>

## 📰 News

- **30/08/2023**: The preprint of our paper is now available on [arXiv](https://arxiv.org/abs/2308.16349).

## Summary

- [📰 News](#-news)
- [Summary](#summary)
- [📚 Introduction](#-introduction)
- [📊 Baselines](#-baselines)
- [Citation](#citation)
- [References](#references)

<br>
## 📚 Introduction

AffectVisDial is a large-scale dataset of 50K visually grounded, 10-turn dialogs, each paired with concluding emotion attributions and dialog-informed textual emotion explanations.
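
As a rough illustration, each record pairs an image with emotion-annotated captions and a question-answer dialog. The field names below follow the data-preparation script (`data_prep_for_expl_gen.py`) in this commit; the values are invented:

```python
# Illustrative only: field names follow data_prep_for_expl_gen.py in this
# commit; the values are invented.
example_record = {
    "img_src": "example.jpg",
    "caption1": "a man stands alone in the rain",   # two captions for the image
    "caption2": "a dark street at night",
    "emotion1": "sadness",                          # emotions paired with the captions
    "emotion2": "fear",
    "conversation": [                               # 10 Q/A turns in the full data
        "what is the man doing ?",
        "he is waiting for a bus",
    ],
    "emotion_after": "sadness",                     # questioner's concluding emotion
    "explanation_after": "the man looks lonely and the weather is gloomy",
    "answerer_emotion": "contentment",              # answerer's concluding emotion
    "answerer_explanation": "the rain makes the scene feel calm",
}
```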

<br>

## 📊 Baselines

We provide baseline models for the explanation generation task:
- [GenLM](./baselines/GenLM/): BERT- and BART-based models [3, 4]
- [NLX-GPT](./baselines/nlx): NLX-GPT-based model [1]

<br>

## Citation

If you use our dataset, please cite the following reference:

```bibtex
@article{haydarov2023affective,
  title={Affective Visual Dialog: A Large-Scale Benchmark for Emotional Reasoning Based on Visually Grounded Conversations},
  author={Haydarov, Kilichbek and Shen, Xiaoqian and Madasu, Avinash and Salem, Mahmoud and Li, Li-Jia and Elsayed, Gamaleldin and Elhoseiny, Mohamed},
  journal={arXiv preprint arXiv:2308.16349},
  year={2023}
}
```

<br>
## References
1. _[Sammani et al., 2022]_ - NLX-GPT: A Model for Natural Language Explanations in Vision and Vision-Language Tasks
2. _[Li et al., 2022]_ - BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
3. _[Lewis et al., 2019]_ - BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
4. _[Devlin et al., 2018]_ - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding

---

**`baselines/GenLM/README.md`**

Installation:

1. `conda create -n emodialog`
2. `conda activate emodialog`
3. `pip install -r requirements.txt`

Data preparation:

1. We use the BLIP caption generator to caption the images and use the generated captions as the image input for emotion classification and explanation generation (see `cap_gen.py` below).

2. Once the captions are generated, we prepare the data for training sequence-to-sequence models such as BART and T5.

3. The scripts assume the following directory structure (the pickle names match what `data_prep_for_expl_gen.py` reads):

   ```
   data/
   data/images/          # images
   data/train_data.pkl   # training data
   data/val_data.pkl     # dev data
   data/test_data.pkl    # test data
   ```

4. Now run the data preparation: `bash data_prep.sh`

5. A set of CSV files (for all results in Table 4 of the paper) is saved in the `data/` folder; a quick sanity check is sketched after this list:
   a. `_ques_aft_expl_gen_emo_gen_emo1_emo2_cap1_cap2_conv_ft_gen_cap` --> uses (Image, Emotion, Caption, Dialog)
   b. `_ques_aft_expl_gen_emo_gen_emo1_emo2_cap1_cap2_ft_gen_cap` --> uses (Image, Emotion, Caption)
   c. `_ques_aft_expl_gen_emo_gen_emo1_emo2_conv_ft_gen_cap` --> uses (Image, Emotion, Dialog)
   d. `_ques_aft_expl_gen_emo_gen_cap1_cap2_conv` --> uses (Caption, Dialog)
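
The sketch below (an illustration, not part of the repository) loads one input split and one generated CSV to verify the expected structure; file paths and column names follow `data_prep_for_expl_gen.py`, shown later in this commit:

```python
import pickle as pkl
import pandas as pd

# Input split: a list of dicts with fields such as 'caption1', 'caption2',
# 'emotion1', 'emotion2', 'conversation', 'img_src'.
with open('data/train_data.pkl', 'rb') as f:
    data = pkl.load(f)
print(len(data), sorted(data[0].keys()))

# Output CSV: column 'caption' holds the model input (ending in "I feel"),
# column 'response' holds "<emotion> because <explanation>".
df = pd.read_csv('data/train_ques_aft_expl_gen_emo_gen_cap1_cap2_conv.csv')
print(df.columns.tolist())
print(df.loc[0, 'caption'][-80:])
```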

Running scripts:

1. To train the explanation generation model for the questioner, run: `python ques_aft_emo_expla_gen_train.py`

2. To evaluate it, run: `python ques_aft_emo_expla_gen_eval.py`

3. To train the explanation generation model for the answerer, run: `python ans_aft_emo_expla_gen_train.py`

4. To evaluate it, run: `python ans_aft_emo_expla_gen_eval.py`
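
The four train/eval scripts themselves are not shown in this commit. As orientation only, here is a minimal sketch of what BART fine-tuning on one prepared CSV could look like with Hugging Face `transformers`; the checkpoint name, sequence lengths, and hyperparameters are assumptions, not the repository's actual settings:

```python
# Minimal sketch, not the repository's training script: fine-tune BART on a
# prepared caption -> response CSV. Checkpoint, lengths, and hyperparameters
# are assumptions.
import pandas as pd
from torch.utils.data import Dataset
from transformers import (BartForConditionalGeneration, BartTokenizerFast,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments,
                          default_data_collator)


class ExplanationDataset(Dataset):
    """Pairs of (context ending in 'I feel', '<emotion> because <explanation>')."""

    def __init__(self, csv_path, tokenizer, max_src_len=512, max_tgt_len=96):
        df = pd.read_csv(csv_path)
        self.sources = df['caption'].tolist()
        self.targets = df['response'].tolist()
        self.tok = tokenizer
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len

    def __len__(self):
        return len(self.sources)

    def __getitem__(self, idx):
        src = self.tok(self.sources[idx], truncation=True, padding='max_length',
                       max_length=self.max_src_len, return_tensors='pt')
        tgt = self.tok(self.targets[idx], truncation=True, padding='max_length',
                       max_length=self.max_tgt_len, return_tensors='pt')
        labels = tgt['input_ids'].squeeze(0)
        labels[labels == self.tok.pad_token_id] = -100  # ignore padding in the loss
        return {'input_ids': src['input_ids'].squeeze(0),
                'attention_mask': src['attention_mask'].squeeze(0),
                'labels': labels}


tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

task = '_ques_aft_expl_gen_emo_gen_emo1_emo2_cap1_cap2_conv_ft_gen_cap'
train_ds = ExplanationDataset('data/train' + task + '.csv', tokenizer)
val_ds = ExplanationDataset('data/val' + task + '.csv', tokenizer)

args = Seq2SeqTrainingArguments(output_dir='checkpoints', num_train_epochs=3,
                                per_device_train_batch_size=8,
                                evaluation_strategy='epoch',
                                save_strategy='epoch', logging_steps=100)

Seq2SeqTrainer(model=model, args=args, train_dataset=train_ds,
               eval_dataset=val_ds, data_collator=default_data_collator,
               tokenizer=tokenizer).train()
```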

---

**`cap_gen.py`**

```python
import os
import pickle as pkl

import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

image_data = 'data/images/'

# Load the BLIP captioning model from LAVIS (COCO-finetuned base variant).
model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="base_coco", is_eval=True, device=device)

# Generate one caption per image in data/images/.
image_to_caption = {}
for img in os.listdir(image_data):
    image_path = os.path.join(image_data, img)
    raw_image = Image.open(image_path).convert("RGB")
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    gen_caption = model.generate({"image": image})
    image_to_caption[img] = gen_caption[0]

# Cache the captions for the data-preparation step.
with open('data/blip_gen_captions.pkl', 'wb') as f:
    pkl.dump(image_to_caption, f)
```

---

**`data_prep.sh`**

```bash
#!/bin/bash
python cap_gen.py
python data_prep_for_expl_gen.py
```

---

**`data_prep_for_expl_gen.py`**

```python
#!/usr/bin/env python
# coding: utf-8
import pickle as pkl
import pandas as pd

# Captions produced by cap_gen.py.
with open('data/blip_gen_captions.pkl', 'rb') as f:
    gen_captions = pkl.load(f)


def prepare_data_for_answerer(data, typ, task):
    data_cap = []
    data_res = []
    for d in data:
        # Assemble the model input from the fields named in the task string.
        sen = ""
        if 'emo1_emo2' in task:
            sen += '<emotion> ' + d['emotion1'] + ' <emotion> ' + d['emotion2']
        sen += ' <caption> ' + d['caption1'] + ' <caption> ' + d['caption2']
        if task.endswith('gen_cap'):  # variants that add the BLIP-generated caption
            sen += ' <caption> ' + gen_captions[d['img_src']] + " "
        if 'conv' in task:
            sen += " <conversation> "
            for s in d['conversation']:
                sen += s + " "
        data_cap.append(str(sen) + "I feel ")

        # Target: "<emotion> because <explanation>"; empty or 'something else'
        # emotions are mapped to neutral.
        if d['answerer_emotion'].lower() == 'something else' or d['answerer_emotion'] == '':
            res_sen = " neutral " + " because "
        else:
            res_sen = d['answerer_emotion'].lower() + " because "

        # Prefer the manually corrected explanation when available.
        # 'expl == expl' is a NaN check (NaN != NaN for missing pandas values).
        if 'corrected_answerer_explanation' in d:
            expl = d['corrected_answerer_explanation']
        else:
            expl = d['answerer_explanation']
        if expl == expl and expl != 'NA':
            data_res.append(res_sen + " " + str(expl))
        else:
            data_res.append(res_sen)

    df = pd.DataFrame({'caption': data_cap, 'response': data_res})
    df.to_csv('data/' + typ + task + '.csv', header=True, index=False)


def prepare_data_for_questioner(data, typ, task):
    data_cap = []
    data_res = []
    for d in data:
        sen = ""
        if 'emo1_emo2' in task:
            sen += '<emotion> ' + d['emotion1'] + ' <emotion> ' + d['emotion2']
        sen += ' <caption> ' + d['caption1'] + ' <caption> ' + d['caption2']
        if 'gen_cap' in task:
            sen += ' <caption> ' + gen_captions[d['img_src']] + " "
        if 'conv' in task:
            sen += " <conversation> "
            for s in d['conversation']:
                sen += s + " "
        data_cap.append(str(sen) + "I feel ")

        if d['emotion_after'].lower() == 'something else':
            res_sen = " neutral " + " because "
        else:
            res_sen = d['emotion_after'].lower() + " because "

        # Use the post-dialog explanation when valid, falling back to the
        # pre-dialog one; 'x == x' is a NaN check.
        if 'corrected_explanation_after' in d:
            after = d['corrected_explanation_after']
            before = d['corrected_explanation_before']
        else:
            after = d['explanation_after']
            before = d['explanation_before']
        if after == after and after != 'NA':
            data_res.append(res_sen + " " + str(after))
        elif before == before and before != 'NA':
            data_res.append(res_sen + " " + str(before))
        else:
            data_res.append(res_sen)

    df = pd.DataFrame({'caption': data_cap, 'response': data_res})
    df.to_csv('data/' + typ + task + '.csv', header=True, index=False)


for splits in ['train', 'val', 'test']:
    with open('data/' + splits + '_data.pkl', 'rb') as f:
        data = pkl.load(f)

    questioner_tasks = ['_ques_aft_expl_gen_emo_gen_emo1_emo2_cap1_cap2_conv_ft_gen_cap',
                        '_ques_aft_expl_gen_emo_gen_emo1_emo2_cap1_cap2_ft_gen_cap',
                        '_ques_aft_expl_gen_emo_gen_emo1_emo2_conv_ft_gen_cap',
                        '_ques_aft_expl_gen_emo_gen_cap1_cap2_conv_ft_gen_cap',
                        '_ques_aft_emo_cap1_cap2_conv_gen_cap',
                        '_ques_aft_emo_cap1_cap2_conv',
                        '_ques_aft_expl_gen_emo_gen_cap1_cap2_conv']
    for task in questioner_tasks:
        prepare_data_for_questioner(data, splits, task)

    answerer_tasks = ['_ans_aft_expl_gen_emo_gen_emo1_emo2_cap1_cap2_gen_cap',
                      '_ans_aft_expl_gen_emo_gen_emo1_emo2_cap1_cap2']
    for task in answerer_tasks:
        prepare_data_for_answerer(data, splits, task)
```
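
For reference, a (caption, response) pair produced by this script looks roughly as follows (values invented; the spacing and `<...>` separators follow the string construction above):

```text
caption:  <emotion> sadness <emotion> fear <caption> a man stands alone in the rain <caption> a dark street at night <conversation> what is the man doing ? he is waiting for a bus ... I feel 
response: sadness because  the man looks lonely and the weather is gloomy
```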

---

**`requirements.txt`**

```text
torch==1.8.0
salesforce-lavis==1.0.2
Pillow==10.0.0
scikit-learn==1.3.0
transformers==4.18.0
evaluate==0.4.0
nltk==3.5.0
git+https://github.com/neulab/BARTScore.git
```