From f02931c0899d25b15b3bf3e87cbc1f5ed3a59f35 Mon Sep 17 00:00:00 2001
From: Dmitry Ryumin
Date: Thu, 21 Mar 2024 16:56:11 +0300
Subject: [PATCH] Update files

---
 json_data/2024/main/IVMSP.json | 644 +++++++++++++++++++++++++++++++++
 1 file changed, 644 insertions(+)

diff --git a/json_data/2024/main/IVMSP.json b/json_data/2024/main/IVMSP.json
index 89e0ae3..6eec477 100644
--- a/json_data/2024/main/IVMSP.json
+++ b/json_data/2024/main/IVMSP.json
@@ -166,5 +166,649 @@
         "onedrive": null,
         "loom": null,
         "section": "Vision and Language"
+    },
+    {
+        "title": "Human Guided Cross-Modal Reasoning with Semantic Attention Learning for Visual Question Answering",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10448302",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Semanticmapper: Region-Specific Domain Adaptation for 3D Shapes Through Lexical Delineation",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446758",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Self-Distilled Dynamic Fusion Network for Language-based Fashion Retrieval",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10445903",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Implicit-Knowledge-Guided Align Before Understanding for KB-VQA",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10448108",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Imitating the Human Visual System for Scanpath Predicting",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447354",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Read, Spell and Repeat: Scene Text Recognition with Vision-Language Circular Refinement",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446176",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "End-to-End Spatially-Constrained Multi-Perspective Fine-Grained Image Captioning",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10445846",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Improved Image Captioning via Knowledge Graph-Augmented Models",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447637",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Think as People: Context-Driven Multi-Image News Captioning with Adaptive Dual Attention",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446024",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "MGRL: Mutual-Guidance Representation Learning for Text-to-Image Person Retrieval",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447260",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Fine-Grained Features Alignment and Fusion for Text-Video Cross-Modal Retrieval",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446511",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Label Correction for Sketch-based 3d Shape Retrieval",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447927",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Glocal Cascading Network for Topic Enhanced Visual Storytelling",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447361",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "CROCFUN: Cross-Modal Conditional Fusion Network for Pansharpening",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446470",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Domain-Wise Invariant Learning for Panoptic Scene Graph Generation",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447193",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": "2310.05867",
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "CReStyler: Text-Guided Single Image Style Transfer Method based on CNN and Restormer",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446192",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Maskstr: Guide Scene Text Recognition Models with Masking",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446874",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Supplementing Missing Visions via Dialog for Scene Graph Generations",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446239",
+        "github": "L-YeZhu/SI-Dial",
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": "2204.11143",
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "CKT-RCM: Clip-based Knowledge Transfer and Relational Context Mining for Unbiased Panoptic Scene Graph Generation",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446810",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Memory Self-Calibrated Network for Visual Grounding",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447732",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Text-Video Completion Networks with Motion Compensation and Attention Aggregation",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10447901",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "M2SUM: Multi-Granularity Scale-Adaptive Video Summarizer towards Informative Context Representation Learning",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10446527",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
+    },
+    {
+        "title": "Template-Guided Data Augmentation for Unbiased Scene Graph Generation",
+        "base_url": null,
+        "title_page": null,
+        "ieee_id": "10448033",
+        "github": null,
+        "web_page": null,
+        "github_page": null,
+        "colab": null,
+        "modelscope": null,
+        "gitee": null,
+        "gitlab": null,
+        "zenodo": null,
+        "kaggle": null,
+        "demo_page": null,
+        "paper_thecvf": null,
+        "paper_arxiv_id": null,
+        "paper_pdf": null,
+        "paper_hal_science": null,
+        "paper_researchgate": null,
+        "paper_amazon": null,
+        "youtube_id": null,
+        "drive_google": null,
+        "dropbox": null,
+        "onedrive": null,
+        "loom": null,
+        "section": "Vision and Language"
     }
 ]
\ No newline at end of file
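
Every record added by this patch follows the same fixed schema: a "title", an IEEE Xplore document id in "ieee_id", optional "github" (owner/repo) and "paper_arxiv_id" fields, and a set of link slots that default to null. Below is a minimal sketch of how a consumer might resolve those ids into URLs. It assumes the standard IEEE Xplore, arXiv, and GitHub URL schemes; the paper_links helper is illustrative and not part of the repository's tooling.

    import json

    # Hypothetical helper: turn the identifier fields of one record from
    # this patch into clickable URLs. Only non-null fields produce links.
    def paper_links(entry: dict) -> dict:
        links = {}
        if entry.get("ieee_id"):
            # Standard IEEE Xplore document URL scheme.
            links["ieee"] = f"https://ieeexplore.ieee.org/document/{entry['ieee_id']}"
        if entry.get("paper_arxiv_id"):
            # Standard arXiv abstract URL scheme.
            links["arxiv"] = f"https://arxiv.org/abs/{entry['paper_arxiv_id']}"
        if entry.get("github"):
            # The "github" field stores an owner/repo pair.
            links["github"] = f"https://github.com/{entry['github']}"
        return links

    # Path taken from the diff header; run from the repository root.
    with open("json_data/2024/main/IVMSP.json", encoding="utf-8") as f:
        papers = json.load(f)

    for paper in papers:
        if paper["section"] == "Vision and Language":
            print(paper["title"], paper_links(paper))

For the two records above that carry extra identifiers, this yields, e.g., an arXiv link (2310.05867) for "Domain-Wise Invariant Learning for Panoptic Scene Graph Generation" and both a GitHub (L-YeZhu/SI-Dial) and an arXiv link (2204.11143) for "Supplementing Missing Visions via Dialog for Scene Graph Generations".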