The IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR) is the premier annual computer vision event, comprising the main conference and several co-located workshops and short courses. This year's conference will be held at the Music City Center in Nashville, TN, from June 11 to 15, 2025.
UCF researchers and their collaborators have 14 papers accepted to CVPR 2025.
The h5-index is the h-index for articles published in the last five complete years. According to Google Scholar Metrics, CVPR ranks 1st by h5-index in the Engineering & Computer Science subcategory and 2nd in the overall top-venues ranking.
You can access the Aii Publications Page and the CRCV Publications Page for enhanced search capabilities.
Liang, Xin; Rawat, Yogesh
DIFFER: Disentangling Identity Features via Semantic Cues for Clothes-Changing Person Re-ID Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Liang2025,
title = {DIFFER: Disentangling Identity Features via Semantic Cues for Clothes-Changing Person Re-ID},
author = {Xin Liang and Yogesh Rawat},
url = {https://cvpr.thecvf.com/virtual/2025/poster/33555
https://openreview.net/pdf?id=tI3ZlbEhOM
https://openreview.net/attachment?id=tI3ZlbEhOM&name=supplementary_material},
year = {2025},
date = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {In this work, we focus on clothes-changing person re-identification (CC-ReID), which aims to recognize individuals under different clothing scenarios. Current CC-ReID approaches either concentrate on modeling body shape using additional modalities including silhouette, pose, and body mesh, potentially causing the model to overlook other critical biometric traits such as gender, age, and style, or they incorporate supervision through additional labels that the model tries to disregard or emphasize, such as clothing or personal attributes. However, these annotations are discrete in nature and do not capture comprehensive descriptions. In this work, we propose DIFFER: Disentangle Identity Features From Entangled Representations, a novel adversarial learning method that leverages textual descriptions to disentangle identity features. Recognizing that image features inherently mix inseparable information, DIFFER introduces NBDetach, a mechanism that utilizes the separable nature of text descriptions as disentanglement supervision to partition the feature space into distinct subspaces, enabling the effective separation of identity-related features from non-biometric features through gradient reversal. We evaluate DIFFER on 4 different benchmark datasets (LTCC, PRCC, CelebReID-Light, and CCVID) to demonstrate its effectiveness and provide state-of-the-art performance across all the benchmarks. DIFFER consistently outperforms the baseline method, with improvements in top-1 accuracy of 3.6% on LTCC, 3.4% on PRCC, 2.5% on CelebReID-Light, and 1% on CCVID. The code will be made publicly available.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
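The DIFFER abstract above mentions separating identity-related features from non-biometric features through gradient reversal. As background only, and not the authors' DIFFER/NBDetach code, a minimal PyTorch gradient-reversal layer of the kind such adversarial disentanglement typically builds on could look like the following; the names are illustrative:

import torch

class GradReverse(torch.autograd.Function):
    # Identity map in the forward pass; scales gradients by -lambd in the backward pass.
    @staticmethod
    def forward(ctx, x, lambd=1.0):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # The reversed gradient flows back into the shared feature extractor.
        return -ctx.lambd * grad_output, None

def grad_reverse(features, lambd=1.0):
    # Place between a shared encoder and an auxiliary (non-biometric) prediction head.
    return GradReverse.apply(features, lambd)

Training the auxiliary head through such a layer pushes the shared features to become uninformative about the non-biometric attributes, which is the general mechanism the abstract refers to.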
Azad, Shehreen; Vineet, Vibhav; Rawat, Yogesh
HierarQ: Task-Aware Hierarchical Q-Former for Enhanced Video Understanding Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Azad2025,
title = {HierarQ: Task-Aware Hierarchical Q-Former for Enhanced Video Understanding},
author = {Shehreen Azad and Vibhav Vineet and Yogesh Rawat},
url = {https://arxiv.org/abs/2503.08585
},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Despite advancements in multimodal large language models (MLLMs), current approaches struggle in medium-to-long video understanding due to frame and context length limitations. As a result, these models often depend on frame sampling, which risks missing key information over time and lacks task-specific relevance. To address these challenges, we introduce HierarQ, a task-aware hierarchical Q-Former based framework that sequentially processes frames to bypass the need for frame sampling, while avoiding LLM's context length limitations. We introduce a lightweight two-stream language-guided feature modulator to incorporate task awareness in video understanding, with the entity stream capturing frame-level object information within a short context and the scene stream identifying their broader interactions over a longer period of time. Each stream is supported by dedicated memory banks, which enable our proposed Hierarchical Querying transformer (HierarQ) to effectively capture short- and long-term context. Extensive evaluations on 10 video benchmarks across video understanding, question answering, and captioning tasks demonstrate HierarQ's state-of-the-art performance across most datasets, proving its robustness and efficiency for comprehensive video analysis.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Garg, Aaryan; Kumar, Akash; Rawat, Yogesh
STPro: Spatial and Temporal Progressive Learning for Weakly Supervised Spatio-Temporal Grounding Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Garg2025,
title = {STPro: Spatial and Temporal Progressive Learning for Weakly Supervised Spatio-Temporal Grounding},
author = {Aaryan Garg and Akash Kumar and Yogesh Rawat},
url = {https://aaryangrg.github.io/research/stpro},
year = {2025},
date = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Vayani, Ashmal; Dissanayake, Dinura; Watawana, Hasindri; Ahsan, Noor; Sasikumar, Nevasini; Thawakar, Omkar; Ademtew, Henok Biadglign; Hmaiti, Yahya; Kumar, Amandeep; Kuckreja, Kartik; Maslych, Mykola; Ghallabi, Wafa Al; Mihaylov, Mihail; Qin, Chao; Shaker, Abdelrahman M; Zhang, Mike; Ihsani, Mahardika Krisna; Esplana, Amiel; Gokani, Monil; Mirkin, Shachar; Singh, Harsh; Srivastava, Ashay; Hamerlik, Endre; Izzati, Fathinah Asma; Maani, Fadillah Adamsyah; Cavada, Sebastian; Chim, Jenny; Gupta, Rohit; Manjunath, Sanjay; Zhumakhanova, Kamila; Rabevohitra, Feno Heriniaina; Amirudin, Azril; Ridzuan, Muhammad; Kareem, Daniya; More, Ketan; Li, Kunyang; Shakya, Pramesh; Saad, Muhammad; Ghasemaghaei, Amirpouya; Djanibekov, Amirbek; Azizov, Dilshod; Jankovic, Branislava; Bhatia, Naman; Cabrera, Alvaro; Obando-Ceron, Johan; Otieno, Olympiah; Farestam, Fabian; Rabbani, Muztoba; Baliah, Sanoojan; Sanjeev, Santosh; Shtanchaev, Abduragim; Fatima, Maheen; Nguyen, Thao; Kareem, Amrin; Aremu, Toluwani; Xavier, Nathan; Bhatkal, Amit; Toyin, Hawau; Chadha, Aman; Cholakkal, Hisham; Anwer, Rao Muhammad; Felsberg, Michael; Laaksonen, Jorma; Solorio, Thamar; Choudhury, Monojit; Laptev, Ivan; Shah, Mubarak; Khan, Salman; Khan, Fahad
All Languages Matter: Evaluating LMMs on Culturally Diverse 100 Languages Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Vayani2025,
title = {All Languages Matter: Evaluating LMMs on Culturally Diverse 100 Languages},
author = {Ashmal Vayani and Dinura Dissanayake and Hasindri Watawana and Noor Ahsan and Nevasini Sasikumar and Omkar Thawakar and Henok Biadglign Ademtew and Yahya Hmaiti and Amandeep Kumar and Kartik Kuckreja and Mykola Maslych and Wafa Al Ghallabi and Mihail Mihaylov and Chao Qin and Abdelrahman M Shaker and Mike Zhang and Mahardika Krisna Ihsani and Amiel Esplana and Monil Gokani and Shachar Mirkin and Harsh Singh and Ashay Srivastava and Endre Hamerlik and Fathinah Asma Izzati and Fadillah Adamsyah Maani and Sebastian Cavada and Jenny Chim and Rohit Gupta and Sanjay Manjunath and Kamila Zhumakhanova and Feno Heriniaina Rabevohitra and Azril Amirudin and Muhammad Ridzuan and Daniya Kareem and Ketan More and Kunyang Li and Pramesh Shakya and Muhammad Saad and Amirpouya Ghasemaghaei and Amirbek Djanibekov and Dilshod Azizov and Branislava Jankovic and Naman Bhatia and Alvaro Cabrera and Johan Obando-Ceron and Olympiah Otieno and Fabian Farestam and Muztoba Rabbani and Sanoojan Baliah and Santosh Sanjeev and Abduragim Shtanchaev and Maheen Fatima and Thao Nguyen and Amrin Kareem and Toluwani Aremu and Nathan Xavier and Amit Bhatkal and Hawau Toyin and Aman Chadha and Hisham Cholakkal and Rao Muhammad Anwer and Michael Felsberg and Jorma Laaksonen and Thamar Solorio and Monojit Choudhury and Ivan Laptev and Mubarak Shah and Salman Khan and Fahad Khan},
url = {https://arxiv.org/abs/2411.16508
https://mbzuai-oryx.github.io/ALM-Bench/
https://huggingface.co/datasets/MBZUAI/ALM-Bench
https://github.com/mbzuai-oryx/ALM-Bench
https://drive.google.com/file/d/1B2yynH-o1z1UvMCl32XoY3pVIkAxgN0P/view?usp=sharing
https://docs.google.com/presentation/d/10lUrfoC-R1lgl0Dx0EYpx1ID3AqLGnRO/edit?usp=sharing&ouid=103839181464334047856&rtpof=true&sd=true
https://www.youtube.com/watch?v=fvcX4yi5AiY&list=PLtIMlmzwbtty7ie1quj1fq2QoTRK9eWHX&ab_channel=AshmalVayani},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Existing Large Multimodal Models (LMMs) generally focus on only a few regions and languages. As LMMs continue to improve, it is increasingly important to ensure they understand cultural contexts, respect local sensitivities, and support low-resource languages, all while effectively integrating corresponding visual cues. In pursuit of culturally diverse global multimodal models, our proposed All Languages Matter Benchmark (ALM-bench) represents the largest and most comprehensive effort to date for evaluating LMMs across 100 languages. ALM-bench challenges existing models by testing their ability to understand and reason about culturally diverse images paired with text in various languages, including many low-resource languages traditionally underrepresented in LMM research. The benchmark offers a robust and nuanced evaluation framework featuring various question formats, including true/false, multiple choice, and open-ended questions, which are further divided into short and long-answer categories. ALM-bench design ensures a comprehensive assessment of a model's ability to handle varied levels of difficulty in visual and linguistic reasoning. To capture the rich tapestry of global cultures, ALM-bench carefully curates content from 13 distinct cultural aspects, ranging from traditions and rituals to famous personalities and celebrations. Through this, ALM-bench not only provides a rigorous testing ground for state-of-the-art open and closed-source LMMs but also highlights the importance of cultural and linguistic inclusivity, encouraging the development of models that can serve diverse global populations effectively. Our benchmark is publicly available.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Huynh, Chuong; Yang, Jinyu; Tawari, Ashish; Shah, Mubarak; Tran, Son; Hamid, Raffay; Chilimbi, Trishul; Shrivastava, Abhinav
CoLLM: A Large Language Model for Composed Image Retrieval Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Huynh2025b,
title = {CoLLM: A Large Language Model for Composed Image Retrieval},
author = {Chuong Huynh and Jinyu Yang and Ashish Tawari and Mubarak Shah and Son Tran and Raffay Hamid and Trishul Chilimbi and Abhinav Shrivastava},
url = {https://www.amazon.science/publications/collm-a-large-language-model-for-composed-image-retrieval
https://assets.amazon.science/55/e0/7ead8a494dc49fce25cb9947654c/collm-a-large-language-model-for-composed-image-retrieval.pdf
https://collm-cvpr25.github.io/},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Composed Image Retrieval (CIR) is a complex task that aims to retrieve images based on a multimodal query. Typical training data consists of triplets containing a reference image, a textual description of desired modifications, and the target image, which are expensive and time-consuming to acquire. The scarcity of CIR datasets has led to zero-shot approaches utilizing synthetic triplets or leveraging vision-language models (VLMs) with ubiquitous web-crawled image-caption pairs. However, these methods have significant limitations: synthetic triplets suffer from limited scale, lack of diversity, and unnatural modification text, while image-caption pairs hinder joint embedding learning of the multimodal query due to the absence of triplet data. Moreover, existing approaches struggle with complex and nuanced modification texts that demand sophisticated fusion and understanding of vision and language modalities. We present CoLLM, a one-stop framework that effectively addresses these limitations. Our approach generates triplets on-the-fly from image-caption pairs, enabling supervised training without manual annotation. We leverage Large Language Models (LLMs) to generate joint embeddings of reference images and modification texts, facilitating deeper multimodal fusion. Additionally, we introduce Multi-Text CIR (MTCIR), a large-scale dataset comprising 3.4M samples, and refine existing CIR benchmarks (CIRR and Fashion-IQ) to enhance evaluation reliability. Experimental results demonstrate that CoLLM achieves state-of-the-art performance across multiple CIR benchmarks and settings. MTCIR yields competitive results, with up to 15% performance improvement. Our refined benchmarks provide more reliable evaluation metrics for CIR models, contributing to the advancement of this important field. Project page is at collm-cvpr25.github.io.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Hu, Kai; Gao, Feng; Nie, Xiaohan; Zhou, Peng; Tran, Son; Neiman, Tal; Wang, Lingyun; Shah, Mubarak; Hamid, Raffay; Yin, Bing; Chilimbi, Trishul
M-LLM Based Video Frame Selection for Efficient Video Understanding Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Hu2025,
title = {M-LLM Based Video Frame Selection for Efficient Video Understanding},
author = {Kai Hu and Feng Gao and Xiaohan Nie and Peng Zhou and Son Tran and Tal Neiman and Lingyun Wang and Mubarak Shah and Raffay Hamid and Bing Yin and Trishul Chilimbi},
url = {https://www.amazon.science/publications/m-llm-based-video-frame-selection-for-efficient-video-understanding
https://assets.amazon.science/1a/4e/6794e1dd4fb2acb30eda428a5119/m-llm-based-video-frame-selection-for-efficient-video-understanding.pdf
},
year = {2025},
date = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Recent advances in Multi-Modal Large Language Models (M-LLMs) show promising results in video reasoning. Popular M-LLM frameworks usually apply naive uniform sampling to reduce the number of video frames fed into an M-LLM, particularly for long-context videos. However, this can lose crucial context in certain periods of a video, so the downstream M-LLM may not have sufficient visual information to answer a question. To address this pain point, we propose a lightweight M-LLM-based frame selection method that adaptively selects frames that are more relevant to users’ queries. To train the proposed frame selector, we introduce two supervision signals: (i) a spatial signal, where single-frame importance scores are obtained by prompting an M-LLM; and (ii) a temporal signal, where multi-frame selections are obtained by prompting a Large Language Model (LLM) using the captions of all frame candidates. The selected frames are then digested by a frozen downstream video M-LLM for visual reasoning and question answering. Empirical results show that the proposed M-LLM video frame selector improves the performance of various downstream video Large Language Models (video-LLMs) across medium (ActivityNet, NExT-QA) and long (EgoSchema, LongVideoBench) context video question answering benchmarks.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
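As a rough sketch of the general idea described in the abstract above (not the paper's implementation), query-adaptive frame selection can be reduced to scoring candidate frames against a query embedding and keeping the top-k frames in temporal order; the feature shapes and the scoring model are assumptions for illustration:

import torch

def select_frames(frame_features: torch.Tensor, query_feature: torch.Tensor, k: int = 8) -> torch.Tensor:
    # frame_features: (N, D) per-frame embeddings; query_feature: (D,) query embedding.
    scores = frame_features @ query_feature                     # relevance score per frame
    topk = torch.topk(scores, k=min(k, scores.numel())).indices
    return torch.sort(topk).values                              # return indices in temporal order

The selected frames would then be passed to a frozen downstream video-LLM, as the abstract describes.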
Croitoru, Florinel-Alin; Hondru, Vlad; Ionescu, Radu Tudor; Sebe, Nicu; Shah, Mubarak
Curriculum Direct Preference Optimization for Diffusion and Consistency Models Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Croitoru2025,
title = {Curriculum Direct Preference Optimization for Diffusion and Consistency Models},
author = {Florinel-Alin Croitoru and Vlad Hondru and Radu Tudor Ionescu and Nicu Sebe and Mubarak Shah},
url = {https://arxiv.org/abs/2405.13637
https://croitorualin.github.io/cl-dpo/
https://github.com/CroitoruAlin/Curriculum-DPO
},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Direct Preference Optimization (DPO) has been proposed as an effective and efficient alternative to reinforcement learning from human feedback (RLHF). In this paper, we propose a novel and enhanced version of DPO based on curriculum learning for text-to-image generation. Our method is divided into two training stages. First, a ranking of the examples generated for each prompt is obtained by employing a reward model. Then, increasingly difficult pairs of examples are sampled and provided to a text-to-image generative (diffusion or consistency) model. Generated samples that are far apart in the ranking are considered to form easy pairs, while those that are close in the ranking form hard pairs. In other words, we use the rank difference between samples as a measure of difficulty. The sampled pairs are split into batches according to their difficulty levels, which are gradually used to train the generative model. Our approach, Curriculum DPO, is compared against state-of-the-art fine-tuning approaches on nine benchmarks, outperforming the competing methods in terms of text alignment, aesthetics and human preference. Our code is publicly available at https://github.com/CroitoruAlin/Curriculum-DPO.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
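The rank-difference curriculum described in the abstract above can be illustrated with a small sketch (a toy approximation, not the authors' released code): generations for a prompt are ranked by a reward model, pairs with a large rank gap are treated as easy, and training batches are ordered from easy to hard:

from itertools import combinations

def build_curriculum(samples, reward_fn, num_stages=3):
    # samples: generated images for one prompt; reward_fn: returns a scalar reward per sample.
    ranked = sorted(samples, key=reward_fn, reverse=True)        # best first
    pairs = []
    for i, j in combinations(range(len(ranked)), 2):             # i < j, so ranked[i] is preferred
        pairs.append((ranked[i], ranked[j], j - i))              # rank gap as a difficulty proxy
    pairs.sort(key=lambda p: -p[2])                              # large gap (easy pairs) first
    stage_size = max(1, len(pairs) // num_stages)
    return [pairs[s:s + stage_size] for s in range(0, len(pairs), stage_size)]

Each stage of pairs would then drive a round of DPO-style preference fine-tuning, moving from easy to hard.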
Wang, Lan; Ao, Wei; Boddeti, Vishnu; Lim, Sernam
Generative Zero-Shot Composed Image Retrieval Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Wang2025,
title = {Generative Zero-Shot Composed Image Retrieval},
author = {Lan Wang and Wei Ao and Vishnu Boddeti and Sernam Lim},
url = {https://hal.cse.msu.edu/papers/cig-generative-zero-shot-composed-image-retrieval/
https://hal.cse.msu.edu/assets/pdfs/papers/2025-cvpr-cig-generative-zero-shot-composed-image-retrieval.pdf
https://hal.cse.msu.edu/assets/pdfs/papers/2025-cvpr-cig-generative-zero-shot-composed-image-retrieval-supp.pdf
https://lan-lw.github.io/CIG/},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Composed Image Retrieval (CIR) is a vision-language task utilizing queries comprising images and textual descriptions to achieve precise image retrieval. This task seeks to find images that are visually similar to a reference image while incorporating specific changes or features described textually (visual delta). CIR enables a more flexible and user-specific retrieval by bridging visual data with verbal instructions. This paper introduces a novel generative method that augments Composed Image Retrieval by Composed Image Generation (CIG) to provide pseudo-target images. CIG utilizes a textual inversion network to map reference images into semantic word space, which generates pseudo-target images in combination with textual descriptions. These images serve as additional visual information, significantly improving the accuracy and relevance of retrieved images when integrated into existing retrieval frameworks. Experiments conducted across multiple CIR datasets and several baseline methods demonstrate improvements in retrieval performance, which shows the potential of our approach as an effective add-on for existing composed image retrieval.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Liu, Yexin; Liang, Zhengyang; Wang, Yueze; Wu, Xianfeng; Tang, Feilong; He, Muyang; Li, Jian; Liu, Zheng; Yang, Harry; Lim, Ser-Nam; Zhao, Bo
Unveiling the Ignorance of MLLMs: Seeing Clearly, Answering Incorrectly Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{xinLiu2025,
title = {Unveiling the Ignorance of MLLMs: Seeing Clearly, Answering Incorrectly},
author = {Yexin Liu and Zhengyang Liang and Yueze Wang and Xianfeng Wu and Feilong Tang and Muyang He and Jian Li and Zheng Liu and Harry Yang and Ser-Nam Lim and Bo Zhao},
url = {https://arxiv.org/html/2406.10638v2
https://arxiv.org/pdf/2406.10638v2},
year = {2025},
date = {2025-06-11},
abstract = {Multimodal Large Language Models (MLLMs) have displayed remarkable performance in multi-modal tasks, particularly in visual comprehension. However, we reveal that MLLMs often generate incorrect answers even when they understand the visual content. To this end, we manually construct a benchmark with 12 categories and design evaluation metrics that assess the degree of error in MLLM responses even when the visual content is seemingly understood. Based on this benchmark, we test 15 leading MLLMs and analyze the distribution of attention maps and logits of some MLLMs. Our investigation identifies two primary issues: 1) most instruction tuning datasets predominantly feature questions that “directly” relate to the visual content, leading to a bias in MLLMs’ responses to other indirect questions, and 2) MLLMs’ attention to visual tokens is notably lower than to system and question tokens. We further observe that attention scores between questions and visual tokens as well as the model’s confidence in the answers are lower in response to misleading questions than to straightforward ones. To address the first challenge, we introduce a paired positive and negative data construction pipeline to diversify the dataset. For the second challenge, we propose to enhance the model’s focus on visual content during decoding by refining the text and visual prompt. For the text prompt, we propose a content guided refinement strategy that performs preliminary visual content analysis to generate structured information before answering the question. Additionally, we employ a visual attention refinement strategy that highlights question-relevant visual tokens to increase the model’s attention to visual content that aligns with the question. Extensive experiments demonstrate that these challenges can be significantly mitigated with our proposed dataset and techniques. The benchmark, training set, and code will be available.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Ghosal, Soumya Suvra; Chakraborty, Souradip; Singh, Vaibhav; Guan, Tianrui; Wang, Mengdi; Beirami, Ahmad; Huang, Furong; Velasquez, Alvaro; Manocha, Dinesh; Bedi, Amrit Singh
IMMUNE: Improving Safety Against Jailbreaks in Multi-modal LLMs via Inference-Time Alignment Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Ghosal2025,
title = {IMMUNE: Improving Safety Against Jailbreaks in Multi-modal LLMs via Inference-Time Alignment},
author = {Soumya Suvra Ghosal and Souradip Chakraborty and Vaibhav Singh and Tianrui Guan and Mengdi Wang and Ahmad Beirami and Furong Huang and Alvaro Velasquez and Dinesh Manocha and Amrit Singh Bedi},
url = {https://itsvaibhav01.github.io/immune-web/
https://arxiv.org/pdf/2411.18688
https://arxiv.org/abs/2411.18688
https://github.com/itsvaibhav01/Immune},
year = {2025},
date = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {With the widespread deployment of Multimodal Large Language Models (MLLMs) for visual-reasoning tasks, improving their safety has become crucial. Recent research indicates that despite training-time safety alignment, these models remain vulnerable to jailbreak attacks. In this work, we first highlight an important safety gap: alignment achieved solely through safety training may be insufficient against jailbreak attacks. To address this vulnerability, we propose Immune, an inference-time defense framework that leverages a safe reward model through controlled decoding to defend against jailbreak attacks. Additionally, we provide a mathematical characterization of Immune, offering insights on why it improves safety against jailbreaks. Extensive evaluations on diverse jailbreak benchmarks using recent MLLMs reveal that Immune effectively enhances model safety while preserving the model's original capabilities. For instance, against text-based jailbreak attacks on LLaVA-1.6, Immune reduces the attack success rate by 57.82% and 16.78% compared to the base MLLM and state-of-the-art defense strategy, respectively.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Chen, Chen; Liu, Daochang; Shah, Mubarak; Xu, Chang
Enhancing Privacy-Utility Trade-offs to Mitigate Memorization in Diffusion Models Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Chen2025b,
title = {Enhancing Privacy-Utility Trade-offs to Mitigate Memorization in Diffusion Models},
author = {Chen Chen and Daochang Liu and Mubarak Shah and Chang Xu},
url = {https://cvpr.thecvf.com/virtual/2025/poster/34842
https://chenchen-usyd.github.io/PRSS-Project-Page/},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Text-to-image diffusion models have demonstrated remarkable capabilities in creating images highly aligned with user prompts, yet their proclivity for memorizing training set images has sparked concerns about the originality of the generated images and privacy issues, potentially leading to legal complications for both model owners and users, particularly when the memorized images contain proprietary content. Although methods to mitigate these issues have been suggested, enhancing privacy often results in a significant decrease in the utility of the outputs, as indicated by text-alignment scores. To bridge the research gap, we introduce a novel method, PRSS, which refines the classifier-free guidance approach in diffusion models by integrating prompt re-anchoring (PR) to improve privacy and incorporating semantic prompt search (SS) to enhance utility. Extensive experiments across various privacy levels demonstrate that our approach consistently improves the privacy-utility trade-off, establishing a new state-of-the-art.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
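For context on the PRSS abstract above: the method refines classifier-free guidance, whose standard form is shown below as background only (the prompt re-anchoring and semantic prompt search components from the paper are not reproduced here):

import torch

def cfg_noise_prediction(eps_cond: torch.Tensor, eps_uncond: torch.Tensor, guidance_scale: float = 7.5) -> torch.Tensor:
    # Combine conditional and unconditional noise predictions at one denoising step.
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)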
Deng, Andong; Chen, Tongjia; Yu, Shoubin; Yang, Taojiannan; Spencer, Lincoln; Tian, Yapeng; Mian, Ajmal; Bansal, Mohit; Chen, Chen
Motion-Grounded Video Reasoning: Understanding and Perceiving Motion at Pixel Level Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Deng2025,
title = {Motion-Grounded Video Reasoning: Understanding and Perceiving Motion at Pixel Level},
author = {Andong Deng and Tongjia Chen and Shoubin Yu and Taojiannan Yang and Lincoln Spencer and Yapeng Tian and Ajmal Mian and Mohit Bansal and Chen Chen},
url = {https://groundmore.github.io/
https://arxiv.org/abs/2411.09921
https://github.com/dengandong/GroundMoRe
https://huggingface.co/datasets/groundmore/GroundMoRe},
year = {2025},
date = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {In this paper, we introduce Motion-Grounded Video Reasoning, a new motion understanding task that requires generating spatiotemporal segmentation masks according to the input question, and hence needs implicit spatiotemporal reasoning and grounding. This task extends existing spatiotemporal grounding work that focuses on explicit action/motion recognition, to a more general format by enabling implicit motion reasoning via questions. To facilitate the development of advanced motion-grounding models on such a task, we collect a large-scale dataset called GroundMoRe, which comprises 1,715 video clips, 249K object masks that are deliberately designed with 4 types (Causal, Sequential, Counterfactual, and Descriptive) for benchmarking deep and comprehensive motion understanding abilities. Our GroundMoRe uniquely requires models to generate visual answers (spatiotemporal masks), providing a more concrete and visually interpretable response than plain text. It evaluates models on spatiotemporal grounding and reasoning, helping address complex challenges in video reasoning, temporal perception, and pixel-level understanding. To further facilitate the proposed task, we propose a baseline model, Motion-Grounded Video Reasoning Assistant (MoRA). MoRA incorporates the multimodal reasoning ability from Multimodal LLM and the pixel-level perception capability from the grounding model (SAM) as well as an additional temporal localization head. MoRA achieves respectable performance on GroundMoRe outperforming the best existing visual grounding baseline model by an average of 28.8% relatively, but there still remains substantial room for interesting future improvements by the community. We hope this novel and challenging task will pave the way for future advancements in robust and general motion understanding via video reasoning segmentation.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Ren, Li; Chen, Chen; Wang, Liqiang; Hua, Kien A.
DA-VPT: Semantic-Guided Visual Prompt Tuning for Vision Transformers Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Ren2025,
title = {DA-VPT: Semantic-Guided Visual Prompt Tuning for Vision Transformers},
author = {Li Ren and Chen Chen and Liqiang Wang and Kien A. Hua},
url = {https://cvpr.thecvf.com/virtual/2025/poster/33078},
year = {2025},
date = {2025-06-11},
abstract = {Visual Prompt Tuning (VPT) has become a promising Parameter-Efficient Fine-Tuning (PEFT) approach for Vision Transformer (ViT) models, partially fine-tuning learnable tokens while keeping most model parameters frozen. Recent research has explored modifying the connection structures of the prompts. However, the fundamental correlation and distribution between the prompts and image tokens remain unexplored. In this paper, we leverage metric learning techniques to investigate how the distribution of prompts affects fine-tuning performance. Specifically, we propose a novel framework, Distribution-Aware Visual Prompt Tuning (DA-VPT), to guide the distributions of the prompts by learning the distance metric from their class-related semantic data. Our method demonstrates that the prompts can serve as an effective bridge to share semantic information between image patches and the class token. We extensively evaluated our approach on popular benchmarks in both recognition and segmentation tasks. The results demonstrate that our approach enables more effective and efficient fine-tuning of ViT models by leveraging semantic information to guide the learning of the prompts, leading to improved performance on various downstream vision tasks. The code will be released.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Deng, Andong; Gao, Zhongpai; Choudhuri, Anwesa; Planche, Benjamin; Zheng, Meng; Wang, Bin; Chen, Terrence; Chen, Chen; Wu, Ziyan
Seq2Time: Sequential Knowledge Transfer for Video LLM Temporal Grounding Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
@conference{Deng2025b,
title = {Seq2Time: Sequential Knowledge Transfer for Video LLM Temporal Grounding},
author = {Andong Deng and Zhongpai Gao and Anwesa Choudhuri and Benjamin Planche and Meng Zheng and Bin Wang and Terrence Chen and Chen Chen and Ziyan Wu},
url = {https://arxiv.org/abs/2411.16932},
year = {2025},
date = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Temporal awareness is essential for video large language models (LLMs) to understand and reason about events within long videos, enabling applications like dense video captioning and temporal video grounding in a unified system. However, the scarcity of long videos with detailed captions and precise temporal annotations limits their temporal awareness. In this paper, we propose Seq2Time, a data-oriented training paradigm that leverages sequences of images and short video clips to enhance temporal awareness in long videos. By converting sequence positions into temporal annotations, we transform large-scale image and clip captioning datasets into sequences that mimic the temporal structure of long videos, enabling self-supervised training with abundant time-sensitive data. To enable sequence-to-time knowledge transfer, we introduce a novel time representation that unifies positional information across image sequences, clip sequences, and long videos. Experiments demonstrate the effectiveness of our method, achieving a 27.6% improvement in F1 score and 44.8% in CIDEr on the YouCook2 benchmark and a 14.7% increase in recall on the Charades-STA benchmark compared to the baseline.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}