ICCV is the premier international computer vision event comprising the main conference and several co-located workshops and tutorials.
Members of UCF’s Center for Research in Computer Vision (CRCV), Artificial Intelligence Initiative (Aii) and their collaborators had a record number of 19 papers accepted to the International Conference on Computer Vision (ICCV 2025), which will take place in Honolulu, Hawaii from October 19-23, 2025.
The h5-index is the h-index for articles published in the last 5 complete years. According to Google Scholar Metrics, the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) is ranked 10th in Engineering & Computer Science by h5-index; see https://scholar.google.com/citations?view_op=top_venues&hl=en&vq=eng.
You can access the CRCV Publications Page for enhanced search capabilities.
Li, Ming; Gu, Xin; Chen, Fan; Xing, Xiaoying; Wen, Longyin; Chen, Chen; Zhu, Sijie
SuperEdit: Rectifying and Facilitating Supervision for Instruction-Based Image Editing Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Li2025b,
  title     = {{SuperEdit}: Rectifying and Facilitating Supervision for Instruction-Based Image Editing},
  author    = {Ming Li and Xin Gu and Fan Chen and Xiaoying Xing and Longyin Wen and Chen Chen and Sijie Zhu},
  url       = {https://liming-ai.github.io/SuperEdit/
https://github.com/bytedance/SuperEdit
https://huggingface.co/datasets/limingcv/SuperEdit-40K},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Due to the challenges of manually collecting accurate editing data, existing datasets are typically constructed using various automated methods, leading to noisy supervision signals caused by the mismatch between editing instructions and original-edited image pairs. Recent efforts attempt to improve editing models through generating higher-quality edited images, pre-training on recognition tasks, or introducing vision-language models (VLMs) but fail to resolve this fundamental issue. In this paper, we offer a novel solution by constructing more effective editing instructions for given image pairs. This includes rectifying the editing instructions to better align with the original-edited image pairs and using contrastive editing instructions to further enhance their effectiveness. Specifically, we find that editing models exhibit specific generation attributes at different inference steps, independent of the text. Based on these prior attributes, we define a unified guide for VLMs to rectify editing instructions. However, there are some challenging editing scenarios that cannot be resolved solely with rectified instructions. To this end, we further construct contrastive supervision signals with positive and negative instructions and introduce them into the model training using triplet loss, thereby further facilitating supervision effectiveness. Our method does not require the VLM modules or pre-training tasks used in previous work, offering a more direct and efficient way to provide better supervision signals, and providing a novel, simple, and effective solution for instruction-based image editing. Results on multiple benchmarks demonstrate that our method significantly outperforms existing approaches. Compared with previous SOTA SmartEdit, we achieve 9.19% improvements on the Real-Edit benchmark with 30x less training data and 13x smaller model size.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Pinyoanuntapong, Ekkasit; Saleem, Muhammad Usama; Karunratanakul, Korrawe; Wang, Pu; Xue, Hongfei; Chen, Chen; Guo, Chuan; Cao, Junli; Ren, Jian; Tulyakov, Sergey
Spatio-Temporal Control for Masked Motion Synthesis Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Pinyoanuntapong2025,
  title     = {Spatio-Temporal Control for Masked Motion Synthesis},
  author    = {Ekkasit Pinyoanuntapong and Muhammad Usama Saleem and Korrawe Karunratanakul and Pu Wang and Hongfei Xue and Chen Chen and Chuan Guo and Junli Cao and Jian Ren and Sergey Tulyakov},
  url       = {https://www.ekkasit.com/ControlMM-page/
https://github.com/exitudio/MaskControl/},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Recent advances in motion diffusion models have enabled spatially controllable text-to-motion generation. However, these models struggle to achieve high-precision control while maintaining high-quality motion generation. To address these challenges, we propose MaskControl, the first approach to introduce controllability to the generative masked motion model. Our approach introduces two key innovations. First, Logits Regularizer implicitly perturbs logits at training time to align the distribution of motion tokens with the controlled joint positions, while regularizing the categorical token prediction to ensure high-fidelity generation. Second, Logit Optimization explicitly optimizes the predicted logits during inference time, directly reshaping the token distribution that forces the generated motion to accurately align with the controlled joint positions. Moreover, we introduce Differentiable Expectation Sampling (DES) to combat the non-differential distribution sampling process encountered by logits regularizer and optimization. Extensive experiments demonstrate that MaskControl outperforms state-of-the-art methods, achieving superior motion quality (FID decreases by ~77%) and higher control precision (average error 0.91 vs. 1.08). Additionally, MaskControl enables diverse applications, including any-joint-any-frame control, body-part timeline control, and zero-shot objective control.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Lyu, Zonglin; Chen, Chen
TLB-VFI: Temporal-Aware Latent Brownian Bridge Diffusion for Video Frame Interpolation Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Lyu2025,
  title     = {{TLB-VFI}: Temporal-Aware Latent {Brownian} Bridge Diffusion for Video Frame Interpolation},
  author    = {Zonglin Lyu and Chen Chen},
  url       = {https://arxiv.org/abs/2507.04984},
  year      = {2025},
  date      = {2025-10-19},
  urldate   = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Video Frame Interpolation (VFI) aims to predict the intermediate frame I_n (we use n to denote time in videos to avoid notation overload with the timestep t in diffusion models) based on two consecutive neighboring frames I_0 and I_1. Recent approaches apply diffusion models (both image-based and video-based) in this task and achieve strong performance. However, image-based diffusion models are unable to extract temporal information and are relatively inefficient compared to non-diffusion methods. Video-based diffusion models can extract temporal information, but they are too large in terms of training scale, model size, and inference time. To mitigate the above issues, we propose Temporal-Aware Latent Brownian Bridge Diffusion for Video Frame Interpolation (TLB-VFI), an efficient video-based diffusion model. By extracting rich temporal information from video inputs through our proposed 3D-wavelet gating and temporal-aware autoencoder, our method achieves 20% improvement in FID on the most challenging datasets over recent SOTA of image-based diffusion models. Meanwhile, due to the existence of rich temporal information, our method achieves strong performance while having 3x fewer parameters. Such a parameter reduction results in 2.3x speed up. By incorporating optical flow guidance, our method requires 9000x less training data and achieves over 20x fewer parameters than video-based diffusion models.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Wu, Wenhan; Guo, Zhishuai; Chen, Chen; Xue, Hongfei; Lu, Aidong
Frequency-Semantic Enhanced Variational Autoencoder for Zero-Shot Skeleton-based Action Recognition Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Wu2025,
  title     = {Frequency-Semantic Enhanced Variational Autoencoder for Zero-Shot Skeleton-based Action Recognition},
  author    = {Wenhan Wu and Zhishuai Guo and Chen Chen and Hongfei Xue and Aidong Lu},
  url       = {https://arxiv.org/abs/2506.22179},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Zero-shot skeleton-based action recognition aims to develop models capable of identifying actions beyond the categories encountered during training. Previous approaches have primarily focused on aligning visual and semantic representations but often overlooked the importance of fine-grained action patterns in the semantic space (e.g., the hand movements in drinking water and brushing teeth). To address these limitations, we propose a Frequency-Semantic Enhanced Variational Autoencoder (FS-VAE) to explore the skeleton semantic representation learning with frequency decomposition. FS-VAE consists of three key components: 1) a frequency-based enhancement module with high- and low-frequency adjustments to enrich the skeletal semantics learning and improve the robustness of zero-shot action recognition; 2) a semantic-based action description with multilevel alignment to capture both local details and global correspondence, effectively bridging the semantic gap and compensating for the inherent loss of information in skeleton sequences; 3) a calibrated cross-alignment loss that enables valid skeleton-text pairs to counterbalance ambiguous ones, mitigating discrepancies and ambiguities in skeleton and text features, thereby ensuring robust alignment. Evaluations on the benchmarks demonstrate the effectiveness of our approach, validating that frequency-enhanced semantic features enable robust differentiation of visually and semantically similar action clusters, improving zero-shot action recognition.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Azad, Shehreen; Rawat, Yogesh
DisenQ: Disentangling Q-Former for Activity-Biometrics Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Azad2025b,
  title     = {{DisenQ}: Disentangling {Q-Former} for Activity-Biometrics},
  author    = {Shehreen Azad and Yogesh Rawat},
  url       = {https://chatpaper.com/pt/chatpaper/paper/163039
https://arxiv.org/abs/2507.07262},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {In this work, we address activity-biometrics, which involves identifying individuals across diverse set of activities. Unlike traditional person identification, this setting introduces additional challenges as identity cues become entangled with motion dynamics and appearance variations, making biometrics feature learning more complex. While additional visual data like pose and/or silhouette help, they often struggle from extraction inaccuracies. To overcome this, we propose a multimodal language-guided framework that replaces reliance on additional visual data with structured textual supervision. At its core, we introduce DisenQ (Disentangling Q-Former), a unified querying transformer that disentangles biometrics, motion, and non-biometrics features by leveraging structured language guidance. This ensures identity cues remain independent of appearance and motion variations, preventing misidentifications. We evaluate our approach on three activity-based video benchmarks, achieving state-of-the-art performance. Additionally, we demonstrate strong generalization to complex real-world scenario with competitive performance on a traditional video-based identification benchmark, showing the effectiveness of our framework.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Abdullah, Raiyaan; Claypoole, Jared; Cogswell, Michael; Divakaran, Ajay; Rawat, Yogesh
Punching Bag vs. Punching Person: Motion Transferability in Videos Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Abdullah2025,
  title     = {Punching Bag vs. Punching Person: Motion Transferability in Videos},
  author    = {Raiyaan Abdullah and Jared Claypoole and Michael Cogswell and Ajay Divakaran and Yogesh Rawat},
  url       = {https://iccv.thecvf.com/virtual/2025/poster/935},
  year      = {2025},
  date      = {2025-10-19},
  urldate   = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Action recognition models, both unimodal and multimodal, have demonstrated strong generalization in tasks such as zero-shot learning, base-to-novel transfer, and domain adaptation. However, can they effectively transfer high-level motion concepts across diverse contexts, even within similar distributions? For example, can a model recognize the broad action "Pushing" when presented with unknown variations such as "Pushing something from right to left"? To explore this, we introduce a motion transferability framework with three datasets: (1) Syn-TA, a synthetic dataset with 3D object motions; (2) Kinetics400-TA; and (3) Something-Something-v2-TA, both adapted from natural video datasets. We evaluate 13 state-of-the-art models on these benchmarks and observe a significant drop in performance when recognizing high-level actions in novel contexts. Our analysis reveals: 1) Multimodal models struggle more with fine-grained unknown actions than coarse ones; 2) The bias-free Syn-TA proves as challenging as real-world datasets, with models showing greater performance drops in controlled settings; 3) Larger models improve transferability when spatial cues dominate but struggle with intensive temporal reasoning, while reliance on object and background cues hinders generalization. We further explore how disentangling coarse and fine motions can improve recognition in temporally challenging datasets. Our study establishes a crucial benchmark for assessing motion transferability in action recognition.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Pathak, Priyank; Rawat, Yogesh
Colors See Colors Ignore: Clothes Changing ReID with Color Disentanglement Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Pathak2025b,
  title     = {Colors See Colors Ignore: Clothes Changing {ReID} with Color Disentanglement},
  author    = {Priyank Pathak and Yogesh Rawat},
  url       = {https://arxiv.org/abs/2507.07230},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Clothes-Changing Re-Identification (CC-ReID) aims to recognize individuals across different locations and times, irrespective of clothing. Existing methods often rely on additional models or annotations to learn robust, clothing-invariant features, making them resource-intensive. In contrast, we explore the use of color - specifically foreground and background colors - as a lightweight, annotation-free proxy for mitigating appearance bias in ReID models. We propose Colors See, Colors Ignore (CSCI), an RGB-only method that leverages color information directly from raw images or video frames. CSCI efficiently captures color-related appearance bias ('Color See') while disentangling it from identity-relevant ReID features ('Color Ignore'). To achieve this, we introduce S2A self-attention, a novel self-attention to prevent information leak between color and identity cues within the feature space. Our analysis shows a strong correspondence between learned color embeddings and clothing attributes, validating color as an effective proxy when explicit clothing labels are unavailable. We demonstrate the effectiveness of CSCI on both image and video ReID with extensive experiments on four CC-ReID datasets. We improve the baseline by Top-1 2.9% on LTCC and 5.0% on PRCC for image-based ReID, and 1.0% on CCVID and 2.5% on MeVID for video-based ReID without relying on additional supervision. Our results highlight the potential of color as a cost-effective solution for addressing appearance bias in CC-ReID.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Shatwell, David; Dave, Ishan; Sirnam, Swetha; Shah, Mubarak
GT-Loc: Unifying When and Where in Images through a Joint Embedding Space Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Shatwell2025,
  title     = {{GT-Loc}: Unifying When and Where in Images through a Joint Embedding Space},
  author    = {David Shatwell and Ishan Dave and Swetha Sirnam and Mubarak Shah},
  url       = {https://chatpaper.com/fr/chatpaper/paper/163729
https://arxiv.org/abs/2507.10473},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Timestamp prediction aims to determine when an image was captured using only visual information, supporting applications such as metadata correction, retrieval, and digital forensics. In outdoor scenarios, hourly estimates rely on cues like brightness, hue, and shadow positioning, while seasonal changes and weather inform date estimation. However, these visual cues significantly depend on geographic context, closely linking timestamp prediction to geo-localization. To address this interdependence, we introduce GT-Loc, a novel retrieval-based method that jointly predicts the capture time (hour and month) and geo-location (GPS coordinates) of an image. Our approach employs separate encoders for images, time, and location, aligning their embeddings within a shared high-dimensional feature space. Recognizing the cyclical nature of time, instead of conventional contrastive learning with hard positives and negatives, we propose a temporal metric-learning objective providing soft targets by modeling pairwise time differences over a cyclical toroidal surface. We present new benchmarks demonstrating that our joint optimization surpasses previous time prediction methods, even those using the ground-truth geo-location as an input during inference. Additionally, our approach achieves competitive results on standard geo-localization tasks, and the unified embedding space facilitates compositional and text-based image retrieval.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Kang, Weitai; Huang, Haifeng; Shang, Yuzhang; Shah, Mubarak; Yan, Yan
Robin3D: Improving 3D Large Language Model via Robust Instruction Tuning Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Kang2025,
  title     = {{Robin3D}: Improving {3D} Large Language Model via Robust Instruction Tuning},
  author    = {Weitai Kang and Haifeng Huang and Yuzhang Shang and Mubarak Shah and Yan Yan},
  url       = {https://arxiv.org/abs/2410.00255
https://github.com/WeitaiKang/Robin3D},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Recent advancements in 3D Large Language Models (3DLLMs) have highlighted their potential in building general-purpose agents in the 3D real world, yet challenges remain due to the lack of high-quality robust instruction-following data, leading to limited discriminative power and generalization of 3DLLMs. In this paper, we introduce Robin3D, a powerful 3DLLM trained on large-scale instruction-following data generated by our novel data engine, Robust Instruction Generation (RIG) engine. RIG generates two key instruction data: 1) the Adversarial Instruction-following data, which features mixed negative and positive samples to enhance the model's discriminative understanding. 2) the Diverse Instruction-following data, which contains various instruction styles to enhance model's generalization. As a result, we construct 1 million instruction-following data, consisting of 344K Adversarial samples, 508K Diverse samples, and 165K benchmark training set samples. To better handle these complex instructions, Robin3D first incorporates Relation-Augmented Projector to enhance spatial understanding, and then strengthens the object referring and grounding ability through ID-Feature Bonding. Robin3D consistently outperforms previous methods across five widely-used 3D multimodal learning benchmarks, without the need for task-specific fine-tuning. Notably, we achieve a 7.8% improvement in the grounding task (Multi3DRefer) and a 6.9% improvement in the captioning task (Scan2Cap).},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Thawakar, Omkar; Demidov, Dmitry; Thawkar, Ritesh; Anwer, Rao; Shah, Mubarak; Khan, Fahad; Khan, Salman
Beyond Simple Edits: Composed Video Retrieval with Dense Modifications Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Thawakar2025,
  title     = {Beyond Simple Edits: Composed Video Retrieval with Dense Modifications},
  author    = {Omkar Thawakar and Dmitry Demidov and Ritesh Thawkar and Rao Anwer and Mubarak Shah and Fahad Khan and Salman Khan},
  url       = {https://iccv.thecvf.com/virtual/2025/poster/1966},
  year      = {2025},
  date      = {2025-10-19},
  urldate   = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Composed video retrieval is a challenging task that strives to retrieve a target video based on a query video and a textual description detailing specific modifications. Standard retrieval frameworks typically struggle to handle the complexity of fine-grained compositional queries and variations in temporal understanding limiting their retrieval ability in the fine-grained setting. To address this issue, we introduce a novel dataset that captures both fine-grained and composed actions across diverse video segments, enabling more detailed compositional changes in retrieved video content. The proposed dataset, named Dense-WebVid-CoVR, consists of 1.6 million samples with dense modification text that is around seven times more than its existing counterpart. We further develop a new model that integrates visual and textual information through Cross-Attention (CA) fusion using grounded text encoder, enabling precise alignment between dense query modifications and target videos. The proposed model achieves state-of-the-art results surpassing existing methods on all metrics. Notably, it achieves 71.3% Recall@1 in visual+text setting and outperforms the state-of-the-art by 3.4%, highlighting its efficacy in terms of leveraging detailed video descriptions and dense modification texts. Our proposed dataset, code, and model will be publicly released.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Fan, Xinqi; Chen, Xueli; Yang, Luoxiao; Yap, Chuin Hong; Qureshi, Rizwan; Dou, Qi; Yap, Moi Hoon; Shah, Mubarak
Test-Time Retrieval-Augmented Adaptation for Vision-Language Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Fan2025,
  title     = {Test-Time Retrieval-Augmented Adaptation for Vision-Language Models},
  author    = {Xinqi Fan and Xueli Chen and Luoxiao Yang and Chuin Hong Yap and Rizwan Qureshi and Qi Dou and Moi Hoon Yap and Mubarak Shah},
  url       = {https://iccv.thecvf.com/virtual/2025/poster/2327},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Vision-language models (VLMs) have shown promise in test-time adaptation tasks due to their remarkable capabilities in understanding and reasoning about visual content through natural language descriptions. However, training VLMs typically demands substantial computational resources, and they often struggle to adapt efficiently to new domains or tasks. Additionally, dynamically estimating the test distribution from streaming data at test time remains a significant challenge. In this work, we propose a novel test-time retrieval-augmented adaption (TT-RAA) method that enables VLMs to maintain high performance across diverse visual recognition tasks without the need for task-specific training or large computational overhead. During inference, TT-RAA employs a streaming mixture of Gaussian database (SMGD) to continuously estimate test distributions, requiring minimal storage. Then, TT-RAA retrieves the most relevant information from the SMGD, enhancing the original VLM outputs. A key limitation of CLIP-based VLMs is their inter-modal vision-language optimization, which does not optimize vision-space similarity, leading to larger intra-modal variance. To address this, we propose a multimodal retrieval augmentation module that transforms the SMGD into a unified multimodal space, enabling retrieval that aligns both vision and language modalities. Extensive experiments across both cross-domain and out-of-distribution benchmarks comprising fourteen datasets demonstrate TT-RAA’s superior performance compared to state-of-the-art methods. Ablation studies and hyperparameter analyses further validate the effectiveness of the proposed modules.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Haoxuan; Shang, Yuzhang; Yuan, Zhihang; Wu, Junyi; Yan, Junchi; Yan, Yan
QuEST: Low-bit Diffusion Model Quantization via Efficient Selective Finetuning Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Wang2025b,
  title     = {{QuEST}: Low-bit Diffusion Model Quantization via Efficient Selective Finetuning},
  author    = {Haoxuan Wang and Yuzhang Shang and Zhihang Yuan and Junyi Wu and Junchi Yan and Yan Yan},
  url       = {https://arxiv.org/abs/2402.03666},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {The practical deployment of diffusion models is still hindered by the high memory and computational overhead. Although quantization paves a way for model compression and acceleration, existing methods face challenges in achieving low-bit quantization efficiently. In this paper, we identify imbalanced activation distributions as a primary source of quantization difficulty, and propose to adjust these distributions through weight finetuning to be more quantization-friendly. We provide both theoretical and empirical evidence supporting finetuning as a practical and reliable solution. Building on this approach, we further distinguish two critical types of quantized layers: those responsible for retaining essential temporal information and those particularly sensitive to bit-width reduction. By selectively finetuning these layers under both local and global supervision, we mitigate performance degradation while enhancing quantization efficiency. Our method demonstrates its efficacy across three high-resolution image generation tasks, obtaining state-of-the-art performance across multiple bit-width settings.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Shang, Yuzhang; Cai, Mu; Xu, Bingxin; Lee, Yong Jae; Yan, Yan
LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Shang2025,
  title     = {{LLaVA-PruMerge}: Adaptive Token Reduction for Efficient Large Multimodal Models},
  author    = {Yuzhang Shang and Mu Cai and Bingxin Xu and Yong Jae Lee and Yan Yan},
  url       = {https://llava-prumerge.github.io/
https://arxiv.org/abs/2403.15388
https://github.com/42Shawn/LLaVA-PruMerge},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Large Multimodal Models (LMMs) have shown significant reasoning capabilities by connecting a visual encoder and a large language model. LMMs typically use a fixed amount of visual tokens, such as the penultimate layer features in the CLIP visual encoder, as the prefix content. Recent LMMs incorporate more complex visual inputs, such as high-resolution images and videos, which increase the number of visual tokens significantly. However, due to the design of the Transformer architecture, computational costs associated with these models tend to increase quadratically with the number of input tokens. To tackle this problem, we explore a token reduction mechanism and find, similar to prior work, that many visual tokens are spatially redundant. Based on this, we propose PruMerge, a novel adaptive visual token reduction approach, which largely reduces the number of visual tokens while maintaining comparable model performance. We first select the unpruned visual tokens based on their similarity to class tokens and spatial tokens. We then cluster the pruned tokens based on key similarity and merge the clustered tokens with the unpruned tokens to supplement their information. Empirically, when applied to LLaVA-1.5, our approach can compress the visual tokens by 18 times on average (14 times on MME/TextVQA), and achieve comparable performance across diverse visual question-answering and reasoning tasks. To facilitate future research, we will release our code, dataset, benchmark, and checkpoints.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Zhu, Chen; Zhao, Wangbo; Zhang, Huiwen; Zhou, Yuhao; Tang, Weidong; Wang, Shuo; Yuan, Zhihang; Shang, Yuzhang; Peng, Xiaojiang; Wang, Kai; Yang, Dawei
EA-ViT: Efficient Adaptation for Elastic Vision Transformer Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Zhu2025,
  title     = {{EA-ViT}: Efficient Adaptation for Elastic Vision Transformer},
  author    = {Chen Zhu and Wangbo Zhao and Huiwen Zhang and Yuhao Zhou and Weidong Tang and Shuo Wang and Zhihang Yuan and Yuzhang Shang and Xiaojiang Peng and Kai Wang and Dawei Yang},
  url       = {https://iccv.thecvf.com/virtual/2025/poster/1084
https://arxiv.org/abs/2507.19360},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Vision Transformer (ViT) has emerged as a foundational model in computer vision, excelling in generalization and adaptation to downstream tasks. However, supporting diverse resource constraints typically requires retraining multiple, size-specific ViTs, which is both time-consuming and expensive. In this paper, we propose Efficient Elastic ViT Adaptation, a single ViT framework that encapsulates multiple submodels of varying sizes, eliminating the need for repeated adaptation. We introduce elastic configurations along four key dimensions—embedding dimension, attention heads, MLP expansion ratio, and layer depth—and a lightweight router that selects the optimal submodel under different computational budgets. Training proceeds in two stages: Staged Elastic Adaptation progressively introduces complexity for efficient joint training of submodels while preserving as much pre-trained knowledge as possible; Subsequently, we integrate the router to refine the model by balancing accuracy and MACs, guiding it to initially focus on a small set of promising submodels for faster convergence within the large design space. Our approach captures an exponentially large family of submodels in a single adaptation process. Extensive experiments demonstrate that, for any resource constraint, the router identifies the best submodel, delivering high performance and reduced overhead compared to previous methods.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Yuan, Zhihang; Xie, Rui; Shang, Yuzhang; Zhang, Hanling; Wang, Siyuan; Yan, Shengen; Dai, Guohao; Wang, Yu
DLFR-Gen: Diffusion-based Video Generation with Dynamic Latent Frame Rate Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Yuan2025,
title = {{DLFR-Gen}: Diffusion-based Video Generation with Dynamic Latent Frame Rate},
author = {Zhihang Yuan and Rui Xie and Yuzhang Shang and Hanling Zhang and Siyuan Wang and Shengen Yan and Guohao Dai and Yu Wang},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Wang, Haoxuan; Zhao, Zhenghao; Wu, Junyi; Shang, Yuzhang; Liu, Gaowen; Yan, Yan
CaO$_2$: Rectifying Inconsistencies in Diffusion-Based Dataset Distillation Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Wang2025c,
title = {{CaO$_2$}: Rectifying Inconsistencies in Diffusion-Based Dataset Distillation},
author = {Haoxuan Wang and Zhenghao Zhao and Junyi Wu and Yuzhang Shang and Gaowen Liu and Yan Yan},
url = {https://arxiv.org/abs/2506.22637},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {The recent introduction of diffusion models in dataset distillation has shown promising potential in creating compact surrogate datasets for large, high-resolution target datasets, offering improved efficiency and performance over traditional bi-level/uni-level optimization methods. However, current diffusion-based dataset distillation approaches overlook the evaluation process and exhibit two critical inconsistencies in the distillation process: (1) Objective Inconsistency, where the distillation process diverges from the evaluation objective, and (2) Condition Inconsistency, leading to mismatches between generated images and their corresponding conditions. To resolve these issues, we introduce Condition-aware Optimization with Objective-guided Sampling (CaO$_2$), a two-stage diffusion-based framework that aligns the distillation process with the evaluation objective. The first stage employs a probability-informed sample selection pipeline, while the second stage refines the corresponding latent representations to improve conditional likelihood. CaO$_2$ achieves state-of-the-art performance on ImageNet and its subsets, surpassing the best-performing baselines by an average of 2.3% accuracy.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Ma, Xuran; Liu, Yexin; Liu, Yaofu; Wu, Xianfeng; Zheng, Mingzhe; Wang, Zihao; Lim, Ser-Nam; Yang, Harry
Model Reveals What to Cache: Profiling-Based Feature Reuse for Video Diffusion Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Ma2025,
title = {Model Reveals What to Cache: Profiling-Based Feature Reuse for Video Diffusion Models},
author = {Xuran Ma and Yexin Liu and Yaofu Liu and Xianfeng Wu and Mingzhe Zheng and Zihao Wang and Ser-Nam Lim and Harry Yang},
url = {https://iccv.thecvf.com/virtual/2025/poster/1031
https://arxiv.org/abs/2504.03140},
year = {2025},
date = {2025-10-19},
urldate = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Video generation using diffusion models has shown remarkable progress, yet it remains computationally expensive due to the repeated processing of redundant features across blocks and steps. To address this, we propose a novel adaptive feature reuse mechanism that dynamically identifies and caches the most informative features by focusing on foreground and caching more on background, significantly reducing computational overhead with less sacrificing video quality. By leveraging the step and block caching, our method achieves up to 1.8× speed up on HunyuanVideo while maintaining competitive performance on Vbench, PSNR, SSIM, FID and LPIPS. Extensive experiments demonstrate that our approach not only improves efficiency but also enhances the quality of generated videos. The proposed method is generalizable and can be integrated into existing diffusion transformer frameworks.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Jang, Young Kyun; Lim, Ser-Nam
Towards Cross-modal Backward-compatible Representation Learning for Vision-Language Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Jang2025,
  title     = {Towards Cross-modal Backward-compatible Representation Learning for Vision-Language Models},
  author    = {Young Kyun Jang and Ser-Nam Lim},
  url       = {https://arxiv.org/abs/2405.14715},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Modern retrieval systems often struggle with upgrading to new and more powerful models due to the incompatibility of embeddings between the old and new models. This necessitates a costly process known as backfilling, which involves re-computing the embeddings for a large number of data samples. In vision, Backward-compatible Training (BT) has been proposed to ensure that the new model aligns with the old model's embeddings. This paper extends the concept of vision-only BT to the field of cross-modal retrieval, marking the first attempt to address Cross-modal BT (XBT). Our goal is to achieve backward-compatibility between Vision-Language Pretraining (VLP) models, such as CLIP, for the cross-modal retrieval task. To address XBT challenges, we propose an efficient solution: a projection module that maps the new model's embeddings to those of the old model. This module, pretrained solely with text data, significantly reduces the number of image-text pairs required for XBT learning, and, once it is pretrained, it avoids using the old model during training. Furthermore, we utilize parameter-efficient training strategies that improve efficiency and preserve the off-the-shelf new model's knowledge by avoiding any modifications. Experimental results on cross-modal retrieval datasets demonstrate the effectiveness of XBT and its potential to enable backfill-free upgrades when a new VLP model emerges.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Pang, Yatian; Zhu, Bin; Lin, Bin; Zheng, Mingzhe; Tay, Francis; Lim, Ser-Nam; Yang, Harry; Yuan, Li
DreamDance: Animating Human Images by Enriching 3D Geometry Cues from 2D Poses Conference
IEEE/CVF International Conference on Computer Vision, 2025.
@conference{Pang2025,
title = {{DreamDance}: Animating Human Images by Enriching {3D} Geometry Cues from {2D} Poses},
author = {Yatian Pang and Bin Zhu and Bin Lin and Mingzhe Zheng and Francis Tay and Ser-Nam Lim and Harry Yang and Li Yuan},
url = {https://pang-yatian.github.io/Dreamdance-webpage/
https://arxiv.org/abs/2412.00397
https://github.com/PKU-YuanGroup/DreamDance
https://pang-yatian.github.io/Dreamdance-webpage/resources/full_video_ids.txt},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {In this work, we present DreamDance, a novel method for animating human images using only skeleton pose sequences as conditional inputs. Existing approaches struggle with generating coherent, high-quality content in an efficient and user-friendly manner. Concretely, baseline methods relying on only 2D pose guidance lack the cues of 3D information, leading to suboptimal results, while methods using 3D representation as guidance achieve higher quality but involve a cumbersome and time-intensive process. To address these limitations, DreamDance enriches 3D geometry cues from 2D poses by introducing an efficient diffusion model, enabling high-quality human image animation with various guidance. Our key insight is that human images naturally exhibit multiple levels of correlation, progressing from coarse skeleton poses to fine-grained geometry cues, and further from these geometry cues to explicit appearance details. Capturing such correlations could enrich the guidance signals, facilitating intra-frame coherency and inter-frame consistency. Specifically, we construct the TikTok-Dance5K dataset, comprising 5K high-quality dance videos with detailed frame annotations, including human pose, depth, and normal maps. Next, we introduce a Mutually Aligned Geometry Diffusion Model to generate fine-grained depth and normal maps for enriched guidance. Finally, a Cross-domain Controller incorporates multi-level guidance to animate human images effectively with a video diffusion model. Extensive experiments demonstrate that our method achieves state-of-the-art performance in animating human images.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}