UCF researchers had 15 papers accepted into the Thirteenth International Conference on Learning Representations (ICLR), which will be held at Singapore Expo from Thursday, April 24 through Monday, April 28.
The International Conference on Learning Representations (ICLR) is the premier gathering of professionals dedicated to the advancement of the branch of artificial intelligence called representation learning, but generally referred to as deep learning.
ICLR is globally renowned for presenting and publishing cutting-edge research on all aspects of deep learning used in the fields of artificial intelligence, statistics and data science, as well as important application areas such as machine vision, computational biology, speech recognition, text understanding, gaming, and robotics.
Participants at ICLR span a wide range of backgrounds, from academic and industrial researchers, to entrepreneurs and engineers, to graduate students and postdocs.
The h5-index is the h-index for articles published in the last 5 complete years. According to Google Scholar Metrics, ICLR is 10th overall and ranked 4th in the Engineering & Computer Science subcategory as well as 2nd in the Artificial Intelligence subcategory in the h5-index rankings.
You can access the CRCV Publications Page for enhanced search capabilities.
Kang, Weitai; Qu, Mengxue; Kini, Jyoti; Wei, Yunchao; Shah, Mubarak; Yan, Yan
Intent3D: 3D Object Detection in RGB-D Scans Based on Human Intention Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Kang2025,
  title     = {Intent3D: 3D Object Detection in RGB-D Scans Based on Human Intention},
  author    = {Weitai Kang and Mengxue Qu and Jyoti Kini and Yunchao Wei and Mubarak Shah and Yan Yan},
  url       = {https://openreview.net/forum?id=5GgjiRzYp3&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DICLR.cc%2F2025%2FConference%2FAuthors%23your-submissions)
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/3450_Intent3D_3D_Object_Detect.pdf.pdf},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {In real-life scenarios, humans seek out objects in the 3D world to fulfill their daily needs or intentions. This inspires us to introduce 3D intention grounding, a new task in 3D object detection employing RGB-D, based on human intention, such as “I want something to support my back.” Closely related, 3D visual grounding focuses on understanding human reference. To achieve detection based on human intention, it relies on humans to observe the scene, reason out the target that aligns with their intention (“pillow” in this case), and finally provide a reference to the AI system, such as “A pillow on the couch”. Instead, 3D intention grounding challenges AI agents to automatically observe, reason and detect the desired target solely based on human intention. To tackle this challenge, we introduce the new Intent3D dataset, consisting of 44,990 intention texts associated with 209 fine-grained classes from 1,042 scenes of the ScanNet [Dai et al., 2017] dataset. We also establish several baselines based on different language-based 3D object detection models on our benchmark. Finally, we propose IntentNet, our unique approach, designed to tackle this intention-based detection problem. It focuses on three key aspects: intention understanding, reasoning to identify object candidates, and cascaded adaptive learning that leverages the intrinsic priority logic of different losses for multiple objective optimization.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Chhipa, Prakash Chandra; Vashishtha, Gautam; Jithamanyu, Settur; Saini, Rajkumar; Shah, Mubarak; Liwicki, Marcus
ASTrA: Adversarial Self-supervised Training with Adaptive-Attacks Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Chhipa2025,
  title     = {ASTrA: Adversarial Self-supervised Training with Adaptive-Attacks},
  author    = {Prakash Chandra Chhipa and Gautam Vashishtha and Settur Jithamanyu and Rajkumar Saini and Mubarak Shah and Marcus Liwicki},
  url       = {https://prakashchhipa.github.io/projects/ASTrA
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7338_ASTrA_Adversarial_Self_su.pdf.pdf},
  year      = {2025},
  date      = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Existing self-supervised adversarial training (self-AT) methods rely on handcrafted adversarial attack strategies for PGD attacks, which fail to adapt to the evolving learning dynamics of the model and do not account for instance-specific characteristics of images. This results in sub-optimal adversarial robustness and limits the alignment between clean and adversarial data distributions. To address this, we propose ASTrA (Adversarial Self-supervised Training with Adaptive-Attacks), a novel framework introducing a learnable, self-supervised attack strategy network that autonomously discovers optimal attack parameters through exploration-exploitation in a single training episode. ASTrA leverages a reward mechanism based on contrastive loss, optimized with REINFORCE, enabling adaptive attack strategies without labeled data or additional hyperparameters. We further introduce a mixed contrastive objective to align the distribution of clean and adversarial examples in representation space. ASTrA achieves state-of-the-art results on CIFAR10, CIFAR100, and STL10 while integrating seamlessly as a plug-and-play module for other self-AT methods. ASTrA shows scalability to larger datasets, demonstrates strong semi-supervised performance, and is resilient to robust overfitting, backed by explainability analysis on optimal attack strategies. Project page for source code and other details at https://prakashchhipa.github.io/projects/ASTrA.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Fioresi, Joseph; Dave, Ishan Rajendrakumar; Shah, Mubarak
ALBAR: Adversarial Learning approach to mitigate Biases in Action Recognition Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Fioresi2025,
  title     = {ALBAR: Adversarial Learning approach to mitigate Biases in Action Recognition},
  author    = {Joseph Fioresi and Ishan Rajendrakumar Dave and Mubarak Shah},
  url       = {https://openreview.net/forum?id=9KiE3t6CsL&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DICLR.cc%2F2025%2FConference%2FAuthors%23your-submissions)
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7609_ALBAR_Adversarial_Learnin.pdf},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Bias in machine learning models can lead to unfair decision making, and while it has been well-studied in the image and text domains, it remains underexplored in action recognition. Action recognition models often suffer from background bias (i.e., inferring actions based on background cues) and foreground bias (i.e., relying on subject appearance), which can be detrimental to real-life applications such as autonomous vehicles or assisted living monitoring. While prior approaches have mainly focused on mitigating background bias using specialized augmentations, we thoroughly study both biases. We propose ALBAR, a novel adversarial training method that mitigates foreground and background biases without requiring specialized knowledge of the bias attributes. Our framework applies an adversarial cross-entropy loss to the sampled static clip (where all the frames are the same) and aims to make its class probabilities uniform using a proposed \textit{entropy maximization} loss. Additionally, we introduce a \textit{gradient penalty} loss for regularization against the debiasing process. We evaluate our method on established background and foreground bias protocols, setting a new state-of-the-art and strongly improving combined debiasing performance by over \textbf{12\%} on HMDB51. Furthermore, we identify an issue of background leakage in the existing UCF101 protocol for bias evaluation which provides a shortcut to predict actions and does not provide an accurate measure of the debiasing capability of a model. We address this issue by proposing more fine-grained segmentation boundaries for the actor, where our method also outperforms existing approaches.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Cui, Yuning; Zamir, Syed Waqas; Khan, Salman; Knoll, Alois; Shah, Mubarak; Khan, Fahad Shahbaz
AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and Modulation Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Cui2025,
  title     = {AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and Modulation},
  author    = {Yuning Cui and Syed Waqas Zamir and Salman Khan and Alois Knoll and Mubarak Shah and Fahad Shahbaz Khan},
  url       = {https://openreview.net/forum?id=M5t0WvjfCg&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DICLR.cc%2F2025%2FConference%2FAuthors%23your-submissions)
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/8311_AdaIR_Adaptive_All_in_One.pdf.pdf},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {In the image acquisition process, various forms of degradation, including noise, blur, haze, and rain, are frequently introduced. These degradations typically arise from the inherent limitations of cameras or unfavorable ambient conditions. To recover clean images from their degraded versions, numerous specialized restoration methods have been developed, each targeting a specific type of degradation. Recently, all-in-one algorithms have garnered significant attention by addressing different types of degradations within a single model without requiring the prior information of the input degradation type. However, most methods purely operate in the spatial domain and do not delve into the distinct frequency variations inherent to different degradation types. To address this gap, we propose an adaptive all-in-one image restoration network based on frequency mining and modulation. Our approach is motivated by the observation that different degradation types impact the image content on different frequency subbands, thereby requiring different treatments for each restoration task. Specifically, we first mine low- and high-frequency information from the input features, guided by the adaptively decoupled spectra of the degraded image. The extracted features are then modulated by a bidirectional operator to facilitate interactions between different frequency components. Finally, the modulated features are merged into the original input for a progressively guided restoration. With this approach, the model achieves adaptive reconstruction by accentuating the informative frequency subbands according to different input degradations. Extensive experiments demonstrate that the proposed method, named AdaIR, achieves state-of-the-art performance on different image restoration tasks, including image denoising, dehazing, deraining, motion deblurring, and low-light image enhancement. Our code and models will be made publicly available.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Chen, Chen; Liu, Daochang; Shah, Mubarak; Xu, Chang
Exploring Local Memorization in Diffusion Models via Bright Ending Attention Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Chen2025,
  title     = {Exploring Local Memorization in Diffusion Models via Bright Ending Attention},
  author    = {Chen Chen and Daochang Liu and Mubarak Shah and Chang Xu},
  url       = {https://openreview.net/forum?id=p4cLtzk4oe&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DICLR.cc%2F2025%2FConference%2FAuthors%23your-submissions)
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/9283_Exploring_Local_Memorizat.pdf},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {In this paper, we identify and leverage a novel `bright ending' (BE) anomaly in diffusion models prone to memorizing training images to address a new task: locating localized memorization regions within these models. BE refers to a distinct cross-attention pattern observed in text-to-image generations using diffusion models. Specifically, memorized image patches exhibit significantly greater attention to the end token during the final inference step compared to non-memorized patches. This attention map effectively highlights regions where the generated image replicates training data. Furthermore, driven by our observation that local memorization significantly underperforms in existing tasks of measuring, detecting, and mitigating memorization in diffusion models compared to global memorization, we propose a simple yet effective method to integrate BE and the results of the new localization task into these existing frameworks. This integration effectively improves their performances by narrowing the performance gap caused by local memorization. Our results not only demonstrate the successful execution of the new localization task but also establish new state-of-the-art performance across all existing tasks, underscoring the significance of the BE phenomenon.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Monsefi, Amin Karimi; Zhou, Mengxi; Monsefi, Nastaran Karimi; Lim, Ser-Nam; Chao, Wei-Lun; Ramnath, Rajiv
Frequency-Guided Masking for Enhanced Vision Self-Supervised Learning Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Monsefi2025,
  title     = {Frequency-Guided Masking for Enhanced Vision Self-Supervised Learning},
  author    = {Amin Karimi Monsefi and Mengxi Zhou and Nastaran Karimi Monsefi and Ser-Nam Lim and Wei-Lun Chao and Rajiv Ramnath},
  url       = {https://arxiv.org/abs/2409.10362
https://arxiv.org/pdf/2409.10362},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {We present a novel frequency-based Self-Supervised Learning (SSL) approach that significantly enhances its efficacy for pre-training. Prior work in this direction masks out pre-defined frequencies in the input image and employs a reconstruction loss to pre-train the model. While achieving promising results, such an implementation has two fundamental limitations as identified in our paper. First, using pre-defined frequencies overlooks the variability of image frequency responses. Second, pre-trained with frequency-filtered images, the resulting model needs relatively more data to adapt to naturally looking images during fine-tuning. To address these drawbacks, we propose FOurier transform compression with seLf-Knowledge distillation (FOLK), integrating two dedicated ideas. First, inspired by image compression, we adaptively select the masked-out frequencies based on image frequency responses, creating more suitable SSL tasks for pre-training. Second, we employ a two-branch framework empowered by knowledge distillation, enabling the model to take both the filtered and original images as input, largely reducing the burden of downstream tasks. Our experimental results demonstrate the effectiveness of FOLK in achieving competitive performance to many state-of-the-art SSL methods across various downstream tasks, including image classification, few-shot learning, and semantic segmentation.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Kumar, Akash; Kira, Zsolt; Rawat, Yogesh
Contextual Self-paced Learning for Weakly Supervised Spatio-Temporal Video Grounding Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Kumar2025,
  title     = {Contextual Self-paced Learning for Weakly Supervised Spatio-Temporal Video Grounding},
  author    = {Akash Kumar and Zsolt Kira and Yogesh Rawat},
  url       = {https://arxiv.org/pdf/2501.17053
https://github.com/AKASH2907/copsal-weakly-stvg
https://akash2907.github.io/cospal_webpage/},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {In this work, we focus on Weakly Supervised Spatio-Temporal Video Grounding (WSTVG). It is a multimodal task aimed at localizing specific subjects spatiotemporally based on textual queries without bounding box supervision. Motivated by recent advancements in multi-modal foundation models for grounding tasks, we first explore the potential of state-of-the-art object detection models for WSTVG. Despite their robust zero-shot capabilities, our adaptation reveals significant limitations, including inconsistent temporal predictions, inadequate understanding of complex queries, and challenges in adapting to difficult scenarios. We propose CoSPaL (Contextual Self-Paced Learning), a novel approach which is designed to overcome these limitations. CoSPaL integrates three core components: (1) Tubelet Phrase Grounding (TPG), which introduces spatio-temporal prediction by linking textual queries to tubelets; (2) Contextual Referral Grounding (CRG), which improves comprehension of complex queries by extracting contextual information to refine object identification over time; and (3) Self-Paced Scene Understanding (SPS), a training paradigm that progressively increases task difficulty, enabling the model to adapt to complex scenarios by transitioning from coarse to fine-grained understanding. We demonstrate the effectiveness of CoSPaL on three benchmark WSTVG datasets, achieving a 3.9% absolute improvement on VidSTG and a 7.9% improvement on HCSTVG-v1.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Tang, Feilong; Huang, Zile; Liu, Chengzhi; Sun, Qiang; Yang, Harry; Lim, Ser-Nam
Intervening Anchor Token: Decoding Strategy in Alleviating Hallucinations for MLLMs Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Tang2025,
  title     = {Intervening Anchor Token: Decoding Strategy in Alleviating Hallucinations for MLLMs},
  author    = {Feilong Tang and Zile Huang and Chengzhi Liu and Qiang Sun and Harry Yang and Ser-Nam Lim},
  url       = {https://openreview.net/forum?id=zGb4WgCW5i
https://openreview.net/pdf?id=zGb4WgCW5i},
  year      = {2025},
  date      = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Multimodal large language models (MLLMs) offer a powerful mechanism for interpreting visual information. However, they often suffer from hallucinations, which impede the real-world usage of these models. Existing methods attempt to alleviate this issue by designing special decoding strategies that penalize the summary tokens. However, these methods lack analysis of the relationship between hallucination and summarization mechanism of LLMs. Interestingly, we find that penalizing summary tokens is not necessary: merely intervening the query-key parameters variance, without costing extra inference time, still alleviates hallucinations. Specifically, we explore the causes of hallucinations by analyzing localized self-attention patterns called ``anchor'' tokens and define the attention localization degree of the model as token propagation probabilities. Our analysis reveals that over-propagation of anchor tokens occurs when the distribution of eigenvalues of the query and key matrices has a non-zero mean and a polarized variance, leading to excessive dependence on anchor tokens while neglecting vision information and describes the image content with hallucination. Based on the observation, we propose a versatile plug-and-play decoding strategy, Dynamic Token Propagation Mechanism (TAME), to alleviate excessive propagation by dynamically intervening the eigenspectrum variance of the attention weight, thereby alleviating hallucinations without relying on complex decoding strategies. Extensive experiments reveal a correlation between the eigenspectrum and hallucinations across various MLLMs, and show that TAME reduces the percentage of hallucinated objects.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Huang, Jiani; Li, Ziyang; Naik, Mayur; Lim, Ser-Nam
LASER: A Neuro-Symbolic Framework for Learning Spatio-Temporal Scene Graphs with Weak Supervision Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Huang2025,
  title     = {LASER: A Neuro-Symbolic Framework for Learning Spatio-Temporal Scene Graphs with Weak Supervision},
  author    = {Jiani Huang and Ziyang Li and Mayur Naik and Ser-Nam Lim},
  url       = {https://arxiv.org/abs/2304.07647
https://arxiv.org/pdf/2304.07647},
  year      = {2025},
  date      = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {We propose LASER, a neuro-symbolic approach to learn semantic video representations that capture rich spatial and temporal properties in video data by leveraging high-level logic specifications. In particular, we formulate the problem in terms of alignment between raw videos and spatio-temporal logic specifications. The alignment algorithm leverages a differentiable symbolic reasoner and a combination of contrastive, temporal, and semantics losses. It effectively and efficiently trains low-level perception models to extract a fine-grained video representation in the form of a spatio-temporal scene graph that conforms to the desired high-level specification. To practically reduce the manual effort of obtaining ground truth labels, we derive logic specifications from captions by employing a large language model with a generic prompting template. In doing so, we explore a novel methodology that weakly supervises the learning of spatio-temporal scene graphs with widely accessible video-caption data. We evaluate our method on three datasets with rich spatial and temporal specifications: 20BN-Something-Something, MUGEN, and OpenPVSG. We demonstrate that our method learns better fine-grained video semantics than existing baselines.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Pathak, Priyank; Marjit, Shyam; Vyas, Shruti; Rawat, Yogesh
LR0.FM: Low-Resolution Zero-Shot Classification Benchmark for Foundation Models Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Pathak2025,
  title     = {LR0.FM: Low-Resolution Zero-Shot Classification Benchmark for Foundation Models},
  author    = {Priyank Pathak and Shyam Marjit and Shruti Vyas and Yogesh Rawat},
  url       = {https://arxiv.org/pdf/2502.03950
https://ucf-crcv.github.io/lr0.fm/
https://github.com/shyammarjit/LR0.FM},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Visual-language foundation Models (FMs) exhibit remarkable zero-shot generalization across diverse tasks, largely attributed to extensive pre-training on largescale datasets. However, their robustness on low-resolution/pixelated (LR) images, a common challenge in real-world scenarios, remains underexplored. We introduce LR0.FM, a comprehensive benchmark evaluating the impact of low resolution on the zero-shot classification performance of 10 FM(s) across 66 backbones and 15 datasets. We propose a novel metric, Weighted Aggregated Robustness, to address the limitations of existing metrics and better evaluate model performance across resolutions and datasets. Our key findings show that: (i) model size positively correlates with robustness to resolution degradation, (ii) pre-training dataset quality is more important than its size, and (iii) fine-tuned and higher resolution models are less robust against LR. Our analysis further reveals that the model makes semantically reasonable predictions at LR, and the lack of fine-grained details in input adversely impacts the model's initial layers more than the deeper layers. We use these insights and introduce a simple strategy, LR-TK0, to enhance the robustness of models without compromising their pre-trained weights. We demonstrate the effectiveness of LR-TK0 for robustness against low-resolution across several datasets and its generalization capability across backbones and other approaches.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Zhang, Chi; Farhat, Zain Ulabedeen; Atia, George K.; Wang, Yue
Model-Free Offline Reinforcement Learning with Enhanced Robustness Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Zhang2025,
  title         = {Model-Free Offline Reinforcement Learning with Enhanced Robustness},
  author        = {Chi Zhang and Zain Ulabedeen Farhat and George K. Atia and Yue Wang},
  url           = {https://openreview.net/forum?id=QyVLJ7EnAC
https://openreview.net/pdf?id=QyVLJ7EnAC},
  year          = {2025},
  date          = {2025-04-24},
  booktitle     = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract      = {Offline reinforcement learning (RL) has gained considerable attention for its ability to learn policies from pre-collected data without real-time interaction, which makes it particularly useful for high-risk applications. However, due to its reliance on offline datasets, existing works inevitably introduce assumptions to ensure effective learning, which, however, often lead to a trade-off between robustness to model mismatch and scalability to large environments. In this paper, we enhance both aspects with a novel double-pessimism principle, which conservatively estimates performance and accounts for both limited data and potential model mismatches, two major reasons for the previous trade-off. We then propose a universal, model-free algorithm to learn an optimal policy that is robust to potential environment mismatches, which enhances robustness in a scalable manner. Furthermore, we provide a sample complexity analysis of our algorithm when the mismatch is modeled by the -norm, which also theoretically demonstrates the efficiency of our method. Extensive experiments further demonstrate that our approach significantly improves robustness in a more scalable manner than existing methods.},
  internal-note = {NOTE(review): a math symbol was lost before "-norm" during extraction (likely an l_p-style norm); confirm against the published abstract},
  keywords      = {},
  pubstate      = {published},
  tppubtype     = {conference}
}
-norm, which also theoretically demonstrates the efficiency of our method. Extensive experiments further demonstrate that our approach significantly improves robustness in a more scalable manner than existing methods.
Alom, Zulfikar; Ngo, Tran Gia Bao; Kantarcioglu, Murat; Akcora, Cuneyt Gurcan
GOttack: Universal Adversarial Attacks on Graph Neural Networks via Graph Orbits Learning Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Alom2025,
  title     = {GOttack: Universal Adversarial Attacks on Graph Neural Networks via Graph Orbits Learning},
  author    = {Zulfikar Alom and Tran Gia Bao Ngo and Murat Kantarcioglu and Cuneyt Gurcan Akcora},
  url       = {https://openreview.net/forum?id=YbURbViE7l
https://openreview.net/pdf?id=YbURbViE7l
https://github.com/cakcora/GOttack},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-24},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Graph Neural Networks (GNNs) have demonstrated superior performance in node classification tasks across diverse applications. However, their vulnerability to adversarial attacks, where minor perturbations can mislead model predictions, poses significant challenges. This study introduces GOttack, a novel adversarial attack framework that exploits the topological structure of graphs to undermine the integrity of GNN predictions systematically.
By defining a topology-aware method to manipulate graph orbits, our approach generates adversarial modifications that are both subtle and effective, posing a severe test to the robustness of GNNs. We evaluate the efficacy of GOttack across multiple prominent GNN architectures using standard benchmark datasets. Our results show that GOttack outperforms existing state-of-the-art adversarial techniques and completes training in approximately 55% of the time required by the fastest competing model, achieving the highest average misclassification rate in 155 tasks. This work not only sheds light on the susceptibility of GNNs to structured adversarial attacks but also shows that certain topological patterns may play a significant role in the underlying robustness of the GNNs. Our Python implementation is shared at https://github.com/cakcora/GOttack.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
By defining a topology-aware method to manipulate graph orbits, our approach generates adversarial modifications that are both subtle and effective, posing a severe test to the robustness of GNNs. We evaluate the efficacy of GOttack across multiple prominent GNN architectures using standard benchmark datasets. Our results show that GOttack outperforms existing state-of-the-art adversarial techniques and completes training in approximately 55% of the time required by the fastest competing model, achieving the highest average misclassification rate in 155 tasks. This work not only sheds light on the susceptibility of GNNs to structured adversarial attacks but also shows that certain topological patterns may play a significant role in the underlying robustness of the GNNs. Our Python implementation is shared at https://github.com/cakcora/GOttack.
Peng, Qucheng; Planche, Benjamin; Gao, Zhongpai; Zheng, Meng; Choudhuri, Anwesa; Chen, Terrence; Chen, Chen; Wu, Ziyan
3D Vision-Language Gaussian Splatting Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Peng2025,
  title     = {3D Vision-Language Gaussian Splatting},
  author    = {Qucheng Peng and Benjamin Planche and Zhongpai Gao and Meng Zheng and Anwesa Choudhuri and Terrence Chen and Chen Chen and Ziyan Wu},
  url       = {https://iclr.cc/virtual/2025/poster/29604},
  year      = {2025},
  date      = {2025-04-24},
  urldate   = {2025-04-04},
  booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Recent advancements in 3D reconstruction methods and vision-language models have propelled the development of multi-modal 3D scene understanding, which has vital applications in robotics, autonomous driving, and virtual/augmented reality. However, current multi-modal scene understanding approaches have naively embedded semantic representations into 3D reconstruction methods without striking a balance between visual and language modalities, which leads to unsatisfying semantic rasterization of translucent or reflective objects, as well as over-fitting on color modality. To alleviate these limitations, we propose a solution that adequately handles the distinct visual and semantic modalities, i.e., a 3D vision-language Gaussian splatting model for scene understanding, to put emphasis on the representation learning of language modality. We propose a novel cross-modal rasterizer, using modality fusion along with a smoothed semantic indicator for enhancing semantic rasterization. We also employ a camera-view blending technique to improve semantic consistency between existing and synthesized views, thereby effectively mitigating over-fitting. Extensive experiments demonstrate that our method achieves state-of-the-art performance in open-vocabulary semantic segmentation, surpassing existing methods by a significant margin.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Lai, Zhengfeng; Saveris, Vasileios; Chen, Chen; Chen, Hong-You; Zhang, Haotian; Zhang, Bowen; Hu, Wenze; Tebar, Juan; Gan, Zhe; Grasch, Peter; Cao, Meng; Yang, Yinfei
Revisit Large-Scale Image-Caption Data in Pre-training Multimodal Foundation Models Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Lai2025,
  title     = {Revisit Large-Scale Image-Caption Data in Pre-training Multimodal Foundation Models},
  author    = {Zhengfeng Lai and Vasileios Saveris and Chen Chen and Hong-You Chen and Haotian Zhang and Bowen Zhang and Wenze Hu and Juan Tebar and Zhe Gan and Peter Grasch and Meng Cao and Yinfei Yang},
  url       = {https://iclr.cc/virtual/2025/poster/29536},
  year      = {2025},
  date      = {2025-04-24},
  publisher = {Thirteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Recent advancements in multimodal models highlight the value of rewritten captions for improving performance, yet key challenges remain. For example, while synthetic captions often provide superior quality and image-text alignment, it is not clear whether they can fully replace AltTexts: the role of synthetic captions and their interaction with original web-crawled AltTexts in pre-training is still not well understood. Moreover, different multimodal foundation models may have unique preferences for specific caption formats, but efforts to identify the optimal captions for each model remain limited. In this work, we propose a novel, controllable, and scalable captioning pipeline designed to generate diverse caption formats tailored to various multimodal models. By examining short synthetic captions (SSC) and descriptive synthetic captions (DSC) as case studies, we systematically explore their effects and interactions with AltTexts across models such as CLIP, multimodal LLMs, and diffusion models. Our findings reveal that a hybrid approach that keeps both synthetic captions and AltTexts can outperform the use of synthetic captions alone, improving both alignment and performance, with each model demonstrating preferences for particular caption formats. This comprehensive analysis provides valuable insights into optimizing captioning strategies, thereby advancing the pre-training of multimodal foundation models.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Zhang, Yancheng; Xue, Jiaqi; Zheng, Mengxin; Xie, Mimi; Zhang, Mingzhe; Jiang, Lei; Lou, Qian
CipherPrune: Efficient and Scalable Private Transformer Inference Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
@conference{Zhang2025b,
title = {{CipherPrune}: Efficient and Scalable Private Transformer Inference},
author = {Yancheng Zhang and Jiaqi Xue and Mengxin Zheng and Mimi Xie and Mingzhe Zhang and Lei Jiang and Qian Lou},
url = {https://openreview.net/pdf?id=mUMvr33FTu
https://github.com/UCF-Lou-Lab-PET/cipher-prune-inference},
year = {2025},
date = {2025-04-24},
publisher = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {Private Transformer inference using cryptographic protocols offers promising solutions for privacy-preserving machine learning; however, it still faces significant runtime overhead (efficiency issues) and challenges in handling long-token inputs (scalability issues). We observe that the Transformer’s operational complexity scales quadratically with the number of input tokens, making it essential to reduce the input token length. Notably, each token varies in importance, and many inputs contain redundant tokens. Additionally, prior private inference methods that rely on high-degree polynomial approximations for non-linear activations are computationally expensive. Therefore, reducing the polynomial degree for less important tokens can significantly accelerate private inference. Building on these observations, we propose CipherPrune, an efficient and scalable private inference framework that includes a secure encrypted token pruning protocol, a polynomial reduction protocol, and corresponding Transformer network optimizations. At the protocol level, encrypted token pruning adaptively removes unimportant tokens from encrypted inputs in a progressive, layer-wise manner. Additionally, encrypted polynomial reduction assigns lower-degree polynomials to less important tokens after pruning, enhancing efficiency without decryption. At the network level, we introduce protocol-aware network optimization via a gradient-based search to maximize pruning thresholds and polynomial reduction conditions while maintaining the desired accuracy. Our experiments demonstrate that CipherPrune reduces the execution overhead of private Transformer inference by approximately 6.1× for 128-token inputs and 10.6× for 512-token inputs, compared to previous methods, with only a marginal drop in accuracy. The code is publicly available at https://github.com/UCF-Lou-Lab-PET/cipher-prune-inference.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}