The 2025 Conference on Empirical Methods in Natural Language Processing (EMNLP 2025) will be held in Suzhou, China from November 4th to November 9th, 2025.
EMNLP 2025 invites the submission of long and short papers featuring substantial, original, and unpublished research on empirical methods for Natural Language Processing. EMNLP 2025 has a goal of curating a diverse technical program—in addition to traditional research results, papers may contribute negative findings, survey an area, announce the creation of a new resource, argue a position, report novel linguistic insights derived using existing computational techniques, and reproduce, or fail to reproduce, previous results.
You can access the CRCV Publications Page for enhanced search capabilities.
Shafique, Bhuiyan Sanjid; Vayani, Ashmal; Maaz, Muhammad; Rasheed, Hanoona Abdul; Dissanayake, Dinura; Kurpath, Mohammed Irfan; Hmaiti, Yahya; Inoue, Go; Lahoud, Jean; Rashid, Md. Safirur; Quasem, Shadid Intisar; Fatima, Maheen; Vidal, Franco; Maslych, Mykola; More, Ketan Pravin; Baliah, Sanoojan; Watawana, Hasindri; Li, Yuhao; Farestam, Fabian; Schaller, Leon; Tymtsiv, Roman; Weber, Simon; Cholakkal, Hisham; Laptev, Ivan; Satoh, Shin'ichi; Felsberg, Michael; Shah, Mubarak; Khan, Salman; Khan, Fahad Shahbaz
A Culturally-diverse Multilingual Multimodal Video Benchmark & Model Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Shafique2025,
  title     = {A Culturally-diverse Multilingual Multimodal Video Benchmark \& Model},
  author    = {Bhuiyan Sanjid Shafique and Ashmal Vayani and Muhammad Maaz and Hanoona Abdul Rasheed and Dinura Dissanayake and Mohammed Irfan Kurpath and Yahya Hmaiti and Go Inoue and Jean Lahoud and Md. Safirur Rashid and Shadid Intisar Quasem and Maheen Fatima and Franco Vidal and Mykola Maslych and Ketan Pravin More and Sanoojan Baliah and Hasindri Watawana and Yuhao Li and Fabian Farestam and Leon Schaller and Roman Tymtsiv and Simon Weber and Hisham Cholakkal and Ivan Laptev and Shin'ichi Satoh and Michael Felsberg and Mubarak Shah and Salman Khan and Fahad Shahbaz Khan},
  url       = {https://mbzuai-oryx.github.io/ViMUL/
https://arxiv.org/abs/2506.07032
https://huggingface.co/datasets/MBZUAI/ViMUL-Bench},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Large multimodal models (LMMs) have recently gained attention due to their effectiveness to understand and generate descriptions of visual content. Most existing LMMs are in English language. While few recent works explore multilingual image LMMs, to the best of our knowledge, moving beyond the English language for cultural and linguistic inclusivity is yet to be investigated in the context of video LMMs. In pursuit of more inclusive video LMMs, we introduce a multilingual Video LMM benchmark, named ViMUL-Bench, to evaluate Video LMMs across 14 languages, including both low and high-resource languages: English, Chinese, Spanish, French, German, Hindi, Arabic, Russian, Bengali, Urdu, Sinhala, Tamil, Swedish, and Japanese. Our ViMUL-Bench is designed to rigorously test video LMMs across 15 categories including eight culturally diverse categories, ranging from lifestyles and festivals to foods and rituals and from local landmarks to prominent cultural personalities. ViMUL-Bench comprises both open-ended (short and long-form) and multiple-choice questions spanning various video durations (short, medium, and long) with 8k samples that are manually verified by native language speakers. In addition, we also introduce a machine translated multilingual video training set comprising 1.2 million samples and develop a simple multilingual video LMM, named ViMUL, that is shown to provide a better tradeoff between high- and low-resource languages for video understanding. We hope our ViMUL-Bench and multilingual video LMM along with a large-scale multilingual video training set will help ease future research in developing cultural and linguistic inclusive multilingual video LMMs. Our proposed benchmark, video LMM and training data will be publicly released.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Saeed, Muhammed; Raza, Shaina; Vayani, Ashmal; Abdul-Mageed, Muhammad; Emami, Ali; Shehata, Shady
Beyond Content: How Grammatical Gender Shapes Visual Representation in Text-to-Image Models Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Saeed2025,
  title     = {Beyond Content: How Grammatical Gender Shapes Visual Representation in Text-to-Image Models},
  author    = {Muhammed Saeed and Shaina Raza and Ashmal Vayani and Muhammad Abdul-Mageed and Ali Emami and Shady Shehata},
  url       = {https://arxiv.org/abs/2508.03199},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Research on bias in Text-to-Image (T2I) models has primarily focused on demographic representation and stereotypical attributes, overlooking a fundamental question: how does grammatical gender influence visual representation across languages? We introduce a cross-linguistic benchmark examining words where grammatical gender contradicts stereotypical gender associations (e.g., ``une sentinelle'' - grammatically feminine in French but referring to the stereotypically masculine concept ``guard''). Our dataset spans five gendered languages (French, Spanish, German, Italian, Russian) and two gender-neutral control languages (English, Chinese), comprising 800 unique prompts that generated 28,800 images across three state-of-the-art T2I models. Our analysis reveals that grammatical gender dramatically influences image generation: masculine grammatical markers increase male representation to 73\% on average (compared to 22\% with gender-neutral English), while feminine grammatical markers increase female representation to 38\% (compared to 28\% in English). These effects vary systematically by language resource availability and model architecture, with high-resource languages showing stronger effects. Our findings establish that language structure itself, not just content, shapes AI-generated visual outputs, introducing a new dimension for understanding bias and fairness in multilingual, multimodal systems.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Agrawal, Aakriti; Aralikatti, Rohith; Satheesh, Anirudh; Chakraborty, Souradip; Bedi, Amrit Singh; Huang, Furong
Uncertainty-Aware Answer Selection for Improved Reasoning in Multi-LLM Systems Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Agrawal2025,
  title     = {Uncertainty-Aware Answer Selection for Improved Reasoning in Multi-LLM Systems},
  author    = {Aakriti Agrawal and Rohith Aralikatti and Anirudh Satheesh and Souradip Chakraborty and Amrit Singh Bedi and Furong Huang},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Liu, Aoming; Miller, Kevin; Saligrama, Venkatesh; Saenko, Kate; Gong, Boqing; Lim, Ser-Nam; Plummer, Bryan A.
Temporal Experts Averaging for Large-scale Temporal Domain Generalization Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Liu2025,
  title     = {Temporal Experts Averaging for Large-scale Temporal Domain Generalization},
  author    = {Aoming Liu and Kevin Miller and Venkatesh Saligrama and Kate Saenko and Boqing Gong and Ser-Nam Lim and Bryan A. Plummer},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Song; Tan, Zhen; Chen, Zihan; Zhou, Shuang; Chen, Tianlong; Li, Jundong
AnyMAC: Cascading Flexible Multi-Agent Collaboration via Next-Agent Prediction Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Wang2025d,
  title     = {AnyMAC: Cascading Flexible Multi-Agent Collaboration via Next-Agent Prediction},
  author    = {Song Wang and Zhen Tan and Zihan Chen and Shuang Zhou and Tianlong Chen and Jundong Li},
  url       = {https://arxiv.org/abs/2506.17784},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Recent progress in large language model (LLM)-based multi-agent collaboration highlights the power of structured communication in enabling collective intelligence. However, existing methods largely rely on static or graph-based inter-agent topologies, lacking the potential adaptability and flexibility in communication. In this work, we propose a new framework that rethinks multi-agent coordination through a sequential structure rather than a graph structure, offering a significantly larger topology space for multi-agent communication. Our method focuses on two key directions: (1) Next-Agent Prediction, which selects the most suitable agent role at each step, and (2) Next-Context Selection (NCS), which enables each agent to selectively access relevant information from any previous step. Together, these components construct task-adaptive communication pipelines that support both role flexibility and global information flow. Extensive evaluations across multiple benchmarks demonstrate that our approach achieves superior performance while substantially reducing communication overhead.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Song; Chen, Zihan; Wang, Peng; Wei, Zhepei; Tan, Zhen; Meng, Yu; Shen, Cong; Li, Jundong
Separate the Wheat from the Chaff: Winnowing Down Divergent Views in Retrieval Augmented Generation Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Wang2025e,
  title     = {Separate the Wheat from the Chaff: Winnowing Down Divergent Views in Retrieval Augmented Generation},
  author    = {Song Wang and Zihan Chen and Peng Wang and Zhepei Wei and Zhen Tan and Yu Meng and Cong Shen and Jundong Li},
  url       = {https://arxiv.org/pdf/2311.01108},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Retrieval-augmented generation (RAG) addresses the limitation of large language models (LLMs) in achieving up-to-date information by integrating external knowledge sources, but it is hindered by noisy or irrelevant retrieved data, leading to reduced accuracy. Additionally, most RAG methods rely on task-specific supervision, reducing their adaptability across domains. To overcome these challenges, we propose WinnowRAG, a novel multi-agent debate-based RAG framework. WinnowRAG operates in two stages: in Stage I, query-aware clustering groups similar documents, with each cluster assigned to an LLM agent for generating personalized responses. A critic LLM then consolidates these answers, forming super-agents. In Stage II, the super-agents engage in a structured discussion to filter out incorrect or irrelevant information, ensuring only relevant knowledge is used for final response generation. Crucially, WinnowRAG is unsupervised and leverages pretrained LLMs without requiring fine-tuning, making it easily adaptable to various tasks. The experiments on various realistic datasets demonstrate the effectiveness of WinnowRAG over state-of-the-art baselines.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Lei, Zhenyu; Tan, Zhen; Wang, Song; Zhu, Yaochen; Chen, Zihan; Dong, Yushun; Li, Jundong
Learning from Diverse Reasoning Paths with Routing and Collaboration Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Lei2025,
  title     = {Learning from Diverse Reasoning Paths with Routing and Collaboration},
  author    = {Zhenyu Lei and Zhen Tan and Song Wang and Yaochen Zhu and Zihan Chen and Yushun Dong and Jundong Li},
  url       = {https://arxiv.org/abs/2508.16861
https://github.com/LzyFischer/Distill},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Advances in large language models (LLMs) significantly enhance reasoning capabilities but their deployment is restricted in resource-constrained scenarios. Knowledge distillation addresses this by transferring knowledge from powerful teacher models to compact and transparent students. However, effectively capturing the teacher's comprehensive reasoning is challenging due to conventional token-level supervision's limited scope. Using multiple reasoning paths per query alleviates this problem, but treating each path identically is suboptimal as paths vary widely in quality and suitability across tasks and models. We propose Quality-filtered Routing with Cooperative Distillation (QR-Distill), combining path quality filtering, conditional routing, and cooperative peer teaching. First, quality filtering retains only correct reasoning paths scored by an LLM-based evaluation. Second, conditional routing dynamically assigns paths tailored to each student's current learning state. Finally, cooperative peer teaching enables students to mutually distill diverse insights, addressing knowledge gaps and biases toward specific reasoning styles. Experiments demonstrate QR-Distill's superiority over traditional single- and multi-path distillation methods. Ablation studies further highlight the importance of each component including quality filtering, conditional routing, and peer teaching in effective knowledge transfer.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Chen, Zihan; Wang, Song; Fu, Xingbo; Shi, Chengshuai; Lei, Zhenyu; Shen, Cong; Li, Jundong
From Cross-Task Examples to In-Task Prompts: A Graph-Based Pseudo-Labeling Framework for In-context Learning Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Chen2025d,
  title     = {From Cross-Task Examples to In-Task Prompts: A Graph-Based Pseudo-Labeling Framework for In-context Learning},
  author    = {Zihan Chen and Song Wang and Xingbo Fu and Chengshuai Shi and Zhenyu Lei and Cong Shen and Jundong Li},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Dongwei; Liu, Zijie; Wang, Song; Ren, Yuxin; Deng, Jianing; Hu, Jingtong; Chen, Tianlong; Yang, Huanrui
FIER: Fine-Grained and Efficient KV Cache Retrieval for Long-context LLM Inference Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Wang2025f,
  title     = {FIER: Fine-Grained and Efficient KV Cache Retrieval for Long-context LLM Inference},
  author    = {Dongwei Wang and Zijie Liu and Song Wang and Yuxin Ren and Jianing Deng and Jingtong Hu and Tianlong Chen and Huanrui Yang},
  url       = {https://arxiv.org/abs/2508.08256},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {The Key-Value (KV) cache reading latency increases significantly with context lengths, hindering the efficiency of long-context LLM inference. To address this, previous works propose retaining a small fraction of KV cache based on token importance. For example, KV eviction uses static heuristics to retain tokens, while KV retrieval dynamically selects query-relevant tokens for more adaptive cache management. However, we observe that important tokens are often sparsely distributed across the long context. This sparsity makes existing page-level KV retrieval inaccurate, as each page may include irrelevant tokens and miss critical ones. In this work, we propose Fier, a \underline{Fi}ne-Grained and \underline{E}fficient KV cache \underline{R}etrieval method. Fier uses 1-bit quantized keys to estimate the importance of each token, resulting in efficient and precise retrieval. Experiments show that Fier matches full KV performance using only 11\% of the cache budget across various long-context tasks, reducing decoding latency by 1.2$\times$ to 1.5$\times$.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Zheng, Zaiyi; Wang, Song; Chen, Zihan; Zhu, Yaochen; He, Yinhan; Hong, Liangjie; Guo, Qi; Li, Jundong
CoRAG: Enhancing Hybrid Retrieval-Augmented Generation through a Cooperative Retriever Architecture Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Zheng2025,
  title     = {CoRAG: Enhancing Hybrid Retrieval-Augmented Generation through a Cooperative Retriever Architecture},
  author    = {Zaiyi Zheng and Song Wang and Zihan Chen and Yaochen Zhu and Yinhan He and Liangjie Hong and Qi Guo and Jundong Li},
  url       = {https://arxiv.org/abs/2504.01883},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Retrieval-Augmented Generation (RAG) models excel in knowledge-intensive tasks, especially under few-shot learning constraints. We introduce CoRAG, a framework extending RAG to collaborative settings, where clients jointly train a shared model using a collaborative passage store. To evaluate CoRAG, we introduce CRAB, a benchmark for collaborative homogeneous open-domain question answering. Our experiments demonstrate that CoRAG consistently outperforms both parametric collaborative learning methods and locally trained RAG models in low-resource scenarios. Further analysis reveals the critical importance of relevant passages within the shared store, the surprising benefits of incorporating irrelevant passages, and the potential for hard negatives to negatively impact performance. This introduces a novel consideration in collaborative RAG: the trade-off between leveraging a collectively enriched knowledge base and the potential risk of incorporating detrimental passages from other clients. Our findings underscore the viability of CoRAG, while also highlighting key design challenges and promising avenues for future research.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Salvador, John; Bansal, Naman; Akter, Mousumi; Sarkar, Souvika; Das, Anupam; Karmaker, Santu
Benchmarking LLMs on the Semantic Overlap Summarization Task Conference
Empirical Methods in Natural Language Processing, 2025.
@conference{Salvador2025,
  title     = {Benchmarking LLMs on the Semantic Overlap Summarization Task},
  author    = {John Salvador and Naman Bansal and Mousumi Akter and Souvika Sarkar and Anupam Das and Santu Karmaker},
  url       = {https://arxiv.org/abs/2402.17008
https://anonymous.4open.science/r/llm_eval-E16D/README.md},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Semantic Overlap Summarization (SOS) is a constrained multi-document summarization task, where the constraint is to capture the common/overlapping information between two alternative narratives. In this work, we perform a benchmarking study of popular Large Language Models (LLMs) exclusively on the SOS task. Additionally, we introduce the PrivacyPolicyPairs (3P) dataset to expand the space of SOS benchmarks in terms of quantity and variety. This dataset provides 135 high-quality SOS data samples sourced from privacy policy documents. We then use a standard prompting taxonomy called TELeR to create and evaluate 905,216 distinct LLM-generated summaries over two SOS datasets from different domains, and we further conduct human evaluation on a subset of 540 samples. We conclude the paper by analyzing models' performances and the reliability of automatic evaluation. The code and datasets used to conduct this study are available at https://anonymous.4open.science/r/llm\_eval-E16D/README.md},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}