The Neural Information Processing Systems (NeurIPS) annual meeting fosters the exchange of research on neural information processing systems in their biological, technological, mathematical, and theoretical aspects. The core focus is peer-reviewed novel research which is presented and discussed in the general session, along with invited talks by leaders in their fields.
In 2022, UCF’s Center for Research in Computer Vision (CRCV) had three papers accepted to the NeurIPS conference.
You can access the CRCV Publications Page for enhanced search capabilities.
2022
Rana, Aayush; Rawat, Yogesh
Are all Frames Equal? Active Sparse Labeling for Video Action Detection Conference
36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022.
@conference{Rana2022,
title = {Are all Frames Equal? Active Sparse Labeling for Video Action Detection},
author = {Aayush Rana and Yogesh Rawat},
url = {https://www.crcv.ucf.edu/research/projects/active-sparse-labeling-for-video-action-detection/
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/neurips_poster_ASL_upload.png
https://github.com/aayushjr/ASL-video},
year = {2022},
date = {2022-11-28},
urldate = {2022-11-28},
publisher = {36th Conference on Neural Information Processing Systems (NeurIPS 2022)},
abstract = {Video action detection requires annotations at every frame, which drastically increases the labeling cost. In this work, we focus on efficient labeling of videos for action detection to minimize this cost. We propose active sparse labeling (ASL), a novel active learning strategy for video action detection. We propose a novel frame-level scoring mechanism aimed at selecting the most informative frames in a video. We also introduce a novel loss formulation which enables training of action detection model with these sparsely selected frames. We evaluated the proposed approach on two different action detection benchmark datasets, UCF-101-24 and J-HMDB-21, and observed that active sparse labeling can be very effective in saving annotation costs. We demonstrate that the proposed approach performs better than random selection, outperforming all other baselines, with performance comparable to supervised approach using merely 10% annotations.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Schiappa, Madeline Chantry; Vyas, Shruti; Palangi, Hamid; Rawat, Yogesh; Vineet, Vibhav
Robustness Analysis of Video-Language Models Against Visual and Language Perturbations Conference
36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022.
@conference{Schiappa2022,
title = {Robustness Analysis of Video-Language Models Against Visual and Language Perturbations},
author = {Madeline Chantry Schiappa and Shruti Vyas and Hamid Palangi and Yogesh Rawat and Vibhav Vineet},
url = {https://sites.google.com/view/videolanguagerobustness/home
https://openreview.net/forum?id=A79jAS4MeW9
https://github.com/Maddy12/VideoLanguageModelRobustness/tree/master},
year = {2022},
date = {2022-11-28},
urldate = {2022-11-28},
publisher = {36th Conference on Neural Information Processing Systems (NeurIPS 2022)},
abstract = {Joint visual and language modeling on large-scale datasets has recently shown good progress in multi-modal tasks when compared to single modal learning. However, robustness of these approaches against real-world perturbations has not been studied. In this work, we perform the first extensive robustness study of video-language models against various real-world perturbations. We focus on text-to-video retrieval and propose two large-scale benchmark datasets, MSRVTT-P and YouCook2-P, which utilize 90 different visual and 35 different text perturbations. The study reveals some interesting initial findings from the studied models: 1) models are more robust when text is perturbed versus when video is perturbed, 2) models that are pre-trained are more robust than those trained from scratch, 3) models attend more to scene and objects rather than motion and action. We hope this study will serve as a benchmark and guide future research in robust video-language learning. The benchmark introduced in this study along with the code and datasets is available at https://bit.ly/3CNOly4.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Xu, Ziwei; Rawat, Yogesh; Wong, Yongkang; Kankanhalli, Mohan; Shah, Mubarak
Don’t Pour Cereal into Coffee: Differentiable Temporal Logic for Temporal Action Segmentation Conference
36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022.
@conference{Xu2022,
  title     = {Don’t Pour Cereal into Coffee: Differentiable Temporal Logic for Temporal Action Segmentation},
  author    = {Ziwei Xu and Yogesh Rawat and Yongkang Wong and Mohan Kankanhalli and Mubarak Shah},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ziwei_neurips2022.pdf
https://diff-tl.github.io/
https://github.com/ZiweiXU/DTL-action-segmentation},
  year      = {2022},
  date      = {2022-11-09},
  urldate   = {2022-11-09},
  publisher = {36th Conference on Neural Information Processing Systems (NeurIPS 2022)},
  abstract  = {We propose Differentiable Temporal Logic (DTL), a model-agnostic framework that introduces temporal constraints to deep networks. DTL treats the outputs of a network as a truth assignment of a temporal logic formula, and computes a temporal logic loss reflecting the consistency between the output and the constraints. We propose a comprehensive set of constraints, which are implicit in data annotations, and incorporate them with deep networks via DTL. We evaluate the effectiveness of DTL on the temporal action segmentation task and observe improved performance and reduced logical errors in the output of different task models. Furthermore, we provide an extensive analysis to visualize the desirable effects of DTL.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}