
CRCV | Center for Research in Computer Vision
University of Central Florida
4328 Scorpius St.
HEC 245D
Orlando, FL 32816-2365
Phone: (407) 823-5077
Fax: (407) 823-0594
Email: shah@crcv.ucf.edu
2023
Hanif, Asif; Naseer, Muzammal; Khan, Salman; Shah, Mubarak; Khan, Fahad Shahbaz
Frequency Domain Adversarial Training for Robust Volumetric Medical Segmentation Conference
The 26th International Conference on Medical Image Computing and Computer Assisted Intervention, MICCAI 2023, 2023.
@conference{Hanif2023,
title = {Frequency Domain Adversarial Training for Robust Volumetric Medical Segmentation},
author = {Asif Hanif and Muzammal Naseer and Salman Khan and Mubarak Shah and Fahad Shahbaz Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Frequency-Domain-Adversarial-Training-for-Robust-Volumetric-Medical-Segmentation.pdf
https://github.com/asif-hanif/vafa},
doi = {https://doi.org/10.48550/arXiv.2307.07269},
year = {2023},
date = {2023-10-08},
publisher = {The 26th International Conference on Medical Image Computing and Computer Assisted Intervention, MICCAI 2023},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Sirnam, Swetha; Rizve, Mamshad Nayeem; Kuehne, Hilde; Shah, Mubarak
Preserving Modality Structure Improves Multi-Modal Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Sirnam2023,
title = {Preserving Modality Structure Improves Multi-Modal Learning },
author = {Swetha Sirnam and Mamshad Nayeem Rizve and Hilde Kuehne and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.13077.pdf
https://arxiv.org/abs/2308.13077
https://github.com/Swetha5/Multi_Sinkhorn_Knopp},
doi = {https://doi.org/10.48550/arXiv.2308.13077},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Self-supervised learning on large-scale multi-modal datasets allows learning semantically meaningful embeddings in a joint multi-modal representation space without relying on human annotations. These joint embeddings enable zero-shot cross-modal tasks like retrieval and classification. However, these methods often struggle to generalize well on out-of-domain data as they ignore the semantic structure present in modality-specific embeddings. In this context, we propose a novel Semantic-Structure-Preserving Consistency approach to improve generalizability by preserving the modality-specific relationships in the joint embedding space. To capture modality-specific semantic relationships between samples, we propose to learn multiple anchors and represent the multifaceted relationship between samples with respect to their relationship with these anchors. To assign multiple anchors to each sample, we propose a novel Multi-Assignment Sinkhorn-Knopp algorithm. Our experimentation demonstrates that our proposed approach learns semantically meaningful anchors in a self-supervised manner. Furthermore, our evaluation on MSR-VTT and YouCook2 datasets demonstrates that our proposed multi-anchor assignment based solution achieves state-of-the-art performance and generalizes to both in- and out-of-domain datasets. Code: https://github.com/Swetha5/Multi_Sinkhorn_Knopp},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
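The abstract above hinges on a balanced assignment of samples to multiple anchors. As a rough illustration of that idea only (not the authors' Multi-Assignment Sinkhorn-Knopp implementation), the sketch below runs standard Sinkhorn-Knopp normalization on a sample-to-anchor similarity matrix with row marginals of k, so each sample is softly assigned to roughly k anchors; all names, constants, and the fixed iteration count are assumptions.

# Minimal sketch: entropic (Sinkhorn-Knopp-style) balanced assignment of samples to
# multiple anchors. Row marginals allow k anchors per sample; column marginals keep
# anchor usage uniform. Names and the fixed number of iterations are illustrative.
import numpy as np

def sinkhorn_multi_assign(sim, k=3, eps=0.05, n_iters=50):
    # sim: (N, A) cosine similarities between N sample embeddings and A anchors
    Q = np.exp(sim / eps)                                        # positive kernel
    row_target = np.full(sim.shape[0], float(k))                 # each sample -> k anchors (soft)
    col_target = np.full(sim.shape[1], k * sim.shape[0] / sim.shape[1])  # uniform anchor load
    for _ in range(n_iters):
        Q *= (row_target / Q.sum(axis=1))[:, None]               # normalize rows
        Q *= (col_target / Q.sum(axis=0))[None, :]               # normalize columns
    return Q                                                     # soft multi-anchor assignment

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    feats = rng.normal(size=(8, 16)); anchors = rng.normal(size=(4, 16))
    feats /= np.linalg.norm(feats, axis=1, keepdims=True)
    anchors /= np.linalg.norm(anchors, axis=1, keepdims=True)
    Q = sinkhorn_multi_assign(feats @ anchors.T, k=2)
    print(Q.round(2), Q.sum(axis=1))                             # each row sums to ~2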
Shah, Mubarak; Xu, Chang
Diffusion Action Segmentation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Shah2023,
title = {Diffusion Action Segmentation},
author = {Mubarak Shah and Chang Xu},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.17959.pdf
https://arxiv.org/abs/2303.17959
https://finspire13.github.io/DiffAct-Project-Page/
https://github.com/Finspire13/DiffAct},
doi = {https://doi.org/10.48550/arXiv.2303.17959},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Temporal action segmentation is crucial for understanding long-form videos. Previous works on this task commonly adopt an iterative refinement paradigm by using multi-stage models. We propose a novel framework via denoising diffusion models, which nonetheless shares the same inherent spirit of such iterative refinement. In this framework, action predictions are iteratively generated from random noise with input video features as conditions. To enhance the modeling of three striking characteristics of human actions, including the position prior, the boundary ambiguity, and the relational dependency, we devise a unified masking strategy for the conditioning inputs in our framework. Extensive experiments on three benchmark datasets, i.e., GTEA, 50Salads, and Breakfast, are performed and the proposed method achieves superior or comparable results to state-of-the-art methods, showing the effectiveness of a generative approach for action segmentation.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
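For readers unfamiliar with the iterative-refinement view described in the abstract, the sketch below runs a generic DDPM-style reverse process over per-frame action logits with a stand-in conditioned denoiser. It is a schematic of the general technique, not the DiffAct model; every name, shape, and schedule here is an assumption.

# Minimal sketch of the iterative-refinement idea: DDPM-style sampling of per-frame
# action logits conditioned on video features. `denoise_fn` is a stand-in for the
# paper's conditioned network; schedules and shapes are illustrative only.
import numpy as np

T_STEPS, N_FRAMES, N_CLASSES = 50, 120, 11
betas = np.linspace(1e-4, 0.02, T_STEPS)
alphas = 1.0 - betas
alpha_bar = np.cumprod(alphas)

def denoise_fn(x_t, t, video_feats):
    # Stand-in epsilon predictor: a real model would attend to `video_feats`
    # (and to the paper's masked conditioning) to predict the added noise.
    return 0.1 * x_t + 0.01 * video_feats.mean(axis=1, keepdims=True)

def sample_action_logits(video_feats, rng):
    x = rng.normal(size=(N_FRAMES, N_CLASSES))               # start from pure noise
    for t in reversed(range(T_STEPS)):
        eps = denoise_fn(x, t, video_feats)
        coef = (1 - alphas[t]) / np.sqrt(1 - alpha_bar[t])
        mean = (x - coef * eps) / np.sqrt(alphas[t])
        noise = rng.normal(size=x.shape) if t > 0 else 0.0
        x = mean + np.sqrt(betas[t]) * noise
    return x.argmax(axis=1)                                   # frame-wise action labels

rng = np.random.default_rng(0)
print(sample_action_logits(rng.normal(size=(N_FRAMES, 64)), rng)[:20])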
Wasim, Syed Talal; Khattak, Muhammad Uzair; Naseer, Muzammal; Khan, Salman; Shah, Mubarak; Khan, Fahad Shahbaz
Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Wasim2023b,
title = {Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition },
author = {Syed Talal Wasim and Muhammad Uzair Khattak and Muzammal Naseer and Salman Khan and Mubarak Shah and Fahad Shahbaz Khan },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2307.06947.pdf
https://arxiv.org/abs/2307.06947
https://talalwasim.github.io/Video-FocalNets/
https://github.com/TalalWasim/Video-FocalNets
https://talalwasim.github.io/Video-FocalNets/#BibTeX},
doi = {https://doi.org/10.48550/arXiv.2307.06947},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Recent video recognition models utilize Transformer models for long-range spatio-temporal context modeling. Video transformer designs are based on self-attention that can model global context at a high computational cost. In comparison, convolutional designs for videos offer an efficient alternative but lack long-range dependency modeling. Towards achieving the best of both designs, this work proposes Video-FocalNet, an effective and efficient architecture for video recognition that models both local and global contexts. Video-FocalNet is based on a spatio-temporal focal modulation architecture that reverses the interaction and aggregation steps of self-attention for better efficiency. Further, the aggregation step and the interaction step are both implemented using efficient convolution and element-wise multiplication operations that are computationally less expensive than their self-attention counterparts on video representations. We extensively explore the design space of focal modulation-based spatio-temporal context modeling and demonstrate our parallel spatial and temporal encoding design to be the optimal choice. Video-FocalNets perform favorably well against the state-of-the-art transformer-based models for video recognition on three large-scale datasets (Kinetics-400, Kinetics-600, and SS-v2) at a lower computational cost. Our code/models are publicly released.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
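As a companion to the abstract, here is a rough spatial focal-modulation layer in the spirit of the aggregation-then-multiplication design it describes: hierarchical depth-wise convolutions gather context, which then modulates a query by element-wise multiplication. This is a simplified 2D sketch with toy sizes, not the released Video-FocalNet code.

# Minimal sketch of (spatial) focal modulation, the building block the paper extends
# to space-time: context is aggregated with hierarchical depth-wise convolutions and
# then injected into each query by element-wise multiplication.
import torch
import torch.nn as nn

class FocalModulation2D(nn.Module):
    def __init__(self, dim, focal_levels=2, kernel=3):
        super().__init__()
        self.q = nn.Conv2d(dim, dim, 1)
        self.gates = nn.Conv2d(dim, focal_levels + 1, 1)
        self.ctx_convs = nn.ModuleList([
            nn.Conv2d(dim, dim, kernel, padding=(kernel // 2) * (l + 1),
                      dilation=l + 1, groups=dim)             # depth-wise, growing receptive field
            for l in range(focal_levels)])
        self.proj = nn.Conv2d(dim, dim, 1)

    def forward(self, x):                                     # x: (B, C, H, W)
        q, gates = self.q(x), torch.sigmoid(self.gates(x))
        ctx_all, ctx = 0, x
        for l, conv in enumerate(self.ctx_convs):
            ctx = torch.nn.functional.gelu(conv(ctx))
            ctx_all = ctx_all + ctx * gates[:, l:l + 1]
        global_ctx = ctx.mean(dim=(2, 3), keepdim=True)       # coarsest (global) level
        ctx_all = ctx_all + global_ctx * gates[:, -1:]
        return self.proj(q * ctx_all)                         # modulation: query * aggregated context

x = torch.randn(2, 32, 14, 14)
print(FocalModulation2D(32)(x).shape)                         # torch.Size([2, 32, 14, 14])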
Samarasinghe, Sarinda; Rizve, Mamshad Nayeem; Kardan, Navid; Shah, Mubarak
CDFSL-V: Cross-Domain Few-Shot Learning for Videos Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Samarasinghe2023,
title = {CDFSL-V: Cross-Domain Few-Shot Learning for Videos},
author = {Sarinda Samarasinghe and Mamshad Nayeem Rizve and Navid Kardan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CDFSL_Video_Combined_Final.pdf
https://sarinda251.github.io/CDFSL-V-site/},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Few-shot video action recognition is an effective approach to recognizing new categories with only a few labeled examples, thereby reducing the challenges associated with collecting and annotating large-scale video datasets. Existing methods in video action recognition rely on large labeled datasets from the same domain. However, this setup is not realistic as novel categories may come from different data domains that may have different spatial and temporal characteristics. This dissimilarity between the source and target domains can pose a significant challenge, rendering traditional few-shot action recognition techniques ineffective. To address this issue, in this work, we propose a novel cross-domain few-shot video action recognition method that leverages self-supervised learning and curriculum learning to balance the information from the source and target domains. To be particular, our method employs a masked autoencoder-based self-supervised training objective to learn from both source and target data in a self-supervised manner. Then a progressive curriculum balances learning the discriminative information from the source dataset with the generic information learned from the target domain. Initially, our curriculum utilizes supervised learning to learn class discriminative features from the source data. As the training progresses, we transition to learning target-domain-specific features. We propose a progressive curriculum to encourage the emergence of rich features in the target domain based on class discriminative supervised features in the source domain. We evaluate our method on several challenging benchmark datasets and demonstrate that our approach outperforms existing cross-domain few-shot learning techniques.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
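The progressive curriculum described above can be pictured as a scheduled trade-off between a supervised source loss and a self-supervised target loss. The snippet below is only a cartoon of that schedule: a cosine decay and the loss stubs are assumptions, not the paper's exact pacing.

# Minimal sketch of the progressive-curriculum idea: over training, shift weight from
# the supervised source objective toward the target-domain (self-supervised) objective.
import math

def curriculum_weight(step, total_steps):
    # weight on the supervised source loss: 1.0 -> 0.0 over training (cosine decay)
    return 0.5 * (1 + math.cos(math.pi * step / total_steps))

def combined_loss(loss_source_sup, loss_target_ssl, step, total_steps):
    w = curriculum_weight(step, total_steps)
    return w * loss_source_sup + (1 - w) * loss_target_ssl

for step in (0, 2500, 5000, 7500, 10000):
    print(step, round(curriculum_weight(step, 10000), 3))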
Vahidian, Saeed; Kadaveru, Sreevatsank; Baek, Woonjoon; Wang, Weijia; Kungurtsev, Vyacheslav; Chen, Chen; Shah, Mubarak; Lin, Bill
When Do Curricula Work in Federated Learning? Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Vahidian2023b,
title = {When Do Curricula Work in Federated Learning? },
author = {Saeed Vahidian and Sreevatsank Kadaveru and Woonjoon Baek and Weijia Wang and Vyacheslav Kungurtsev and Chen Chen and Mubarak Shah and Bill Lin},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2212.12712.pdf
https://arxiv.org/abs/2212.12712},
doi = {https://doi.org/10.48550/arXiv.2212.12712},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {An oft-cited open problem of federated learning is the existence of data heterogeneity at the clients. One pathway to understanding the drastic accuracy drop in federated learning is by scrutinizing the behavior of the clients' deep models on data with different levels of "difficulty", which has been left unaddressed. In this paper, we investigate a different and rarely studied dimension of FL: ordered learning. Specifically, we aim to investigate how ordered learning principles can contribute to alleviating the heterogeneity effects in FL. We present theoretical analysis and conduct extensive empirical studies on the efficacy of orderings spanning three kinds of learning: curriculum, anti-curriculum, and random curriculum. We find that curriculum learning largely alleviates non-IIDness. Interestingly, the more disparate the data distributions across clients the more they benefit from ordered learning. We provide analysis explaining this phenomenon, specifically indicating how curriculum training appears to make the objective landscape progressively less convex, suggesting fast converging iterations at the beginning of the training procedure. We derive quantitative results of convergence for both convex and nonconvex objectives by modeling the curriculum training on federated devices as local SGD with locally biased stochastic gradients. Also, inspired by ordered learning, we propose a novel client selection technique that benefits from the real-world disparity in the clients. Our proposed approach to client selection has a synergic effect when applied together with ordered learning in FL.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
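To make the ordered-learning vocabulary concrete, the sketch below scores a client's samples by current loss and grows an easy-to-hard training subset; reversing the order gives anti-curriculum and shuffling gives a random curriculum. It illustrates the general recipe only and is not the paper's federated pipeline; all names and the pacing function are assumptions.

# Minimal sketch of loss-based ordered learning on one client: score each sample by
# the current model's loss, then feed a growing easy-to-hard subset as training
# proceeds (curriculum); anti-curriculum reverses the sort.
import numpy as np

def curriculum_order(losses, anti=False):
    order = np.argsort(losses)                 # easy (low loss) first
    return order[::-1] if anti else order

def pacing(step, total_steps, n_samples, start_frac=0.2):
    frac = start_frac + (1 - start_frac) * step / total_steps
    return max(1, int(frac * n_samples))       # how many of the easiest samples to use

losses = np.random.default_rng(0).random(10)
order = curriculum_order(losses)
for step in (0, 5, 10):
    k = pacing(step, 10, len(losses))
    print(step, order[:k])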
Fioresi, Joseph; Dave, Ishan; Shah, Mubarak
TeD-SPAD: Temporal Distinctiveness for Self-supervised Privacy-preservation for video Anomaly Detection Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Fioresi2023,
title = {TeD-SPAD: Temporal Distinctiveness for Self-supervised Privacy-preservation for video Anomaly Detection},
author = {Joseph Fioresi and Ishan Dave and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.11072.pdf
https://arxiv.org/abs/2308.11072
https://github.com/UCF-CRCV/TeD-SPAD
https://joefioresi718.github.io/TeD-SPAD_webpage/
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/TeDSPAD_ICCV_poster.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/5min_TeD-SPAD_Finalv3.mp4},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Kini, Jyoti; Fleischer, Sarah; Dave, Ishan; Shah, Mubarak
Ensemble Modeling for Multimodal Visual Action Recognition Workshop
22nd International Conference on Image Analysis and Processing Workshops - Multimodal Action Recognition on the MECCANO Dataset, 2023.
@workshop{Kini2023b,
title = {Ensemble Modeling for Multimodal Visual Action Recognition},
author = {Jyoti Kini and Sarah Fleischer and Ishan Dave and Mubarak Shah},
url = {https://arxiv.org/pdf/2308.05430.pdf
https://www.crcv.ucf.edu/research/projects/ensemble-modeling-for-multimodal-visual-action-recognition/},
year = {2023},
date = {2023-09-11},
urldate = {2023-09-11},
booktitle = {22nd International Conference on Image Analysis and Processing Workshops - Multimodal Action Recognition on the MECCANO Dataset},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
Wasim, Syed Talal; Naseer, Muzammal; Khan, Salman; Khan, Fahad; Shah, Mubarak
Vita-CLIP: Video and text adaptive CLIP via Multimodal Prompting Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Wasim2023,
title = {Vita-CLIP: Video and text adaptive CLIP via Multimodal Prompting},
author = {Syed Talal Wasim and Muzammal Naseer and Salman Khan and Fahad Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/vita_clip_video_and_text_adapt-Camera-ready-PDF.pdf
},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Adopting contrastive image-text pretrained models like CLIP towards video classification has gained attention due to its cost-effectiveness and competitive performance. However, recent works in this area face a trade-off. Finetuning the pretrained model to achieve strong supervised performance results in low zero-shot generalization. Similarly, freezing the backbone to retain zero-shot capability causes significant drop in supervised accuracy. Because of this,
recent works in literature typically train separate models for supervised and zero-shot action recognition. In this work, we propose a multimodal prompt learning scheme that works to balance the supervised and zero-shot performance under a single unified training. Our prompting approach on the vision side caters for three aspects: 1) Global video-level prompts to model the data distribution; 2) Local frame-level prompts to provide per-frame discriminative
conditioning; and 3) a summary prompt to extract a condensed video representation. Additionally, we define a prompting scheme on the text side to augment the textual context. Through this prompting scheme, we can achieve state-of-the-art zero-shot performance on Kinetics-600, HMDB51 and UCF101 while remaining competitive in the supervised setting. By keeping the pretrained backbone frozen, we optimize a much lower number of parameters and retain the existing general representation which helps achieve the strong zero-shot performance. Our codes and models will be publicly released. },
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
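The prompting scheme above keeps the backbone frozen and learns only prompt tokens. The toy example below shows that pattern on a generic transformer encoder: learnable tokens are prepended to the input sequence and are the only trainable parameters. The tiny encoder, the dimensions, and the use of the first prompt as a summary token are assumptions, not CLIP or the Vita-CLIP code.

# Minimal sketch of prompt tuning on a frozen backbone: learnable tokens are
# prepended to the (frozen) transformer's input sequence and are the only
# parameters updated.
import torch
import torch.nn as nn

class PromptedEncoder(nn.Module):
    def __init__(self, dim=64, n_prompts=4, n_heads=4):
        super().__init__()
        self.prompts = nn.Parameter(torch.randn(1, n_prompts, dim) * 0.02)  # learnable
        layer = nn.TransformerEncoderLayer(dim, n_heads, batch_first=True)
        self.backbone = nn.TransformerEncoder(layer, num_layers=2)
        for p in self.backbone.parameters():
            p.requires_grad = False                                          # frozen backbone

    def forward(self, tokens):                     # tokens: (B, N, dim) frame/patch tokens
        b = tokens.size(0)
        x = torch.cat([self.prompts.expand(b, -1, -1), tokens], dim=1)
        return self.backbone(x)[:, 0]              # use the first prompt as a summary token

model = PromptedEncoder()
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(trainable)                                   # only ['prompts']
print(model(torch.randn(2, 16, 64)).shape)         # torch.Size([2, 64])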
Urooj, Aisha; Kuehne, Hilde; Wu, Bo; Chheu, Kim; Bousselham, Walid; Gan, Chuang; Lobo, Niels; Shah, Mubarak
Learning Situation Hyper-Graphs for Video Question Answering Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Urooj2023,
title = {Learning Situation Hyper-Graphs for Video Question Answering},
author = {Aisha Urooj and Hilde Kuehne and Bo Wu and Kim Chheu and Walid Bousselham and Chuang Gan and Niels Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2023072364-4.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/SHG_VQA_CVPR2023_cam_ready_supp.pdf
},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Answering questions about complex situations in videos requires not only capturing the presence of actors, objects, and their relations but also the evolution of these relationships over time. A situation hyper-graph is a representation that describes situations as scene sub-graphs for video frames and hyper-edges for connected sub-graphs and has been proposed to capture all such information in a compact structured form. In this work, we propose an architecture for Video Question Answering (VQA) that enables answering questions related to video content by predicting situation hyper-graphs, coined Situation Hyper-Graph based Video Question Answering (SHG-VQA). To this end, we train a situation hyper-graph decoder to implicitly identify graph representations with actions and object/human-object relationships from the input video clip, and to use cross-attention
between the predicted situation hyper-graphs and the question embedding to predict the correct answer. The proposed
method is trained in an end-to-end manner and optimized by a VQA loss with the cross-entropy function and a Hungarian
matching loss for the situation graph prediction. The effectiveness of the proposed architecture is extensively evaluated
on two challenging benchmarks: AGQA and STAR. Our results show that learning the underlying situation hypergraphs
helps the system to significantly improve its performance for novel challenges of video question-answering tasks. },
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
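The Hungarian matching loss mentioned above pairs predicted graph elements with ground truth before a classification loss is computed on the matched pairs. A minimal sketch of that step, using scipy's linear_sum_assignment on an assumed negative-log-probability cost, is given below; shapes and the cost definition are illustrative rather than the SHG-VQA formulation.

# Minimal sketch of the Hungarian-matching step used for set-style prediction:
# predicted queries are matched one-to-one to ground-truth elements by minimizing a
# cost, then a loss is computed on the matched pairs.
import numpy as np
from scipy.optimize import linear_sum_assignment

def hungarian_match_loss(pred_logits, gt_classes):
    # pred_logits: (Q, C) query class logits; gt_classes: (G,) ground-truth class ids
    probs = np.exp(pred_logits) / np.exp(pred_logits).sum(axis=1, keepdims=True)
    cost = -probs[:, gt_classes]                          # (Q, G): cost of assigning query -> gt
    q_idx, g_idx = linear_sum_assignment(cost)
    matched_nll = -np.log(probs[q_idx, gt_classes[g_idx]] + 1e-9)
    return matched_nll.mean(), list(zip(q_idx.tolist(), g_idx.tolist()))

rng = np.random.default_rng(0)
loss, matches = hungarian_match_loss(rng.normal(size=(5, 8)), np.array([1, 3, 7]))
print(round(loss, 3), matches)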
Dave, Ishan Rajendrakumar; Rizve, Mamshad Nayeem; Chen, Chen; Shah, Mubarak
TimeBalance: Temporally-Invariant and Temporally-Distinctive Video Representations for Semi-Supervised Action Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Dave2023,
title = {TimeBalance: Temporally-Invariant and Temporally-Distinctive Video Representations for Semi-Supervised Action Recognition},
author = {Ishan Rajendrakumar Dave and Mamshad Nayeem Rizve and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/TimeBalance_CVPR23_arxiv.pdf
https://daveishan.github.io/timebalance_webpage/
https://github.com/DAVEISHAN/TimeBalance},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Semi-Supervised Learning can be more beneficial for the video domain compared to images because of its higher annotation
cost and dimensionality. Besides, any video understanding task requires reasoning over both spatial and temporal dimensions. In order to learn both the static and motion related features for the semi-supervised action recognition task, existing methods rely on hard input inductive biases like using two-modalities (RGB and Optical-flow) or two-stream of different playback rates.
Instead of utilizing unlabeled videos through diverse input streams, we rely on self-supervised video representations,
particularly, we utilize temporally-invariant and temporally-distinctive representations. We observe that these representations complement each other depending on the nature of the action. Based on this observation, we propose a student-teacher semi-supervised learning framework, TimeBalance, where we distill the knowledge from a temporally-invariant and a temporally-distinctive teacher. Depending on the nature of the unlabeled video, we dynamically combine the knowledge of these two teachers based on a novel temporal similarity-based reweighting scheme. Our method achieves state-of-the-art performance
on three action recognition benchmarks: UCF101, HMDB51, and Kinetics400. Code: https://github.com/DAVEISHAN/TimeBalance.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
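The dynamic combination of the two teachers can be written as a weighted distillation target. The sketch below implements a generic dual-teacher KL distillation loss; the per-video weight w is taken as an input here, whereas the paper derives it from a temporal-similarity-based reweighting scheme, so treat the code as a stand-in rather than the TimeBalance implementation.

# Minimal sketch of dual-teacher distillation with a dynamic weight: for each
# unlabeled clip, the student matches a convex combination of the two teachers'
# predictions.
import torch
import torch.nn.functional as F

def dual_teacher_distill_loss(student_logits, t_inv_logits, t_dis_logits, w, tau=2.0):
    # w: (B,) in [0, 1], per-video weight toward the temporally-invariant teacher
    p_teacher = (w[:, None] * F.softmax(t_inv_logits / tau, dim=1)
                 + (1 - w)[:, None] * F.softmax(t_dis_logits / tau, dim=1))
    log_p_student = F.log_softmax(student_logits / tau, dim=1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * tau ** 2

B, C = 4, 101
torch.manual_seed(0)
loss = dual_teacher_distill_loss(torch.randn(B, C), torch.randn(B, C),
                                 torch.randn(B, C), torch.rand(B))
print(loss.item())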
Zhu, Sijie; Yang, Linjie; Chen, Chen; Shah, Mubarak; Shen, Xiaohui; Wang, Heng
R2Former: Unified retrieval and ranking Transformer for Place Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Zhu2023,
title = {R2Former: Unified retrieval and ranking Transformer for Place Recognition},
author = {Sijie Zhu and Linjie Yang and Chen Chen and Mubarak Shah and Xiaohui Shen and Heng Wang},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CVPR_2023_PlaceRecognitionFinal.pdf
https://arxiv.org/pdf/2304.03410.pdf
https://github.com/Jeff-Zilence/R2Former},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Visual Place Recognition (VPR) estimates the location of query images by matching them with images in a reference database. Conventional methods generally adopt aggregated CNN features for global retrieval and RANSAC-based geometric verification for reranking. However, RANSAC only considers geometric information but ignores other possible information that could be useful for reranking, e.g. local feature correlation, and attention values. In this paper, we propose a unified place recognition framework that handles both retrieval and reranking with a novel transformer model, named R2Former. The proposed reranking module takes feature correlation, attention value, and xy coordinates into account, and learns to determine whether the image pair is from the same location. The whole pipeline is end-to-end trainable and the reranking module alone can
also be adopted on other CNN or transformer backbones as a generic component. Remarkably, R2Former significantly
outperforms state-of-the-art methods on major VPR datasets with much less inference time and memory consumption.
It also achieves the state-of-the-art on the holdout MSLS challenge set and could serve as a simple yet strong solution for real-world large-scale applications. Experiments also show vision transformer tokens are comparable and sometimes better than CNN local features on local matching. The code will be publicly available. },
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Bhunia, Ankan Kumar; Khan, Salman; Cholakkal, Hisham; Anwer, Rao Muhammad; Laaksonen, Jorma Tapio; Shah, Mubarak; Khan, Fahad
Person Image Synthesis via Denoising Diffusion Model Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Bhunia2023,
title = {Person Image Synthesis via Denoising Diffusion Model},
author = {Ankan Kumar Bhunia and Salman Khan and Hisham Cholakkal and Rao Muhammad Anwer and Jorma Tapio Laaksonen and Mubarak Shah and Fahad Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/person_image_synthesis_via_den-Camera-ready-PDF.pdf
https://lnkd.in/d-8v3r8B
https://lnkd.in/dGPTjvge
https://lnkd.in/dxcGQsUX
https://github.com/ankanbhunia/PIDM},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {The pose-guided person image generation task requires synthesizing photorealistic images of humans in arbitrary poses. The existing approaches use generative adversarial networks that do not necessarily maintain realistic textures or need dense correspondences that struggle to handle complex deformations and severe occlusions. In this work, we show how denoising diffusion models can be applied for high-fidelity person image synthesis with
strong sample diversity and enhanced mode coverage of the learnt data distribution. Our proposed Person Image Diffusion Model (PIDM) disintegrates the complex transfer problem into a series of simpler forward-backward denoising steps. This helps in learning plausible source-to-target transformation trajectories that result in faithful textures and undistorted appearance details. We introduce a ‘texture diffusion module’ based on cross-attention to accurately model the correspondences between appearance and pose information available in source and target images. Further, we propose ‘disentangled classifier-free guidance’ to ensure close resemblance between the conditional inputs and the synthesized output in terms of both pose and appearance information. Our extensive results on two large-scale benchmarks and a user study demonstrate the photorealism of our proposed approach under challenging scenarios. We also show how our generated images can help in downstream tasks. Code is available at https://github.com/ankanbhunia/PIDM.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
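Classifier-free guidance with separate condition weights, loosely in the spirit of the ‘disentangled classifier-free guidance’ named above, can be sketched as below. The stand-in eps_model and the guidance weights are assumptions; this is a generic multi-condition CFG combination, not PIDM's module.

# Minimal sketch of multi-condition classifier-free guidance: the final noise
# estimate starts from the unconditional prediction and adds separately weighted
# pose and appearance directions.
import numpy as np

def eps_model(x_t, pose=None, style=None):
    # Stand-in epsilon network; a real model conditions via cross-attention.
    base = 0.1 * x_t
    if pose is not None:
        base = base + 0.01 * pose
    if style is not None:
        base = base + 0.01 * style
    return base

def guided_eps(x_t, pose, style, w_pose=2.0, w_style=2.0):
    e_unc = eps_model(x_t)
    e_pose = eps_model(x_t, pose=pose)
    e_full = eps_model(x_t, pose=pose, style=style)
    # separately weighted pose and appearance guidance directions
    return e_unc + w_pose * (e_pose - e_unc) + w_style * (e_full - e_pose)

x = np.random.default_rng(0).normal(size=(3, 64, 64))
print(guided_eps(x, pose=np.ones_like(x), style=np.ones_like(x)).shape)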
Clark, Brandon Eric; Kerrigan, Alec; Kulkarni, Parth Parag; Cepeda, Vicente Vivanco; Shah, Mubarak
Where We Are and What We're Looking At: Query Based Worldwide Image Geo-localization Using Hierarchies and Scenes Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Clark2023,
title = {Where We Are and What We're Looking At: Query Based Worldwide Image Geo-localization Using Hierarchies and Scenes},
author = {Brandon Eric Clark and Alec Kerrigan and Parth Parag Kulkarni and Vicente Vivanco Cepeda and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Camera-Ready-Full-Paper.pdf
https://github.com/AHKerrigan/GeoGuessNet
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CVPR23-Poster_THU-PM-246-1.pdf
https://www.youtube.com/watch?v=fp3hZGbwPqk},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Determining the exact latitude and longitude that a photo was taken is a useful and widely applicable task, yet it remains exceptionally difficult despite the accelerated progress of other computer vision tasks. Most previous approaches have opted to learn single representations of query images, which are then classified at different levels of geographic granularity. These approaches fail to exploit the different visual cues that give context to different hierarchies, such as the country, state, and city level. To this end, we introduce an end-to-end transformer-based architecture that exploits the relationship between different geographic levels (which we refer to as hierarchies) and the corresponding visual scene information in an image through hierarchical cross-attention. We achieve this by learning a query for each geographic hierarchy and scene type. Furthermore, we learn a separate representation for different environmental scenes, as different scenes in the same location are often defined by completely different visual features. We achieve state of the art accuracy on 4 standard geo-localization datasets : Im2GPS, Im2GPS3k, YFCC4k, and YFCC26k, as well as qualitatively demonstrate how our method learns different representations for different visual hierarchies and scenes, which has not been demonstrated in the previous methods. Above previous testing datasets mostly consist of iconic landmarks or images taken from social media, which makes the dataset a simple memory task, or makes it biased towards certain places. To address this issue we introduce a much harder testing dataset, Google-World-Streets-15k, comprised of images taken from Google Streetview covering the whole planet and present state of the art results. Our code can be found at https://github.com/AHKerrigan/GeoGuessNet.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Rizve, Mamshad Nayeem; Mittal, Gaurav; Yu, Ye; Hall, Matthew; Sajeev, Sandra; Shah, Mubarak; Chen, Mei
PivoTAL: Prior-Driven Supervision for Weakly-Supervised Temporal Action Localization Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Rizve2023,
title = {PivoTAL: Prior-Driven Supervision for Weakly-Supervised Temporal Action Localization},
author = {Mamshad Nayeem Rizve and Gaurav Mittal and Ye Yu and Matthew Hall and Sandra Sajeev and Mubarak Shah and Mei Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR_2023.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR_2023_Supplemental_Material.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR2023_Poster.pdf
https://www.youtube.com/watch?v=6kAoQjXfzio},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Gupta, Rohit; Roy, Anirban; Kim, Sujeong; Christensen, Claire; Grindal, Todd; Gerard, Sarah Nixon; Cincebeaux, Madeline; Divakaran, Ajay; Shah, Mubarak
Class Prototypes based Contrastive Learning for Classifying Multi-Label and Fine-Grained Educational Videos Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Gupta2023b,
title = {Class Prototypes based Contrastive Learning for Classifying Multi-Label and Fine-Grained Educational Videos},
author = {Rohit Gupta and Anirban Roy and Sujeong Kim and Claire Christensen and Todd Grindal and Sarah Nixon Gerard and Madeline Cincebeaux and Ajay Divakaran and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Rohit_SRI_CVPR2023_Multi_Modal_Multi_Label_Contrastive_Learning_Camera_Ready-4.pdf
https://www.rohitg.xyz/MMContrast/
https://nusci.csl.sri.com/project/APPROVE},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {The recent growth in the consumption of online media by children during early childhood necessitates data-driven tools enabling educators to filter out appropriate educational content for young learners. This paper presents an approach for detecting educational content in online videos. We focus on two widely used educational content classes: literacy and math. For each class, we choose prominent codes (sub-classes) based on the Common Core Standards. For example, literacy codes include ‘letter names’, ‘letter sounds’, and math codes include ‘counting’, ‘sorting’. We pose this as a fine-grained multilabel classification problem as videos can contain multiple types of educational content and the content classes can get visually similar (e.g., ‘letter names’ vs ‘letter sounds’). We propose a novel class prototypes based supervised contrastive learning approach that can handle fine-grained samples associated with multiple labels. We learn a class prototype for each class and a loss function is employed to minimize the distances between a class prototype and the samples from the class. Similarly,
distances between a class prototype and the samples from other classes are maximized. As the alignment between visual
and audio cues are crucial for effective comprehension, we consider a multimodal transformer network to capture the interaction between visual and audio cues in videos while learning the embedding for videos. For evaluation, we present a dataset, APPROVE, employing educational videos from YouTube labeled with fine-grained education classes by education researchers. APPROVE consists of 193 hours of expert-annotated videos with 19 classes. The proposed approach outperforms strong baselines on APPROVE and other benchmarks such as Youtube-8M, and COIN. The dataset is available at https://nusci.csl.sri.com/project/APPROVE.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
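A prototype-per-class contrastive loss for multi-label samples, the core idea named in the abstract, can be sketched as follows. The softmax-over-prototypes formulation below is a generic version written for illustration; the paper's exact loss and its multimodal transformer are not reproduced here.

# Minimal sketch of a prototype-based contrastive objective for multi-label samples:
# each class has a learnable prototype, and a sample is pulled toward the prototypes
# of all of its labels and pushed away from the rest.
import torch
import torch.nn.functional as F

def prototype_contrastive_loss(embeddings, prototypes, label_matrix, tau=0.1):
    # embeddings: (B, D), prototypes: (C, D), label_matrix: (B, C) multi-hot
    z = F.normalize(embeddings, dim=1)
    p = F.normalize(prototypes, dim=1)
    logits = z @ p.t() / tau                              # (B, C)
    log_prob = F.log_softmax(logits, dim=1)
    pos_log_prob = (label_matrix * log_prob).sum(1) / label_matrix.sum(1).clamp(min=1)
    return -pos_log_prob.mean()

torch.manual_seed(0)
B, C, D = 8, 19, 128
labels = (torch.rand(B, C) < 0.15).float()
print(prototype_contrastive_loss(torch.randn(B, D), torch.randn(C, D), labels).item())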
Zheng, Ce; Wu, Wenhan; Chen, Chen; Yang, Taojiannan; Zhu, Sijie; Shen, Ju; Kehtarnavaz, Nasser; Shah, Mubarak
Deep Learning-Based Human Pose Estimation: A Survey Journal Article
In: ACM Computing Surveys, 2023.
@article{Zheng2023c,
title = {Deep Learning-Based Human Pose Estimation: A Survey},
author = {Ce Zheng and Wenhan Wu and Chen Chen and Taojiannan Yang and Sijie Zhu and Ju Shen and Nasser Kehtarnavaz and Mubarak Shah},
editor = {Albert Y H Zomaya},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/3603618.pdf
https://github.com/zczcwh/DL-HPE},
doi = {10.1145/3603618},
year = {2023},
date = {2023-06-09},
journal = {ACM Computing Surveys},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Kini, Jyoti; Mian, Ajmal; Shah, Mubarak
3DMODT: Attention-Guided Affinities for Joint Detection & Tracking in 3D Point Clouds Conference
IEEE International Conference on Robotics and Automation, 2023.
@conference{Kini2023,
title = {3DMODT: Attention-Guided Affinities for Joint Detection & Tracking in 3D Point Clouds},
author = {Jyoti Kini and Ajmal Mian and Mubarak Shah},
url = {https://arxiv.org/pdf/2211.00746.pdf},
year = {2023},
date = {2023-05-29},
urldate = {2023-05-29},
booktitle = {IEEE International Conference on Robotics and Automation},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Sangam, Tushar; Dave, Ishan Rajendrakumar; Sultani, Waqas; Shah, Mubarak
TransVisDrone: Spatio-Temporal Transformer for Vision-based Drone-to-Drone Detection in Aerial Videos Conference
IEEE International Conference on Robotics and Automation, 2023.
@conference{Sangam2023,
title = {TransVisDrone: Spatio-Temporal Transformer for Vision-based Drone-to-Drone Detection in Aerial Videos},
author = {Tushar Sangam and Ishan Rajendrakumar Dave and Waqas Sultani and Mubarak Shah},
url = {https://arxiv.org/pdf/2210.08423.pdf},
year = {2023},
date = {2023-05-29},
booktitle = {IEEE International Conference on Robotics and Automation},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Beetham, James; Kardan, Navid; Mian, Ajmal; Shah, Mubarak
Dual Student Networks for Data-Free Model Stealing Conference
Eleventh International Conference on Learning Representations (ICLR), 2023.
@conference{Beetham2023,
title = {Dual Student Networks for Data-Free Model Stealing},
author = {James Beetham and Navid Kardan and Ajmal Mian and Mubarak Shah},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
booktitle = {Eleventh International Conference on Learning Representations (ICLR)},
abstract = {Data-free model stealing aims to replicate a target model without direct access to either the training data or the target model. To accomplish this, existing methods use a generator to produce samples in order to train a student model to match the target model outputs. To this end, the two main challenges are estimating gradients of the target model without access to its parameters, and generating a diverse set of images that thoroughly explores the input space. We propose a Dual Student method where two students are symmetrically trained in order to provide the generator a criterion to generate samples that the two students disagree on. On one hand, disagreement on a sample implies at least one student has classified the sample incorrectly when compared with the target model. This push towards disagreeing samples implicitly encourages exploring a more diverse region of input space. On the other hand, our method utilizes gradients of student models to indirectly estimate gradients of the target model. We show that this novel training objective for the generator network is equivalent to optimizing a lower bound on the generator’s loss if we had access to the target model gradients. In other words, our method alters the standard data-free model stealing paradigm by substituting the target model with a separate student model, thereby creating a lower bound which can be directly optimized without additional target model queries or separate synthetic datasets. We show that our new optimization framework provides more accurate gradient estimation of the target model and better accuracies on benchmark classification datasets. Additionally, our approach balances improved query efficiency with training computation cost. Finally, we demonstrate that our method serves as a better proxy model for transfer-based adversarial attacks than existing data-free model stealing methods.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
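The generator criterion described above rewards samples on which the two students disagree. The fragment below shows one schematic generator update driven by an L1 disagreement between student softmax outputs; the toy MLPs, shapes, and single update step are assumptions, and the student/target training loops are omitted.

# Minimal sketch of the disagreement criterion: the generator is scored by how much
# two student networks disagree on its samples; only the generator update is shown.
import torch
import torch.nn as nn
import torch.nn.functional as F

def disagreement(student_a, student_b, x):
    return (F.softmax(student_a(x), 1) - F.softmax(student_b(x), 1)).abs().sum(1).mean()

torch.manual_seed(0)
mlp = lambda: nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 10))
s_a, s_b = mlp(), mlp()                                   # the two students
gen = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 32))
g_opt = torch.optim.Adam(gen.parameters(), lr=1e-3)

z = torch.randn(8, 16)
x_fake = gen(z)
g_loss = -disagreement(s_a, s_b, x_fake)                  # generator maximizes disagreement
g_opt.zero_grad(); g_loss.backward(); g_opt.step()
print(float(-g_loss))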
Yang, Peiyu; Akhtar, Naveed; Wen, Zeyi; Shah, Mubarak; Mian, Ajmal
Re-calibrating Feature Attributions for Model Interpretation Conference
Eleventh International Conference on Learning Representations (ICLR), notable top 25%, 2023.
@conference{Yang2023,
title = {Re-calibrating Feature Attributions for Model Interpretation},
author = {Peiyu Yang and Naveed Akhtar and Zeyi Wen and Mubarak Shah and Ajmal Mian},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
booktitle = {Eleventh International Conference on Learning Representations (ICLR), notable top 25%},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Barbalau, Antonio; Ionescu, Radu Tudor; Georgescu, Mariana-Iuliana; Dueholm, Jacob; Ramachandra, Bharathkumar; Nasrollahi, Kamal; Khan, Fahad Shahbaz; Moeslund, Thomas B.; Shah, Mubarak
SSMTL++: Revisiting Self-Supervised Multi-Task Learning for Video Anomaly Detection Journal Article
In: Computer Vision and Image Understanding, 2023.
@article{Barbalau2023,
title = {SSMTL++: Revisiting Self-Supervised Multi-Task Learning for Video Anomaly Detection},
author = {Antonio Barbalau and Radu Tudor Ionescu and Mariana-Iuliana Georgescu and Jacob Dueholm and Bharathkumar Ramachandra and Kamal Nasrollahi and Fahad Shahbaz Khan and Thomas B. Moeslund and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/SSMTL.pdf},
year = {2023},
date = {2023-02-11},
urldate = {2023-02-11},
journal = {Computer Vision and Image Understanding},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Vahidian, Saeed; Morafah, Mahdi; Wang, Weijia; Kungurtsev, Vyacheslav; Chen, Chen; Shah, Mubarak; Lin, Bill
Efficient Distribution Similarity Identification in Clustered Federated Learning via Principal Angles Between Client Data Subspaces Conference
37th AAAI Conference on Artificial Intelligence, 2023.
@conference{Vahidian2023,
title = {Efficient Distribution Similarity Identification in Clustered Federated Learning via Principal Angles Between Client Data Subspaces},
author = {Saeed Vahidian and Mahdi Morafah and Weijia Wang and Vyacheslav Kungurtsev and Chen Chen and Mubarak Shah and Bill Lin},
url = {https://arxiv.org/abs/2209.10526},
year = {2023},
date = {2023-02-07},
urldate = {2023-02-07},
publisher = {37th AAAI Conference on Artificial Intelligence},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Gupta, Rohit; Akhtar, Naveed; Mian, Ajmal; Shah, Mubarak
Contrastive Self-Supervised Learning Leads to Higher Adversarial Susceptibility Conference
37th AAAI Conference on Artificial Intelligence, 2023.
@conference{Gupta2023,
title = {Contrastive Self-Supervised Learning Leads to Higher Adversarial Susceptibility},
author = {Rohit Gupta and Naveed Akhtar and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2207.10862.pdf},
year = {2023},
date = {2023-02-07},
publisher = {37th AAAI Conference on Artificial Intelligence},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
2022
Xu, Ziwei; Rawat, Yogesh; Wong, Yongkang; Kankanhalli, Mohan; Shah, Mubarak
Don’t Pour Cereal into Coffee: Differentiable Temporal Logic for Temporal Action Segmentation Conference
36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022.
@conference{Xu2022,
title = {Don’t Pour Cereal into Coffee: Differentiable Temporal Logic for Temporal Action Segmentation},
author = {Ziwei Xu and Yogesh Rawat and Yongkang Wong and Mohan Kankanhalli and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ziwei_neurips2022.pdf
https://diff-tl.github.io/
https://github.com/ZiweiXU/DTL-action-segmentation},
year = {2022},
date = {2022-11-09},
urldate = {2022-11-09},
publisher = {36th Conference on Neural Information Processing Systems (NeurIPS 2022)},
abstract = {We propose Differentiable Temporal Logic (DTL), a model-agnostic framework that introduces temporal constraints to deep networks. DTL treats the outputs of a network as a truth assignment of a temporal logic formula, and computes a temporal logic loss reflecting the consistency between the output and the constraints. We propose a comprehensive set of constraints, which are implicit in data annotations, and incorporate them with deep networks via DTL. We evaluate the effectiveness of DTL on the temporal action segmentation task and observe improved performance and reduced logical errors in the output of different task models. Furthermore, we provide an extensive analysis to visualize the desirable effects of DTL.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
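One way to picture a differentiable temporal-logic loss is to treat per-frame class probabilities as soft truth values and penalize constraint violations with a smooth relaxation. The sketch below encodes a single ordering constraint this way; it is an illustrative relaxation with assumed names, not the paper's DTL constraint set or loss.

# Minimal sketch of a differentiable temporal-logic penalty: per-frame class
# probabilities act as soft truth values, and violations of "action A never occurs
# after action B has occurred" are penalized.
import torch

def soft_before_constraint(probs, a, b):
    # probs: (T, C) per-frame class probabilities
    p_a, p_b = probs[:, a], probs[:, b]
    b_seen = torch.cumsum(p_b, dim=0).clamp(max=1.0)   # soft "B has occurred by frame t"
    violation = p_a * b_seen                            # A firing after B counts as a violation
    return violation.mean()

torch.manual_seed(0)
logits = torch.randn(100, 5, requires_grad=True)
loss = soft_before_constraint(torch.softmax(logits, dim=1), a=0, b=1)
loss.backward()                                         # differentiable: gradients flow to the logits
print(loss.item())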
Khan, Aisha Urooj; Kuehne, Hilde; Gan, Chuang; Lobo, Niels Da Vitoria; Shah, Mubarak
Weakly Supervised Grounding for VQA in Vision-Language Transformers Conference
European Conference on Computer Vision, 2022.
@conference{Khan2022,
title = {Weakly Supervised Grounding for VQA in Vision-Language Transformers},
author = {Aisha Urooj Khan and Hilde Kuehne and Chuang Gan and Niels Da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1011.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1011-supp.pdf
https://github.com/aurooj/WSG-VQA-VLTransformers
https://youtu.be/dekmVb6lq3I},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {Transformers for visual-language representation learning have been getting a lot of interest and shown tremendous performance on visual question answering (VQA) and grounding. However, most systems that show good performance of those tasks still rely on pre-trained object
detectors during training, which limits their applicability to the object classes available for those detectors. To mitigate this limitation, this paper
focuses on the problem of weakly supervised grounding in the context of visual question answering in transformers. Our approach leverages
capsules by transforming each visual token into a capsule representation in the visual encoder; it then uses activations from language self-attention layers as a text-guided selection module to mask those capsules before they are forwarded to the next layer. We evaluate our approach on the challenging GQA as well as VQA-HAT dataset for VQA grounding. Our experiments show that: while removing the information of masked
objects from standard transformer architectures leads to a significant drop in performance, the integration of capsules significantly improves the grounding ability of such systems and provides new state-of-the-art results compared to other approaches in the field.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Rizve, Mamshad Nayeem; Kardan, Navid; Shah, Mubarak
Towards Realistic Semi-Supervised Learning Conference
European Conference on Computer Vision, 2022.
@conference{Rizve2022b,
title = {Towards Realistic Semi-Supervised Learning},
author = {Mamshad Nayeem Rizve and Navid Kardan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7402.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7402-supp.pdf
https://github.com/nayeemrizve/TRSSL
https://youtu.be/mE7GeQ35WyY},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {Deep learning is pushing the state-of-the-art in many computer
vision applications. However, it relies on large annotated data
repositories, and capturing the unconstrained nature of the real-world
data is yet to be solved. Semi-supervised learning (SSL) complements
the annotated training data with a large corpus of unlabeled data to
reduce annotation cost. The standard SSL approach assumes unlabeled
data are from the same distribution as annotated data. Recently, a more
realistic SSL problem, called open-world SSL, is introduced, where the
unannotated data might contain samples from unknown classes. In this paper, we propose a novel pseudo-label based approach to tackle SSL in
open-world setting. At the core of our method, we utilize sample uncertainty and incorporate prior knowledge about class distribution to generate reliable class-distribution-aware pseudo-labels for unlabeled data belonging to both known and unknown classes. Our extensive experimentation showcases the effectiveness of our approach on several benchmark datasets, where it substantially outperforms the existing state-of-the art on seven diverse datasets including CIFAR-100 (∼17%), ImageNet-100 (∼5%), and Tiny ImageNet (∼9%). We also highlight the flexibility of our approach in solving novel class discovery task, demonstrate its stability in dealing with imbalanced data, and complement our approach with a technique to estimate the number of novel classes. Code: https://github.com/nayeemrizve/TRSSL},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
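The class-distribution-aware pseudo-labelling idea can be approximated by keeping only low-entropy predictions while enforcing per-class quotas from an assumed prior. The routine below is a rough stand-in for that step (uniform prior, fixed entropy threshold, illustrative names), not the paper's uncertainty modelling or class-number estimation.

# Minimal sketch of uncertainty-aware, class-distribution-aware pseudo-labelling:
# keep confident (low-entropy) predictions on unlabeled data, subject to a per-class
# quota derived from an assumed class prior.
import numpy as np

def select_pseudo_labels(probs, prior=None, ent_thresh=0.5):
    # probs: (N, C) softmax outputs on unlabeled samples
    n, c = probs.shape
    prior = np.full(c, 1.0 / c) if prior is None else prior
    entropy = -(probs * np.log(probs + 1e-9)).sum(1)
    labels = probs.argmax(1)
    keep = []
    for cls in range(c):
        quota = int(prior[cls] * n)
        cand = np.where((labels == cls) & (entropy < ent_thresh))[0]
        keep.extend(cand[np.argsort(entropy[cand])][:quota].tolist())  # most confident first
    return np.array(sorted(keep), dtype=int), labels

rng = np.random.default_rng(0)
p = rng.dirichlet(np.ones(10) * 0.3, size=50)
idx, lab = select_pseudo_labels(p)
print(len(idx), lab[idx][:10])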
Rizve, Mamshad Nayeem; Kardan, Navid; Khan, Salman; Khan, Fahad Shahbaz; Shah, Mubarak
OpenLDN: Learning to Discover Novel Classes for Open-World Semi-Supervised Learning Conference
European Conference on Computer Vision, 2022.
@conference{Rizve2022,
title = {OpenLDN: Learning to Discover Novel Classes for Open-World Semi-Supervised Learning},
author = {Mamshad Nayeem Rizve and Navid Kardan and Salman Khan and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/6665.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/6665-supp.pdf
https://github.com/nayeemrizve/OpenLDN
https://youtu.be/p2lYqvklcjA},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {Semi-supervised learning (SSL) is one of the dominant approaches to address the annotation bottleneck of supervised learning. Recent SSL methods can effectively leverage a large repository of unlabeled data to improve performance while relying on a small set of labeled data. One common assumption in most SSL methods is that the labeled and unlabeled data are from the same data distribution. However, this is hardly the case in many real-world scenarios, which limits their applicability. In this work, instead, we attempt to solve the challenging open-world SSL problem that does not make such an assumption. In the open-world SSL problem, the objective is to recognize samples of known classes, and simultaneously detect and cluster samples belonging to novel classes present in unlabeled data. This work introduces OpenLDN that utilizes a pairwise similarity loss to discover novel classes. Using a bi-level optimization rule this pairwise similarity loss exploits the information available in the labeled set to implicitly cluster novel class samples, while simultaneously recognizing samples from known classes. After discovering novel classes, OpenLDN transforms the open-world SSL problem into a standard SSL problem to achieve additional performance gains using existing SSL methods. Our extensive experiments demonstrate that OpenLDN outperforms the current state-of-the-art methods on multiple popular classification benchmarks while providing a better accuracy/training time trade-off. Code: https://github.com/nayeemrizve/OpenLDN},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Vyas, Shruti; Chen, Chen; Shah, Mubarak
GAMa: Cross-view Video Geo-localization Conference
European Conference on Computer Vision, 2022.
@conference{Vyas2022,
title = {GAMa: Cross-view Video Geo-localization},
author = {Shruti Vyas and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1512.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1512-supp.pdf
https://youtu.be/KSHuer_VXJo},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {The existing work in cross-view geo-localization is based on images where a ground panorama is matched to an aerial image. In this work, we focus on ground videos instead of images which provides additional contextual cues which are important for this task. There are no existing datasets for this problem, therefore we propose GAMa dataset, a large-scale dataset with ground videos and corresponding aerial images. We also propose a novel approach to solve this problem. At clip-level, a short video clip is matched with corresponding aerial image and is later used to get video-level geo-localization of a long video. Moreover, we propose a hierarchical approach to further improve the clip-level geo-localization. On this challenging dataset, with unaligned images and limited field of view, our proposed method achieves a Top-1 recall rate of 19.4% and 45.1% @1.0mile. Code & dataset are available at this link.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
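The GAMa abstract above reports a distance-thresholded top-1 recall ("@1.0 mile"). Below is a hedged NumPy sketch of how such a metric is commonly computed; the exact evaluation protocol is defined in the paper, and the function name, embedding shapes, and mile-unit coordinates here are assumptions for illustration.

import numpy as np

def top1_recall_at_distance(query_emb, ref_emb, ref_coords, gt_coords,
                            threshold_miles=1.0):
    """query_emb: (Q, D), ref_emb: (R, D); ref_coords: (R, 2), gt_coords: (Q, 2) in miles."""
    q = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
    r = ref_emb / np.linalg.norm(ref_emb, axis=1, keepdims=True)
    best = (q @ r.T).argmax(axis=1)                       # top-1 aerial reference per query
    dists = np.linalg.norm(ref_coords[best] - gt_coords, axis=1)
    return float((dists <= threshold_miles).mean())

# toy usage with random data
rng = np.random.default_rng(0)
print(top1_recall_at_distance(rng.normal(size=(5, 64)), rng.normal(size=(20, 64)),
                              rng.uniform(0, 10, (20, 2)), rng.uniform(0, 10, (5, 2))))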
Kumar, Aakash; Kini, Jyoti; Mian, Ajmal; Shah, Mubarak
Self Supervised Learning for Multiple Object Tracking in 3D Point Clouds Conference
2022 IEEE/RSJ International Conference on Intelligent Robots and Systems, 2022.
@conference{Kumar2022,
title = {Self Supervised Learning for Multiple Object Tracking in 3D Point Clouds},
author = {Aakash Kumar and Jyoti Kini and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/camera_ready_paper.pdf},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {2022 IEEE/RSJ International Conference on Intelligent Robots and Systems},
abstract = {Multiple object tracking in 3D point clouds has applications in mobile robots and autonomous driving. This is a challenging problem due to the sparse nature of the point clouds and the added difficulty of annotation in 3D for supervised learning. To overcome these challenges, we propose a neural network architecture that learns effective object features and their affinities in a self supervised fashion for multiple object tracking in 3D point clouds captured with LiDAR sensors. For self supervision, we use two approaches. First, we generate two augmented LiDAR frames from a single real frame by applying translation, rotation and cutout to the objects. Second, we synthesize a LiDAR frame using CAD models or primitive geometric shapes and then apply the above three augmentations to them. Hence, the ground truth object locations and associations are known in both frames for self supervision. This removes the need to annotate object associations in real data, and additionally the need for training data collection and annotation for object detection in synthetic data. To the best of our knowledge, this is the first self supervised multiple object tracking method for 3D data. Our model achieves state-of-the-art results.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
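A hedged sketch of the self-supervision idea described in the abstract above: from one set of object point clouds, create two augmented "frames" with translation, rotation, and cutout, so ground-truth associations come for free. The parameter ranges and cutout rate are illustrative assumptions, not the paper's settings.

import numpy as np

def augment_object(points: np.ndarray, rng: np.random.Generator) -> np.ndarray:
    """points: (N, 3) object points in the object's local frame."""
    theta = rng.uniform(-np.pi / 8, np.pi / 8)            # small yaw rotation
    rot = np.array([[np.cos(theta), -np.sin(theta), 0],
                    [np.sin(theta),  np.cos(theta), 0],
                    [0, 0, 1]])
    shifted = points @ rot.T + rng.uniform(-0.5, 0.5, size=3)   # random translation
    keep = rng.random(len(shifted)) > 0.1                 # cutout: drop ~10% of points
    return shifted[keep]

def make_training_pair(objects, seed=0):
    """objects: list of (N_i, 3) arrays. Returns two frames plus known associations."""
    rng = np.random.default_rng(seed)
    frame_a = [augment_object(o, rng) for o in objects]
    frame_b = [augment_object(o, rng) for o in objects]
    associations = list(range(len(objects)))              # identity matching by construction
    return frame_a, frame_b, associations

objs = [np.random.randn(100, 3), np.random.randn(80, 3)]
fa, fb, assoc = make_training_pair(objs)
print(len(fa), len(fb), assoc)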
Pillai, Manu S; Bhattacharya, Abhijeet; Baweja, Tanmay; Gupta, Rohit; Shah, Mubarak
DEEPSAR: Vessel Detection In SAR Imagery With Noisy Labels Conference
IEEE International Conference on Image Processing, 2022.
BibTeX | Links:
@conference{Pillai2023,
title = {DEEPSAR: Vessel Detection In SAR Imagery With Noisy Labels},
author = {Manu S Pillai and Abhijeet Bhattacharya and Tanmay Baweja and Rohit Gupta and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICIP_Submission.pdf},
year = {2022},
date = {2022-10-16},
urldate = {2023-10-08},
publisher = {IEEE International Conference on Image Processing},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Kini, Jyoti; Shah, Mubarak
Tag-Based Attention Guided Bottom-Up Approach for Video Instance Segmentation Conference
26th International Conference on Pattern Recognition, 2022.
BibTeX | Links:
@conference{Kini2022b,
title = {Tag-Based Attention Guided Bottom-Up Approach for Video Instance Segmentation},
author = {Jyoti Kini and Mubarak Shah },
url = {https://arxiv.org/pdf/2204.10765.pdf},
year = {2022},
date = {2022-08-21},
urldate = {2022-08-21},
booktitle = {26th International Conference on Pattern Recognition},
issue = {arxiv:2204.10765},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Ristea, Nicolae-Catalin; Madan, Neelu; Ionescu, Radu Tudor; Nasrollahi, Kamal; Khan, Fahad Shahbaz; Moeslund, Thomas B.; Shah, Mubarak
Self-Supervised Predictive Convolutional Attentive Block for Anomaly Detection Conference
IEEE Computer Vision and Pattern Recognition, 2022.
BibTeX | Links:
@conference{nokey,
title = {Self-Supervised Predictive Convolutional Attentive Block for Anomaly Detection},
author = {Nicolae-Catalin Ristea and Neelu Madan and Radu Tudor Ionescu and Kamal Nasrollahi and Fahad Shahbaz Khan and Thomas B. Moeslund and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/SSPCAB_camera-arxiv.pdf},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Zhu, Sijie; Shah, Mubarak; Chen, Chen
TransGeo: Transformer Is All You Need for Cross-view Image Geo-localization Conference
IEEE Computer Vision and Pattern Recognition, 2022.
BibTeX | Links:
@conference{nokey,
title = {TransGeo: Transformer Is All You Need for Cross-view Image Geo-localization},
author = {Sijie Zhu and Mubarak Shah and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/11695.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/11695-supp.pdf},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Cao, Jiale; Pang, Yanwei; Anwer, Rao Muhammad; Cholakkal, Hisham; Xie, Jin; Shah, Mubarak; Khan, Fahad Shahbaz
PSTR: End-to-End One-Step Person Search With Transformers Conference
IEEE Computer Vision and Pattern Recognition, 2022.
@conference{nokey,
title = {PSTR: End-to-End One-Step Person Search With Transformers},
author = {Jiale Cao and Yanwei Pang and Rao Muhammad Anwer and Hisham Cholakkal and Jin Xie and Mubarak Shah and Fahad Shahbaz Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/05237-2.pdf
https://github.com/JialeCao001/PSTR},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {We propose a novel one-step transformer-based person search framework, PSTR, that jointly performs person detection and re-identification (re-id) in a single architecture. PSTR comprises a person search-specialized (PSS) module that contains a detection encoder-decoder for person detection along with a discriminative re-id decoder for person re-id. The discriminative re-id decoder utilizes a multi-level supervision scheme with a shared decoder for discriminative re-id feature learning and also comprises a part attention block to encode the relationship between different parts of a person. We further introduce a simple multi-scale scheme to support re-id across person instances at different scales. PSTR jointly achieves the diverse objectives of object-level recognition (detection) and instance-level matching (re-id). To the best of our knowledge, we are the first to propose an end-to-end one-step transformer-based person search framework. Experiments are performed on two popular benchmarks: CUHK-SYSU and PRW. Our extensive ablations reveal the merits of the proposed contributions. Further, the proposed PSTR sets a new state-of-the-art on both benchmarks. On the challenging PRW benchmark, PSTR achieves a mean average precision (mAP) score of 56.5%. The source code is available at https://github.com/JialeCao001/PSTR.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
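A minimal, hedged skeleton (PyTorch) in the spirit of the PSTR description above: a detection decoder and a re-id decoder operate on the same backbone memory, producing boxes, person scores, and re-id embeddings per query. This is a generic illustration under stated assumptions; the paper's PSS module, multi-level supervision, and part attention block are not reproduced here.

import torch
import torch.nn as nn

class OneStepPersonSearchHead(nn.Module):
    def __init__(self, d_model=256, nhead=8, num_queries=100, reid_dim=256):
        super().__init__()
        layer = nn.TransformerDecoderLayer(d_model, nhead, batch_first=True)
        self.det_decoder = nn.TransformerDecoder(layer, num_layers=3)
        self.reid_decoder = nn.TransformerDecoder(layer, num_layers=3)
        self.queries = nn.Embedding(num_queries, d_model)
        self.box_head = nn.Linear(d_model, 4)        # normalized cx, cy, w, h
        self.cls_head = nn.Linear(d_model, 2)        # person vs. background
        self.reid_head = nn.Linear(d_model, reid_dim)

    def forward(self, memory):                       # memory: (B, HW, d_model) backbone tokens
        q = self.queries.weight.unsqueeze(0).expand(memory.size(0), -1, -1)
        det = self.det_decoder(q, memory)
        reid = self.reid_decoder(det, memory)        # re-id decoder refines detection queries
        return self.box_head(det).sigmoid(), self.cls_head(det), self.reid_head(reid)

head = OneStepPersonSearchHead()
boxes, scores, embeds = head(torch.randn(2, 196, 256))
print(boxes.shape, scores.shape, embeds.shape)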
Gupta, Akshita; Narayan, Sanath; Joseph, K J; Khan, Salman; Khan, Fahad Shahbaz; Shah, Mubarak
OW-DETR: Open-world Detection Transformer Conference
IEEE Computer Vision and Pattern Recognition, 2022.
@conference{nokey,
title = {OW-DETR: Open-world Detection Transformer},
author = {Akshita Gupta and Sanath Narayan and K J Joseph and Salman Khan and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/03815.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/03815-supp.pdf
https://github.com/akshitac8/OW-DETR.},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Open-world object detection (OWOD) is a challenging computer vision problem, where the task is to detect a known set of object categories while simultaneously identifying unknown objects. Additionally, the model must incrementally learn new classes that become known in the next training episodes. Distinct from standard object detection, the OWOD setting poses significant challenges for generating quality candidate proposals on potentially unknown objects, separating the unknown objects from the background and detecting diverse unknown objects. Here, we introduce a novel end-to-end transformer-based framework, OW-DETR, for open-world object detection. The proposed OW-DETR comprises three dedicated components, namely attention-driven pseudo-labeling, novelty classification and objectness scoring, to explicitly address the aforementioned OWOD challenges. Our OW-DETR explicitly encodes multi-scale contextual information, possesses less inductive bias, enables knowledge transfer from known classes to the unknown class and can better discriminate between unknown objects and background. Comprehensive experiments are performed on two benchmarks: MS-COCO and PASCAL VOC. The extensive ablations reveal the merits of our proposed contributions. Further, our model outperforms the recently introduced OWOD approach, ORE, with absolute gains ranging from 1.8% to 3.3% in terms of unknown recall on MS-COCO. In the case of incremental object detection, OW-DETR outperforms the state-of-the-art for all settings on PASCAL VOC. Our code is available at https://github.com/akshitac8/OW-DETR.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
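A hedged sketch of attention-driven pseudo-labeling as summarized in the OW-DETR abstract above: among query proposals not matched to known-class ground truth, those with the highest average backbone activation inside their boxes are pseudo-labeled as "unknown". The scoring map, box format, and top-k value are illustrative assumptions.

import torch

def unknown_pseudo_labels(feature_map, boxes, matched_mask, top_k=5):
    """feature_map: (H, W) mean activation map; boxes: (N, 4) as x1, y1, x2, y2 in
    pixel coordinates; matched_mask: (N,) True where a query matches a known class."""
    H, W = feature_map.shape
    scores = []
    for (x1, y1, x2, y2) in boxes.round().long().tolist():
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(max(x2, x1 + 1), W), min(max(y2, y1 + 1), H)
        scores.append(feature_map[y1:y2, x1:x2].mean())   # average activation inside the box
    scores = torch.stack(scores)
    scores[matched_mask] = float("-inf")                  # never relabel known-class matches
    k = min(top_k, int((~matched_mask).sum()))
    return torch.topk(scores, k).indices                  # indices pseudo-labeled as unknown

fmap = torch.rand(64, 64)
boxes = torch.tensor([[0, 0, 16, 16], [10, 10, 40, 40], [30, 5, 60, 20]], dtype=torch.float)
print(unknown_pseudo_labels(fmap, boxes, torch.tensor([True, False, False]), top_k=1))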
Acsintoae, Andra; Florescu, Andrei; Georgescu, Mariana-Iuliana; Mare, Tudor; Sumedrea, Paul; Ionescu, Radu Tudor; Khan, Fahad Shahbaz; Shah, Mubarak
UBnormal: New Benchmark for Supervised Open-Set Video Anomaly Detection Conference
IEEE Computer Vision and Pattern Recognition, 2022.
BibTeX | Links:
@conference{nokey,
title = {UBnormal: New Benchmark for Supervised Open-Set Video Anomaly Detection},
author = {Andra Acsintoae and Andrei Florescu and Mariana-Iuliana Georgescu and Tudor Mare and Paul Sumedrea and Radu Tudor Ionescu and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/04315.pdf
https://github.com/lilygeorgescu/UBnormal},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Karim, Nazmul; Rizve, Mamshad Nayeem; Rahnavard, Nazanin; Mian, Ajmal; Shah, Mubarak
UNICON: Combating Label Noise Through Uniform Selection and Contrastive Learning Conference
IEEE Computer Vision and Pattern Recognition, 2022.
BibTeX | Links:
@conference{nokey,
title = {UNICON: Combating Label Noise Through Uniform Selection and Contrastive Learning},
author = {Nazmul Karim and Mamshad Nayeem Rizve and Nazanin Rahnavard and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/07363.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/07363-supp.pdf
https://github.com/nazmul-karim170/unicon-noisy-label},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Dave, Ishan Rajendrakumar; Chen, Chen; Shah, Mubarak
SPAct: Self-supervised Privacy Preservation for Action Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2022.
BibTeX | Links:
@conference{nokey,
title = {SPAct: Self-supervised Privacy Preservation for Action Recognition},
author = {Ishan Rajendrakumar Dave and Chen Chen and Mubarak Shah},
url = {https://arxiv.org/pdf/2203.15205.pdf
https://github.com/DAVEISHAN/SPAct
https://www.youtube.com/watch?v=_PAlMT7ozts},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Dave, Ishan; Gupta, Rohit; Rizve, Mamshad Nayeem; Shah, Mubarak
TCLR: Temporal Contrastive Learning for Video Representation Journal Article
In: Computer Vision and Image Understanding, vol. 219, pp. 103406, 2022.
@article{nokey,
title = {TCLR: Temporal Contrastive Learning for Video Representation},
author = {Ishan Dave and Rohit Gupta and Mamshad Nayeem Rizve and Mubarak Shah },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1-s2.0-S1077314222000376-main.pdf
https://github.com/DAVEISHAN/TCLR},
doi = {https://doi.org/10.1016/j.cviu.2022.103406},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
journal = {Computer Vision and Image Understanding},
volume = {219},
pages = {103406},
abstract = {Contrastive learning has nearly closed the gap between supervised and self-supervised learning of image representations, and has also been explored for videos. However, prior work on contrastive learning for video data has not explored the effect of explicitly encouraging the features to be distinct across the temporal dimension. We develop a new temporal contrastive learning framework consisting of two novel losses to improve upon existing contrastive self-supervised video representation learning methods. The local–local temporal contrastive loss adds the task of discriminating between non-overlapping clips from the same video, whereas the global–local temporal contrastive loss aims to discriminate between timesteps of the feature map of an input clip in order to increase the temporal diversity of the learned features. Our proposed temporal contrastive learning framework achieves significant improvement over the state-of-the-art results in various downstream video understanding tasks such as action recognition, limited-label action classification, and nearest-neighbor video retrieval on multiple video datasets and backbones. We also demonstrate significant improvement in fine-grained action classification for visually similar classes. With the commonly used 3D ResNet-18 architecture with UCF101 pretraining, we achieve 82.4% (+5.1% increase over the previous best) top-1 accuracy on UCF101 and 52.9% (+5.4% increase) on HMDB51 action classification, and 56.2% (+11.7% increase) Top-1 Recall on UCF101 nearest neighbor video retrieval. Code released at https://github.com/DAVEISHAN/TCLR.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
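A hedged sketch of the "local-local" temporal contrastive idea from the TCLR abstract above: two augmentations of the same clip act as positives, while non-overlapping clips from the same video serve as negatives, encouraging temporally distinct features. This is an InfoNCE-style simplification, not the exact TCLR losses; the temperature and shapes are assumptions.

import torch
import torch.nn.functional as F

def local_local_loss(anchor, positive, negatives, temperature=0.1):
    """anchor, positive: (B, D) features of two augmentations of the same clip;
    negatives: (B, K, D) features of non-overlapping clips from the same videos."""
    a = F.normalize(anchor, dim=-1)
    p = F.normalize(positive, dim=-1)
    n = F.normalize(negatives, dim=-1)
    pos = (a * p).sum(-1, keepdim=True) / temperature            # (B, 1)
    neg = torch.einsum("bd,bkd->bk", a, n) / temperature         # (B, K)
    logits = torch.cat([pos, neg], dim=1)
    labels = torch.zeros(len(a), dtype=torch.long)               # positive sits at index 0
    return F.cross_entropy(logits, labels)

print(local_local_loss(torch.randn(4, 128), torch.randn(4, 128), torch.randn(4, 3, 128)).item())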
Kini, Jyoti; Khan, Fahad Shahbaz; Khan, Salman; Shah, Mubarak
Self-Supervised Video Object Segmentation via Cutout Prediction and Tagging Technical Report
no. arXiv:2204.10846, 2022.
BibTeX | Links:
@techreport{Kini2022,
title = {Self-Supervised Video Object Segmentation via Cutout Prediction and Tagging},
author = {Jyoti Kini and Fahad Shahbaz Khan and Salman Khan and Mubarak Shah},
url = {https://arxiv.org/pdf/2204.10846.pdf},
year = {2022},
date = {2022-04-24},
urldate = {2022-04-24},
number = {arXiv:2204.10846},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
Modi, Rajat; Rana, Aayush Jung; Kumar, Akash; Tirupattur, Praveen; Vyas, Shruti; Rawat, Yogesh Singh; Shah, Mubarak
Video Action Detection: Analysing Limitations and Challenges Conference
IEEE Computer Vision and Pattern Recognition, 2022.
BibTeX | Links:
@conference{Modi2022,
title = {Video Action Detection: Analysing Limitations and Challenges},
author = {Rajat Modi and Aayush Jung Rana and Akash Kumar and Praveen Tirupattur and Shruti Vyas and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2204.07892.pdf},
year = {2022},
date = {2022-04-17},
urldate = {2022-04-17},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Pestana, Camilo; Akhtar, Naveed; Rahnavard, Nazanin; Shah, Mubarak; Mian, Ajmal
Transferable 3D Adversarial Textures using End-to-end Optimization Conference
IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), 2022.
@conference{Pestana2022,
title = {Transferable 3D Adversarial Textures using End-to-end Optimization},
author = {Camilo Pestana and Naveed Akhtar and Nazanin Rahnavard and Mubarak Shah and Ajmal Mian},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/WACV_2022_Deceptive_Textures-1.pdf},
doi = {10.1109/WACV51458.2022.00080},
year = {2022},
date = {2022-02-15},
urldate = {2022-02-15},
pages = {727-736},
publisher = {IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
abstract = {Deep visual models are known to be vulnerable to adversarial attacks. The last few years have seen numerous techniques to compute adversarial inputs for these models. However, there are still under-explored avenues in this critical research direction. Among those is the estimation of adversarial textures for 3D models in an end-to-end optimization scheme. In this paper, we propose such a scheme to generate adversarial textures for 3D models that are highly transferable and invariant to different camera views and lighting conditions. Our method makes use of neural rendering with explicit control over the model texture and background. We ensure transferability of the adversarial textures by employing an ensemble of robust and non-robust models. Our technique utilizes 3D models as a proxy to simulate closer to real-life conditions, in contrast to conventional use of 2D images for adversarial attacks. We show the efficacy of our method with extensive experiments.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
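A hedged sketch of end-to-end adversarial texture optimization as described in the abstract above: the texture is the only trainable parameter, it is rendered under varying views, and it is optimized to fool an ensemble of models. The render function and the tiny CNNs below are stand-ins (a differentiable placeholder, not a real neural renderer, and not the paper's robust/non-robust ensemble).

import torch
import torch.nn as nn
import torch.nn.functional as F

def render(texture, view_seed):                 # stand-in differentiable "renderer"
    g = torch.Generator().manual_seed(view_seed)
    shift = torch.randint(-4, 5, (2,), generator=g)
    return torch.roll(texture, shifts=tuple(shift.tolist()), dims=(-2, -1)).unsqueeze(0)

ensemble = [nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                          nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10))
            for _ in range(2)]
texture = torch.rand(3, 64, 64, requires_grad=True)
optimizer = torch.optim.Adam([texture], lr=0.01)
true_class = 3                                  # class the attacker wants the models to miss

for step in range(20):
    optimizer.zero_grad()
    loss = 0.0
    for view in range(4):                       # average over views for view invariance
        image = render(texture.clamp(0, 1), view)
        for model in ensemble:                  # transferability via an ensemble of models
            loss = loss - F.cross_entropy(model(image), torch.tensor([true_class]))
    loss.backward()
    optimizer.step()
print("final adversarial objective:", loss.item())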
Aafaq, Nayyer; Mian, Ajmal; Akhtar, Naveed; Liu, Wei; Shah, Mubarak
Dense Video Captioning with Early Linguistic Information Fusion Journal Article
In: IEEE Transactions on Multimedia, pp. 1-1, 2022.
@article{nokey,
title = {Dense Video Captioning with Early Linguistic Information Fusion},
author = {Nayyer Aafaq and Ajmal Mian and Naveed Akhtar and Wei Liu and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/IEEE_TMM_Nayyer_Jan_2022_Final_Version_Manuscript.pdf},
doi = {10.1109/TMM.2022.3146005},
year = {2022},
date = {2022-01-25},
urldate = {2022-01-25},
journal = {IEEE Transactions on Multimedia},
pages = {1-1},
abstract = {Dense captioning methods generally detect events in videos first and then generate captions for the individual events. Events are localized solely based on the visual cues while ignoring the associated linguistic information and context. Whereas end-to-end learning may implicitly take guidance from language, these methods still fall short of the power of explicit modeling. In this paper, we propose a Visual-Semantic Embedding (ViSE) Framework that models the word(s)-context distributional properties over the entire semantic space and computes weights for all the n-grams such that higher weights are assigned to the more informative n-grams. The weights are accounted for in learning distributed representations of all the captions to construct a semantic space. To perform the contextualization of visual information and the constructed semantic space in a supervised manner, we design a Visual-Semantic Joint Modeling Network (VSJM-Net). The learned ViSE embeddings are then temporally encoded with a Hierarchical Descriptor Transformer (HDT). For caption generation, we exploit a transformer architecture to decode the input embeddings into natural language descriptions. Experiments on the large-scale ActivityNet Captions dataset and the YouCook-II dataset demonstrate the efficacy of our method.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
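A hedged sketch of the general idea of weighting n-grams by informativeness and scoring captions with those weights, as the ViSE description above outlines. The inverse-frequency weighting over bigrams below is an assumption for illustration; the paper's word(s)-context distributional modeling is more involved.

from collections import Counter
import math

def ngram_weights(captions, n=2):
    """Assign higher weights to rarer (more informative) n-grams across a corpus."""
    counts = Counter()
    for cap in captions:
        tokens = cap.lower().split()
        counts.update(zip(*[tokens[i:] for i in range(n)]))
    total = sum(counts.values())
    return {g: math.log(total / c) for g, c in counts.items()}

def caption_score(caption, weights, n=2):
    """Sum of weights of the caption's n-grams: a crude informativeness score."""
    tokens = caption.lower().split()
    grams = list(zip(*[tokens[i:] for i in range(n)]))
    return sum(weights.get(g, 0.0) for g in grams)

corpus = ["a person slices an onion", "a person chops an onion", "a dog runs in a park"]
w = ngram_weights(corpus)
print(caption_score("a person slices an onion quickly", w))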
Kardan, Navid; Hill, Mitchell; Shah, Mubarak
Self-Joint Supervised Learning Conference
International Conference on Learning Representations (ICLR), 2022.
@conference{Kardan2022,
title = {Self-Joint Supervised Learning},
author = {Navid Kardan and Mitchell Hill and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Self_joint_ICLR-002.pdf
https://github.com/ndkn/Self-joint-Learning},
year = {2022},
date = {2022-01-20},
urldate = {2022-01-20},
publisher = {International Conference on Learning Representations (ICLR)},
abstract = {Supervised learning is a fundamental framework used to train machine learning systems. A supervised learning problem is often formulated using an i.i.d. assumption that restricts model attention to a single relevant signal at a time when predicting. This contrasts with the human ability to actively use related samples as reference when making decisions. We hypothesize that the restriction to a single signal for each prediction in the standard i.i.d. framework contributes to well-known drawbacks of supervised learning: making overconfident predictions and vulnerability to overfitting, adversarial attacks, and out-of-distribution data. To address these limitations, we propose a new supervised learning paradigm called self-joint learning that generalizes the standard approach by modeling the joint conditional distribution of two observed samples, where each sample is an image and its label. Rather than assuming samples are independent, our models explicitly learn the sample-to-sample relation of conditional independence. Our framework can naturally incorporate auxiliary unlabeled data to further improve the performance. Experiments on benchmark image datasets show our method offers significant improvement over standard supervised learning in terms of accuracy, robustness against adversarial attacks, out-of-distribution detection, and overconfidence mitigation. Code: github.com/ndkn/Self-joint-Learning},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
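A hedged sketch of the self-joint idea described above: instead of classifying one sample at a time, a model predicts the joint label of a pair of samples over the C x C product label space. The pairing strategy, encoder, and input size below are illustrative assumptions, not the paper's architecture.

import torch
import torch.nn as nn
import torch.nn.functional as F

class PairwiseJointClassifier(nn.Module):
    def __init__(self, num_classes=10, feat_dim=128):
        super().__init__()
        self.encoder = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, feat_dim), nn.ReLU())
        self.joint_head = nn.Linear(2 * feat_dim, num_classes * num_classes)

    def forward(self, x1, x2):
        z = torch.cat([self.encoder(x1), self.encoder(x2)], dim=1)
        return self.joint_head(z)                       # logits over the joint label space

def joint_label(y1, y2, num_classes=10):
    return y1 * num_classes + y2                        # index into the C x C label space

model = PairwiseJointClassifier()
x1, x2 = torch.randn(4, 1, 28, 28), torch.randn(4, 1, 28, 28)
y1, y2 = torch.randint(0, 10, (4,)), torch.randint(0, 10, (4,))
loss = F.cross_entropy(model(x1, x2), joint_label(y1, y2))
print(loss.item())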
Fioresi, Joseph; Colvin, Dylan J.; Frota, Rafaela; Gupta, Rohit; Li, Mengjie; Seigneur, Hubert P.; Vyas, Shruti; Oliveira, Sofia; Shah, Mubarak; Davis, Kristopher O.
Automated Defect Detection and Localization in Photovoltaic Cells Using Semantic Segmentation of Electroluminescence Images Journal Article
In: IEEE Journal of Photovoltaics, vol. 12, no. 1, pp. 53-61, 2022.
@article{Fioresi2022,
title = {Automated Defect Detection and Localization in Photovoltaic Cells Using Semantic Segmentation of Electroluminescence Images},
author = {Joseph Fioresi and Dylan J. Colvin and Rafaela Frota and Rohit Gupta and Mengjie Li and Hubert P. Seigneur and Shruti Vyas and Sofia Oliveira and Mubarak Shah and Kristopher O. Davis},
url = {https://ieeexplore.ieee.org/document/9650542},
doi = {10.1109/JPHOTOV.2021.3131059},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {IEEE Journal of Photovoltaics},
volume = {12},
number = {1},
pages = {53-61},
abstract = {In this article, we propose a deep learning based semantic segmentation model that identifies and segments defects in electroluminescence (EL) images of silicon photovoltaic (PV) cells. The proposed model can differentiate between cracks, contact interruptions, cell interconnect failures, and contact corrosion for both multicrystalline and monocrystalline silicon cells. Our model utilizes a segmentation Deeplabv3 model with a ResNet-50 backbone. It was trained on 17,064 EL images including 256 physically realistic simulated images of PV cells generated to deal with class imbalance. While performing semantic segmentation for five defect classes, this model achieves a weighted F1-score of 0.95, an unweighted F1-score of 0.69, a pixel-level global accuracy of 95.4%, and a mean intersection over union score of 57.3%. In addition, we introduce the UCF EL Defect dataset, a large-scale dataset consisting of 17,064 EL images, which will be publicly available for use by the PV and computer vision research communities.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
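A hedged sketch of the kind of setup the abstract above describes: a DeepLabv3 model with a ResNet-50 backbone for multi-class defect segmentation, plus a weighted pixel-level F1 computation. The class count, random stand-in tensors, and library calls (torchvision >= 0.13, scikit-learn) are assumptions for illustration, not the paper's training pipeline.

import torch
from torchvision.models.segmentation import deeplabv3_resnet50
from sklearn.metrics import f1_score

NUM_CLASSES = 5                                          # e.g. background plus defect types
model = deeplabv3_resnet50(weights=None, weights_backbone=None, num_classes=NUM_CLASSES)
model.eval()

with torch.no_grad():
    el_image = torch.rand(1, 3, 256, 256)               # stand-in for an EL image tensor
    pred = model(el_image)["out"].argmax(dim=1)         # (1, H, W) predicted class map

# weighted F1 over pixels (class weights proportional to ground-truth frequency)
gt = torch.randint(0, NUM_CLASSES, (1, 256, 256))       # stand-in ground-truth mask
weighted_f1 = f1_score(gt.flatten().numpy(), pred.flatten().numpy(),
                       average="weighted", zero_division=0)
print("weighted F1:", weighted_f1)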
2021
Kerrigan, Alec; Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
Reformulating Zero-shot Action Recognition for Multi-label Actions Conference
Thirty-fifth Conference on Neural Information Processing Systems, 2021.
BibTeX | Links:
@conference{Kerrigan2021,
title = {Reformulating Zero-shot Action Recognition for Multi-label Actions},
author = {Alec Kerrigan and Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/reformulating_zero_shot_action2.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ZSL-Supp.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Reformulating-Zero-shot-Action-Recognition-forMulti-label-Actions.pptx},
year = {2021},
date = {2021-12-06},
urldate = {2021-12-06},
booktitle = {Thirty-fifth Conference on Neural Information Processing Systems},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Lei, Huan; Akhtar, Naveed; Shah, Mubarak; Mian, Ajmal
Geometric Feature Learning for 3D Meshes Journal Article
In: arXiv, 2021.
@article{nokey,
title = {Geometric Feature Learning for 3D Meshes},
author = {Huan Lei and Naveed Akhtar and Mubarak Shah and Ajmal Mian},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2112.01801.pdf
https://github.com/EnyaHermite/Picasso},
year = {2021},
date = {2021-12-03},
journal = {arXiv},
abstract = {Geometric feature learning for 3D meshes is central to computer graphics and highly important for numerous vision applications. However, deep learning currently lags in hierarchical modeling of heterogeneous 3D meshes due to the lack of required operations and/or their efficient implementations. In this paper, we propose a series of modular operations for effective geometric deep learning over heterogeneous 3D meshes. These operations include mesh convolutions, (un)pooling and efficient mesh decimation. We provide open source implementation of these operations, collectively termed Picasso. The mesh decimation module of Picasso is GPU-accelerated, which can process a batch of meshes on-the-fly for deep learning. Our (un)pooling operations compute features for newly-created neurons across network layers of varying resolution. Our mesh convolutions include facet2vertex, vertex2facet, and facet2facet convolutions that exploit vMF mixture and Barycentric interpolation to incorporate fuzzy modelling. Leveraging the modular operations of Picasso, we contribute a novel hierarchical neural network, PicassoNet-II, to learn highly discriminative features from 3D meshes. PicassoNet-II accepts primitive geometrics and fine textures of mesh facets as input features, while processing full scene meshes. Our network achieves highly competitive performance for shape analysis and scene parsing on a variety of benchmarks. We release Picasso and PicassoNet-II on Github.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
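A hedged sketch of a facet2vertex-style aggregation like the mesh operations described above: each vertex pools features from its incident facets. This simple scatter-mean is a stand-in for Picasso's vMF-mixture/Barycentric formulation, which is not reproduced here.

import torch

def facet2vertex_mean(facet_features, faces, num_vertices):
    """facet_features: (F, C); faces: (F, 3) vertex indices per triangular facet."""
    C = facet_features.shape[1]
    vertex_sum = torch.zeros(num_vertices, C)
    vertex_count = torch.zeros(num_vertices, 1)
    for corner in range(3):                             # scatter each facet to its 3 corners
        idx = faces[:, corner]
        vertex_sum.index_add_(0, idx, facet_features)
        vertex_count.index_add_(0, idx, torch.ones(len(faces), 1))
    return vertex_sum / vertex_count.clamp(min=1)       # mean of incident facet features

faces = torch.tensor([[0, 1, 2], [2, 3, 0]])
feat = torch.randn(2, 16)
print(facet2vertex_mean(feat, faces, num_vertices=4).shape)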
Duarte, Kevin; Chen, Brian; Shvetsova, Nina; Rouditchenko, Andrew; Thomas, Samuel; Liu, Alexander; Harwath, David; Glass, James; Kuehne, Hilde; Shah, Mubarak
Routing with Self-Attention for Multimodal Capsule Networks Unpublished
arXiv preprint arXiv:2112.00775, 2021.
@unpublished{nokey,
title = {Routing with Self-Attention for Multimodal Capsule Networks},
author = {Kevin Duarte and Brian Chen and Nina Shvetsova and Andrew Rouditchenko and Samuel Thomas and Alexander Liu and David Harwath and James Glass and Hilde Kuehne and Mubarak Shah},
editor = {arXiv},
url = {https://arxiv.org/pdf/2112.00775.pdf
https://arxiv.org/abs/2112.00775},
doi = { https://doi.org/10.48550/arXiv.2112.00775},
year = {2021},
date = {2021-12-01},
urldate = {2021-12-01},
abstract = {The task of multimodal learning has seen a growing interest recently as it allows for training neural architectures based on different modalities such as vision, text, and audio. One challenge in training such models is that they need to jointly learn semantic concepts and their relationships across different input representations. Capsule networks have been shown to perform well in context of capturing the relation between low-level input features and higher-level concepts. However, capsules have so far mainly been used only in small-scale fully supervised settings due to the resource demand of conventional routing algorithms. We present a new multimodal capsule network that allows us to leverage the strength of capsules in the context of a multimodal learning framework on large amounts of video data. To adapt the capsules to large-scale input data, we propose a novel routing by self-attention mechanism that selects relevant capsules which are then used to generate a final joint multimodal feature representation. This allows not only for robust training with noisy video data, but also to scale up the size of the capsule network compared to traditional routing methods while still being computationally efficient. We evaluate the proposed architecture by pretraining it on a large-scale multimodal video dataset and applying it on four datasets in two challenging downstream tasks. Results show that the proposed multimodal capsule network is not only able to improve results compared to other routing techniques, but also achieves competitive performance on the task of multimodal learning.},
howpublished = {arXiv preprint arXiv:2112.00775},
keywords = {},
pubstate = {published},
tppubtype = {unpublished}
}
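A hedged sketch of routing by self-attention as summarized in the abstract above: capsule vectors attend to one another, and attention-weighted capsules are aggregated into a joint representation. The capsule dimension, head count, and aggregation step are illustrative assumptions.

import torch
import torch.nn as nn

class SelfAttentionRouting(nn.Module):
    def __init__(self, capsule_dim=64, num_heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(capsule_dim, num_heads, batch_first=True)
        self.score = nn.Linear(capsule_dim, 1)

    def forward(self, capsules):                         # capsules: (B, N, capsule_dim)
        routed, _ = self.attn(capsules, capsules, capsules)
        weights = torch.softmax(self.score(routed), dim=1)   # relevance of each capsule
        return (weights * routed).sum(dim=1)             # (B, capsule_dim) joint feature

router = SelfAttentionRouting()
video_caps = torch.randn(2, 8, 64)                       # e.g. capsules from one modality
print(router(video_caps).shape)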
Rajasegaran, Jathushan; Khan, Salman; Hayat, Munawar; Khan, Fahad Shahbaz; Shah, Mubarak
Self-supervised Knowledge Distillation for Few-shot Learning Conference
British Machine Vision Conference, Nov 22-25, 2021.
BibTeX | Links:
@conference{Rajasegaran2020,
title = {Self-supervised Knowledge Distillation for Few-shot Learning},
author = {Jathushan Rajasegaran and Salman Khan and Munawar Hayat and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Self-supervised-Knowledge-Distillation-for-Few-shot-Learning.pdf
https://bmvc2021-virtualconference.com/conference/papers/paper_0820.html
https://github.com/brjathu/SKD},
year = {2021},
date = {2021-11-22},
urldate = {2021-11-22},
booktitle = {British Machine Vision Conference, Nov 22-25},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
- Fellow, ACM (Association for Computing Machinery), 2021.
- Fellow, NAI (National Academy of Inventors), 2020.
- Fellow, AAAS (American Association for the Advancement of Science), 2010.
- Fellow, SPIE (Society of Photo-Optical Instrumentation Engineers), 2008.
- Fellow, IAPR (International Association for Pattern Recognition), 2006.
- Fellow, IEEE (Institute of Electrical and Electronics Engineers), 2003.
- ACM SIGMM award for Outstanding Technical Contributions to Multimedia Computing, Communications and Applications, 2019.
- Inducted to UCF Chapter of National Academy of Innovators, December 2017.
- UCF Luminary Award, October 2017.
- University Excellence in Research Award, 2017.
- Faculty Excellence in Mentoring Doctoral Students Award, 2017.
- Top 10% Paper Award at International Conference on Image Processing (ICIP-2014), 2014.
- 2nd place in the NHK "Where is beauty?" Grand Challenge at the ACM Multimedia 2013 conference.
- NGA (National Geospatial-Intelligence Agency) Best Research Poster Award at the NARP Symposium, 2013.
- University Distinguished Researcher award, 2012.
- College of Engineering & Computer Science Advisory Board Award for Faculty Excellence, 2011.
- Scholarship of Teaching and Learning (SoTL) award, 2011.
- Finalist for the Best Paper award, ACM Conference on Multimedia, 2010.
- ACM Distinguished Speaker (DSP), 2008-2014.
- University Distinguished Researcher award, 2007.
- Sindhi Association of North America award, 2007.
- Pegasus Professor, 2006.
- UCF Millionaires' Club, 2005, 2006, 2009-2013, 2015.
- Honorable mention, ICCV 2005 Where Am I? Challenge Problem.
- Finalist for the Best Paper award, ACM Conference on Multimedia, 2005.
- Research Incentive Award (RIA), 2003, 2009, 2014.
- Teaching Incentive Program (TIP) Award, 1996, 2003.
- IEEE Distinguished Visitors Program Speaker, 1997-2000.
- Engineering Achievement Award of Information Systems Division of Harris Corporation, 1999.
- IEEE Outstanding Engineering Educator Award, 1997.
- TOKTEN awards by UNDP, 1992, 1995, 2000.
- Philips International Institute Scholarship 1980.
- ACM Member
- ACM SIGMM Member
- IEEE Life Fellow
- IEEE Computer Society Member
- Where We Are and What We're Looking At, CVPR 2023, 18-22 June 2023
- Human Activity Recognition: Learning with Less Labels and Privacy Preservation, Keynote Talk at SPIE Automatic Target Recognition XXXII, 4-5 April 2022
- Overview of our Research 2022
- MMP Tracking Workshop Keynote - October 28, 2021
- 34 Years of Research Experience for Undergraduates in Computer Vision
- CVPR 2021 Tutorial: "Cross-View Geo-Localization: Ground-to-Aerial Image Matching"
- Current Funded Research at CRCV - June 2021
- Learning With Less Labels
- Adversarial Computer Vision
- Keynote (Person Re-Identification and Tracking in Multiple Non-Overlapping Cameras)
- SIGMM Technical Achievement Award 2019, Keynote Talk
- Capsule Networks for Computer Vision – CVPR 2019 Tutorial
- CAP6412 Advanced Computer Vision - Spring 2019
- Deep Learning
- CAP6412 Advanced Computer Vision - Spring 2018
- UCF Computer Vision Video Lectures 2014
- Multi-Object Tracking: Crowd Tracking and Group Action Recognition
- UCF Computer Vision Video Lectures 2012
- Have taught ten different courses at the graduate and undergraduate level, introduced a new honors course (co-taught with a Mathematics Professor), and directed numerous independent studies of undergraduate and graduate students;
- Have conducted seven short courses and tutorials in five different countries (Italy, US, Pakistan, Mexico, Taiwan) (http://www.cs.ucf.edu/vision/accv2000h-6.pdf);
- I have authored an unpublished book, Fundamentals of Computer Vision, which I use for my class; it is also available on the web: http://www.cs.ucf.edu/courses/cap6411/book.pdf.
- My pedagogical contributions are covered in four textbooks by popular authors: Computer Vision: Algorithms and Applications, Richard Szeliski; Computer and Robot Vision, Haralick and Shapiro; Introductory Techniques for 3-D Computer Vision, Trucco and Verri; and Computer Vision, Shapiro and Stockman.
- I have provided videos of my Computer Vision Lectures on YouTube, which have received close to one million views: https://www.youtube.com/playlist?list=PLd3hlSJsX_Imk_BPmB_H3AQjFKZS9XgZm
- CAP 6412 Advanced Computer Vision https://www.crcv.ucf.edu/courses/cap6412-spring-2019/
- CAP 5415 Computer Vision http://www.cs.ucf.edu/courses/cap6411/cap5415
- CAP 6411 Computer Vision Systems http://www.cs.ucf.edu/courses/cap6411/cap6411/fall02/cap6411_fall02.html
- COT 6505 Numerical Optimization http://www.cs.ucf.edu/courses/cap6411/cot6505/spring03/cot6505_sp03.html
- CAP 3930H Computer Vision Guided Tour of Mathematics
- CAP 6938 Special Topics: Mathematical Tools for Computer Vision
- CAP 4932 Intro Robot Vision
- COT 4110 Numerical Calculus
- COP 3400 Assembly Language
- COP 3402 Systems Concepts and Programming