2024
Arul, S. H.; Bedi, A. S.; Manocha, D.
When, What, and with Whom to Communicate: Enhancing RL-based Multi-Robot Navigation through Selective Communication Conference
IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2024.
Tags: IROS
@conference{Arul2024,
title = {When, What, and with Whom to Communicate: Enhancing RL-based Multi-Robot Navigation through Selective Communication},
author = {S. H. Arul and A. S. Bedi and D. Manocha},
year = {2024},
date = {2024-10-14},
publisher = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
Shek, C. L.; Wu, X.; Suttle, W. A.; Busart, C.; Zaroukian, E.; Manocha, D.; Tokekar, P.; Bedi, A. S.
LANCAR: Leveraging Language for Context-Aware Robot Locomotion in Unstructured Environments Conference
IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2024.
Tags: IROS | Links:
@conference{Shek2024,
title = {LANCAR: Leveraging Language for Context-Aware Robot Locomotion in Unstructured Environments},
author = {C. L. Shek and X. Wu and W. A. Suttle and C. Busart and E. Zaroukian and D. Manocha and P. Tokekar and A. S. Bedi},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2310.00481v2.pdf},
year = {2024},
date = {2024-10-14},
publisher = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
Sun, Xingpeng; Zhang, Yiran; Tang, Xindi; Bedi, Amrit Singh; Bera, Aniket
TrustNavGPT: Trust-Driven Audio-Guided Robot Navigation under Uncertainty with Large Language Models Conference
IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) [Oral], 2024.
Abstract | Tags: IROS | Links:
@conference{Sun2024b,
title = {TrustNavGPT: Trust-Driven Audio-Guided Robot Navigation under Uncertainty with Large Language Models},
author = {Xingpeng Sun and Yiran Zhang and Xindi Tang and Amrit Singh Bedi and Aniket Bera},
url = {https://xingpengsun0.github.io/trustnav/},
year = {2024},
date = {2024-10-14},
publisher = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) [Oral]},
abstract = {Large language models (LLMs) exhibit a wide range of promising capabilities -- from step-by-step planning to commonsense reasoning -- that provide utility for robot navigation. However, as humans communicate with robots in the real world, ambiguity and uncertainty may be embedded inside spoken instructions. While LLMs are proficient at processing text in human conversations, they often encounter difficulties with the nuances of verbal instructions and, thus, remain prone to hallucinate trust in human command. In this work, we present TrustNavGPT, an LLM-based audio-guided navigation agent that uses affective cues in spoken communication—elements such as tone and inflection that convey meaning beyond words—allowing it to assess the trustworthiness of human commands and make effective, safe decisions.},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
Kulbaka, Iliya; Dutta, Ayan; Kreidl, O. Patrick; Bölöni, Ladislau; Roy, Swapnoneel
GDM-Net: gas distribution mapping with a mobile robot using deep reinforcement learning and Gaussian process regression Conference
IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2024.
Tags: IROS
@conference{Kulbaka2024,
title = {GDM-Net: gas distribution mapping with a mobile robot using deep reinforcement learning and Gaussian process regression},
author = {Iliya Kulbaka and Ayan Dutta and O. Patrick Kreidl and Ladislau Bölöni and Swapnoneel Roy},
year = {2024},
date = {2024-10-14},
publisher = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
Kumar, Aakash; Chen, Chen; Mian, Ajmal; Lobo, Niels; Shah, Mubarak
Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR Data Conference
IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2024.
Abstract | Tags: IROS | Links:
@conference{Kumar2024,
title = {Sparse Points to Dense Clouds: Enhancing 3D Detection with Limited LiDAR Data},
author = {Aakash Kumar and Chen Chen and Ajmal Mian and Niels Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2404.06715v1.pdf
https://aakashjuseja-aj.github.io/Sparse_to_Dense/},
year = {2024},
date = {2024-10-14},
publisher = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
abstract = {3D detection is a critical task that enables machines to identify and locate objects in three-dimensional space. It has a broad range of applications in several fields, including autonomous driving, robotics and augmented reality. Monocular 3D detection is attractive as it requires only a single camera; however, it lacks the accuracy and robustness required for real world applications. High-resolution LiDAR, on the other hand, can be expensive and lead to interference problems in heavy traffic given their active transmissions. We propose a balanced approach that combines the advantages of monocular and point cloud-based 3D detection. Our method requires only a small number of 3D points that can be obtained from a low-cost, low-resolution sensor. Specifically, we use only 512 points, which is just 1% of a full LiDAR frame in the KITTI dataset. Our method reconstructs a complete 3D point cloud from this limited 3D information combined with a single image. The reconstructed 3D point cloud and corresponding image can be used by any multi-modal off-the-shelf detector for 3D object detection. By using the proposed network architecture with an off-the-shelf multi-modal 3D detector, the accuracy of 3D detection improves by 20% compared to the state-of-the-art monocular detection methods and 6% to 9% compared to the baseline multi-modal methods on KITTI and JackRabbot datasets.},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
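The abstract above works from roughly 512 LiDAR points (about 1% of a KITTI frame). As a rough illustration of how such a sparse input could be produced, here is a minimal NumPy sketch of farthest-point subsampling; the paper does not state its exact subsampling scheme, so the function name, the sampling strategy, and the toy point cloud are assumptions.

import numpy as np

def farthest_point_sample(points, n_samples=512):
    """Subsample an (N, 3) point cloud to n_samples points with farthest-point
    sampling, a common way to obtain a sparse but well-spread subset."""
    n = points.shape[0]
    selected = np.zeros(n_samples, dtype=int)
    dist = np.full(n, np.inf)
    selected[0] = np.random.randint(n)
    for i in range(1, n_samples):
        # squared distance of every point to the most recently selected point
        diff = points - points[selected[i - 1]]
        dist = np.minimum(dist, np.einsum("ij,ij->i", diff, diff))
        selected[i] = int(dist.argmax())  # farthest from the current selection
    return points[selected]

# Toy usage: reduce a made-up 50k-point frame to 512 points
sparse = farthest_point_sample(np.random.rand(50_000, 3), 512)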
Chhipa, Prakash Chandra; Chippa, Meenakshi Subhash; De, Kanjar; Saini, Rajkumar; Liwicki, Marcus; Shah, Mubarak
Möbius Transform for Mitigating Perspective Distortions in Representation Learning Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{nokey,
title = {Möbius Transform for Mitigating Perspective Distortions in Representation Learning},
author = {Prakash Chandra Chhipa and Meenakshi Subhash Chippa and Kanjar De and Rajkumar Saini and Marcus Liwicki and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/MPD_ECCV2024_CameraReady.pdf
https://prakashchhipa.github.io/projects/mpd},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Perspective distortion (PD) causes unprecedented changes in shape, size, orientation, angles, and other spatial relationships of visual concepts in images. Precisely estimating camera intrinsic and extrinsic parameters is a challenging task that prevents synthesizing perspective distortion. Non-availability of dedicated training data poses a critical barrier to developing robust computer vision methods. Additionally, distortion correction methods make other computer vision tasks a multi-step approach and lack performance. In this work, we propose
mitigating perspective distortion (MPD) by employing a fine-grained parameter control on a specific family of Möbius transform to model real-world distortion without estimating camera intrinsic and extrinsic parameters and without the need for actual distorted data. Also, we present a dedicated perspectively distorted benchmark dataset, ImageNet-PD, to benchmark the robustness of deep learning models against this new dataset. The proposed method outperforms existing benchmarks, ImageNet-E and ImageNet-X. Additionally, it significantly
improves performance on ImageNet-PD while consistently performing on standard data distribution. Notably, our method shows improved performance on three PD-affected real-world applications—crowd counting, fisheye image recognition, and person re-identification—and one PD-affected challenging CV task: object detection. The source code, dataset, and models are available on the project webpage at https://prakashchhipa.github.io/projects/mpd.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
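To make the core operation behind MPD concrete, the following is a minimal NumPy sketch that warps an image with a Möbius transform w = (az + b)/(cz + d) applied to normalized complex pixel coordinates. The coefficient values, the nearest-neighbour resampling, and the function name are illustrative assumptions; they are not the paper's fine-grained parameter control.

import numpy as np

def mobius_warp(img, a=1.0 + 0.0j, b=0.05 + 0.02j, c=0.1 - 0.05j, d=1.0 + 0.0j):
    """Warp an image with the Möbius transform w = (a z + b) / (c z + d),
    using inverse mapping and nearest-neighbour sampling."""
    h, w = img.shape[:2]
    ys, xs = np.mgrid[0:h, 0:w]
    # normalize output coordinates to [-1, 1] and treat them as complex numbers
    z_out = (xs - w / 2) / (w / 2) + 1j * (ys - h / 2) / (h / 2)
    # inverse Möbius transform gives the source coordinate for every output pixel
    z_in = (d * z_out - b) / (-c * z_out + a)
    src_x = np.clip((z_in.real * (w / 2) + w / 2).round().astype(int), 0, w - 1)
    src_y = np.clip((z_in.imag * (h / 2) + h / 2).round().astype(int), 0, h - 1)
    return img[src_y, src_x]

# Toy usage on a random "image"
warped = mobius_warp(np.random.rand(64, 64, 3))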
Kang, Weitai; Liu, Gaowen; Shah, Mubarak; Yan, Yan
SegVG: Transferring Object Bounding Box to Segmentation for Visual Grounding Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Kang2024,
title = {SegVG: Transferring Object Bounding Box to Segmentation for Visual Grounding},
author = {Weitai Kang and Gaowen Liu and Mubarak Shah and Yan Yan},
url = {https://arxiv.org/pdf/2407.03200},
doi = {https://doi.org/10.48550/arXiv.2407.03200},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Different from Object Detection, Visual Grounding deals with detecting a bounding box for each text-image pair. This one box for each text-image data provides sparse supervision signals. Although previous works achieve impressive results, their passive utilization of annotation, i.e. the sole use of the box annotation as regression ground truth, results in a suboptimal performance. In this paper, we present SegVG, a novel method that transfers the box-level annotation as Segmentation signals to provide additional pixel-level supervision for Visual Grounding. Specifically, we propose the Multi-layer Multi-task Encoder-Decoder as the target grounding stage, where we learn a regression query and multiple segmentation queries to ground the target by regression and segmentation of the box in each decoding layer, respectively. This approach allows us to iteratively exploit the annotation as signals for both box-level regression and pixel-level segmentation. Moreover, as the backbones are typically initialized by pretrained parameters learned from unimodal tasks and the queries for both regression and segmentation are static learnable embeddings, a domain discrepancy remains among these three types of features, which impairs subsequent target grounding. To mitigate this discrepancy, we introduce the Triple Alignment module, where the query, text, and vision tokens are triangularly updated to share the same space by a triple attention mechanism. Extensive experiments on five widely used datasets validate our state-of-the-art (SOTA) performance.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Dave, Ishan Rajendrakumar; Rizve, Mamshad Nayeem; Shah, Mubarak
FinePseudo: Improving Pseudo-Labelling through Temporal-Alignability for Semi-Supervised Fine-Grained Action Recognition Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Dave2024,
title = {FinePseudo: Improving Pseudo-Labelling through Temporal-Alignability for Semi-Supervised Fine-Grained Action Recognition},
author = {Ishan Rajendrakumar Dave and Mamshad Nayeem Rizve and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/finepsuedo_eccv24_dave.pdf
https://daveishan.github.io/finepsuedo-webpage/},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Real-life applications of action recognition often require a
fine-grained understanding of subtle movements, e.g., in sports analytics, user interactions in AR/VR, and surgical videos. Although fine-grained actions are more costly to annotate, existing semi-supervised action recognition has mainly focused on coarse-grained action recognition. Since fine-grained actions are more challenging due to the absence of scene bias, classifying these actions requires an understanding of action-phases. Hence, existing coarse-grained semi-supervised methods
do not work effectively. In this work, we for the first time thoroughly investigate semi-supervised fine-grained action recognition (FGAR). We observe that alignment distances like dynamic time warping (DTW) provide a suitable action-phase-aware measure for comparing fine-grained actions, a concept previously unexploited in FGAR. However, since regular DTW distance is pairwise and assumes strict alignment between pairs, it is not directly suitable for classifying fine-grained actions. To
utilize such alignment distances in a limited-label setting, we propose an Alignability-Verification-based Metric learning technique to effectively discriminate between fine-grained action pairs. Our learnable alignability score provides a better phase-aware measure, which we use to refine the pseudo-labels of the primary video encoder. Our collaborative pseudolabeling-based framework ‘FinePseudo’ significantly outperforms prior methods on four fine-grained action recognition datasets: Diving48, FineGym99, FineGym288, and FineDiving, and shows improvement on existing coarse-grained datasets: Kinetics400 and Something-SomethingV2. We also demonstrate the robustness of our collaborative pseudo-labeling in handling novel unlabeled classes in open-world semi-supervised setups.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
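The FinePseudo abstract leans on alignment distances such as dynamic time warping (DTW). For reference, here is a minimal NumPy sketch of the plain DTW distance between two per-frame feature sequences; the feature dimensions and toy inputs are made up, and the paper's learnable alignability score goes well beyond this pairwise distance.

import numpy as np

def dtw_distance(x, y):
    """Classic dynamic time warping cost between feature sequences
    x of shape (Tx, D) and y of shape (Ty, D); lower means the two
    sequences can be phase-aligned more cheaply."""
    tx, ty = len(x), len(y)
    cost = np.full((tx + 1, ty + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, tx + 1):
        for j in range(1, ty + 1):
            d = np.linalg.norm(x[i - 1] - y[j - 1])
            cost[i, j] = d + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])
    return cost[tx, ty]

# Toy usage with made-up per-frame embeddings (40 and 55 frames, 128-d features)
a = np.random.rand(40, 128)
b = np.random.rand(55, 128)
print(dtw_distance(a, b))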
Swetha, Sirnam; Yang, Jinyu; Neiman, Tal; Rizve, Mamshad Nayeem; Tran, Son; Yao, Benjamin; Chilimbi, Trishul; Shah, Mubarak
X-Former: Unifying Contrastive and Reconstruction Learning for MLLMs Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Swetha2024,
title = {X-Former: Unifying Contrastive and Reconstruction Learning for MLLMs},
author = {Sirnam Swetha and Jinyu Yang and Tal Neiman and Mamshad Nayeem Rizve and Son Tran and Benjamin Yao and Trishul Chilimbi and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2407.13851v1.pdf
https://arxiv.org/abs/2407.13851
https://swetha5.github.io/XFormer/},
doi = {https://doi.org/10.48550/arXiv.2407.13851},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Recent advancements in Multimodal Large Language Models (MLLMs) have revolutionized the field of vision-language understanding by integrating visual perception capabilities into Large Language Models (LLMs). The prevailing trend in this field involves the utilization of a vision encoder derived from vision-language contrastive learning (CL), showing expertise in capturing overall representations while facing difficulties in capturing detailed local patterns. In this work, we focus on enhancing the visual representations for MLLMs by combining high-frequency and detailed visual representations, obtained through masked image modeling (MIM), with semantically-enriched low-frequency representations captured by CL. To achieve this goal, we introduce X-Former which is a lightweight transformer module designed to exploit the complementary strengths of CL and MIM through an innovative interaction mechanism. Specifically, X-Former first bootstraps vision-language representation learning and multimodal-to-multimodal generative learning from two frozen vision encoders, i.e., CLIP-ViT (CL-based) and MAE-ViT (MIM-based). It further bootstraps vision-to-language generative learning from a frozen LLM to ensure visual features from X-Former can be interpreted by the LLM. To demonstrate the effectiveness of our approach, we assess its performance on tasks demanding detailed visual understanding. Extensive evaluations indicate that X-Former excels in visual reasoning tasks involving both structural and semantic categories in the GQA dataset. Assessment on fine-grained visual perception benchmark further confirms its superior capabilities in visual understanding.
},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Gupta, Rohit; Rizve, Mamshad Nayeem; Tawari, Ashish; Unnikrishnan, Jayakrishnan; Tran, Son; Shah, Mubarak; Yao, Benjamin; Chilimbi, Trishul
Open Vocabulary Multi-Label Video Classification Conference
2024.
Abstract | Tags: ECCV | Links:
@conference{Gupta2024,
title = {Open Vocabulary Multi-Label Video Classification},
author = {Rohit Gupta and Mamshad Nayeem Rizve and Ashish Tawari and Jayakrishnan Unnikrishnan and Son Tran and Mubarak Shah and Benjamin Yao and Trishul Chilimbi},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/OVMLVidCLS_ECCV_2024_CameraReady-2.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/OVMLVidCLS_ECCV_2024_Supplementary.pdf
https://arxiv.org/html/2407.09073v1#S1},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
abstract = {Pre-trained vision-language models (VLMs) have enabled significant progress in open vocabulary computer vision tasks such as image classification, object detection and image segmentation. Some recent works have focused on extending VLMs to open vocabulary single label action classification in videos. However, previous methods fall short in holistic video understanding which requires the ability to simultaneously recognize multiple actions and entities e.g., objects in the video in an open vocabulary setting. We formulate this problem as open vocabulary multi-label video classification and propose a method to adapt a pre-trained VLM such as CLIP to solve this task. We leverage large language models (LLMs) to provide semantic guidance to the VLM about class labels to improve its open vocabulary performance with two key contributions. First, we propose an end-to-end trainable architecture that learns to prompt an LLM to generate soft attributes for the CLIP text-encoder to enable it to recognize novel classes. Second, we integrate a temporal modeling module into CLIP’s vision encoder to effectively model the spatio-temporal dynamics of video concepts as well as propose a novel regularized finetuning technique to ensure strong open vocabulary classification performance in the video domain. Our extensive experimentation showcases the efficacy of our approach on multiple benchmark datasets.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Dave, Ishan Rajendrakumar; Heilbron, Fabian Caba; Shah, Mubarak; Jenni, Simon
Sync from the Sea: Retrieving Alignable Videos from Large-Scale Datasets Conference
The 18th European Conference on Computer Vision ECCV 2024, Oral (Top 3%), 2024.
Abstract | Tags: ECCV | Links:
@conference{Dave2024b,
title = {Sync from the Sea: Retrieving Alignable Videos from Large-Scale Datasets},
author = {Ishan Rajendrakumar Dave and Fabian Caba Heilbron and Mubarak Shah and Simon Jenni},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/avr_eccv24_dave.pdf
https://daveishan.github.io/avr-webpage/},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024, Oral (Top 3%)},
abstract = {Temporal video alignment aims to synchronize the key events
like object interactions or action phase transitions in two videos. Such methods could benefit various video editing, processing, and understanding tasks. However, existing approaches operate under the restrictive assumption that a suitable video pair for alignment is given, significantly limiting their broader applicability. To address this, we re-pose temporal alignment as a search problem and introduce the task of Alignable Video Retrieval (AVR). Given a query video, our approach can identify well-alignable videos from a large collection of clips and temporally synchronize them to the query. To achieve this, we make three key contributions: 1) we introduce DRAQ, a video alignability indicator to identify and re-rank the best alignable video from a set of candidates; 2) we propose an effective and generalizable frame-level video feature design to improve the alignment performance of several off-the-shelf feature representations, and 3) we propose a novel benchmark and evaluation protocol for AVR using cycle-consistency metrics. Our experiments on 3 datasets, including large-scale Kinetics700, demonstrate the effectiveness of our approach in identifying alignable video pairs from diverse datasets. },
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Yang, Peiyu; Akhtar, Naveed; Shah, Mubarak; Mian, Ajmal
Regulating Model Reliance on Non-Robust Features by Smoothing Input Marginal Density Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Tags: ECCV | Links:
@conference{Yang2024,
title = {Regulating Model Reliance on Non-Robust Features by Smoothing Input Marginal Density},
author = {Peiyu Yang and Naveed Akhtar and Mubarak Shah and Ajmal Mian},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/main_paper-1.pdf
https://arxiv.org/pdf/2407.04370
https://github.com/ypeiyu/input_density_reg},
year = {2024},
date = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Tai, Kai Sheng; Chen, Sirius; Shukla, Satya Narayan; Yu, Hanchao; Torr, Philip; Tian, Taipeng; Lim, Ser-Nam
uCAP: An Unsupervised Prompting Method for Vision-Language Models Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV
@conference{Tai2024,
title = {uCAP: An Unsupervised Prompting Method for Vision-Language Models},
author = {Kai Sheng Tai and Sirius Chen and Satya Narayan Shukla and Hanchao Yu and Philip Torr and Taipeng Tian and Ser-Nam Lim},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {This paper addresses a significant limitation that prevents Contrastive Language-Image Pretrained Models (CLIP) from achieving optimal performance on downstream image classification tasks. The key problem with CLIP-style zero-shot classification is that it requires domain-specific context in the form of prompts to better align the class descriptions to the downstream data distribution. In particular, prompts for vision-language models are domain-level texts (e.g., “a centered satellite image of ...”) which, together with the class names, are fed into the text encoder to provide more context for the downstream dataset. These prompts are typically manually tuned, which is time consuming and often sub-optimal. To overcome this bottleneck, this paper proposes uCAP, a method to automatically learn domain-specific prompts/contexts using only unlabeled in-domain images. We achieve this by modeling the generation of images given the class names and a domain-specific prompt with an unsupervised likelihood distribution, and then performing inference of the prompts. We validate the proposed method across various models and datasets, showing that uCAP consistently outperforms manually tuned prompts and related baselines on the evaluated datasets: ImageNet, CIFAR-10, CIFAR-100, OxfordPets (up to 2%), SUN397 (up to 5%), and Caltech101 (up to 3%).},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Chen, Hao; Xie, Saining; Lim, Ser-Nam; Shrivastava, Abhinav
Fast Encoding and Decoding for Implicit Video Representation Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Tags: ECCV
@conference{Chen2024b,
title = {Fast Encoding and Decoding for Implicit Video Representation},
author = {Hao Chen and Saining Xie and Ser-Nam Lim and Abhinav Shrivastava},
year = {2024},
date = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Jang, Young Kyun; Huynh, Dat; Shah, Ashish; Chen, Wen-Kai; Lim, Ser-Nam
Spherical Linear Interpolation and Text-Anchoring for Zero-shot Composed Retrieval Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Tags: ECCV
@conference{Jang2024,
title = {Spherical Linear Interpolation and Text-Anchoring for Zero-shot Composed Retrieval},
author = {Young Kyun Jang and Dat Huynh and Ashish Shah and Wen-Kai Chen and Ser-Nam Lim},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Sun, Guangyu; Mendieta, Matias; Dutta, Aritra; Li, Xin; Chen, Chen
Towards Multi-modal Transformers in Federated Learning Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Tags: ECCV | Links:
@conference{Sun2024,
title = {Towards Multi-modal Transformers in Federated Learning},
author = {Guangyu Sun and Matias Mendieta and Aritra Dutta and Xin Li and Chen Chen},
url = {https://arxiv.org/pdf/2404.12467.pdf
https://github.com/imguangyu/FedCola},
year = {2024},
date = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Wang, Zhaoning; Li, Ming; Chen, Chen
LucidDreaming: Controllable Object-Centric 3D Generation Workshop
The 18th European Conference on Computer Vision (ECCV) Workshop on Computer Vision For Videogames (CV2) 2024, 2024.
@workshop{Wang2024b,
title = {LucidDreaming: Controllable Object-Centric 3D Generation},
author = {Zhaoning Wang and Ming Li and Chen Chen},
url = {https://arxiv.org/pdf/2312.00588.pdf
https://www.zhaoningwang.com/LucidDreaming},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision (ECCV) Workshop on Computer Vision For Videogames (CV2) 2024},
keywords = {},
pubstate = {published},
tppubtype = {workshop}
}
Li, Ming; Yang, Taojiannan; Kuang, Huafeng; Wu, Jie; Wang, Zhaoning; Xiao, Xuefeng; Chen, Chen
ControlNet++: Improving Conditional Controls with Efficient Consistency Feedback Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Li2024,
title = {ControlNet++: Improving Conditional Controls with Efficient Consistency Feedback},
author = {Ming Li and Taojiannan Yang and Huafeng Kuang and Jie Wu and Zhaoning Wang and Xuefeng Xiao and Chen Chen},
url = {https://arxiv.org/pdf/2404.07987.pdf
https://liming-ai.github.io/ControlNet_Plus_Plus/
https://github.com/liming-ai/ControlNet_Plus_Plus
https://huggingface.co/spaces/limingcv/ControlNet-Plus-Plus},
year = {2024},
date = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {To enhance the controllability of text-to-image diffusion models, existing efforts like ControlNet incorporated image-based conditional controls. In this paper, we reveal that existing methods still face significant challenges in generating images that align with the image conditional controls. To this end, we propose ControlNet++, a novel approach that improves controllable generation by explicitly optimizing pixel-level cycle consistency between generated images and conditional controls. Specifically, for an input conditional control, we use a pre-trained discriminative reward model to extract the corresponding condition of the generated images, and then optimize the consistency loss between the input conditional control and extracted condition. A straightforward implementation would be generating images from random noises and then calculating the consistency loss, but such an approach requires storing gradients for multiple sampling timesteps, leading to considerable time and memory costs. To address this, we introduce an efficient reward strategy that deliberately disturbs the input images by adding noise, and then uses the single-step denoised images for reward fine-tuning. This avoids the extensive costs associated with image sampling, allowing for more efficient reward fine-tuning. Extensive experiments show that ControlNet++ significantly improves controllability under various conditional controls. For example, it achieves improvements over ControlNet by 7.9% mIoU, 13.4% SSIM, and 7.6% RMSE, respectively, for segmentation mask, line-art edge, and depth conditions.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
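A hedged sketch of the cycle-consistency idea described in the ControlNet++ abstract: add noise to an image, form a single-step estimate of the clean image from the diffusion model's noise prediction, run a discriminative reward model on that estimate, and penalize disagreement with the input condition. The generator and reward_model interfaces, the MSE loss, and the toy callables in the usage lines are assumptions for illustration, not the authors' implementation.

import torch
import torch.nn.functional as F

def consistency_loss(generator, reward_model, image, condition, alphas_cumprod, t):
    """Single-step denoising followed by a condition-consistency penalty (sketch)."""
    noise = torch.randn_like(image)
    a_t = alphas_cumprod[t].view(-1, 1, 1, 1)
    noisy = a_t.sqrt() * image + (1 - a_t).sqrt() * noise        # forward diffusion at step t
    eps_hat = generator(noisy, t, condition)                      # predicted noise
    x0_hat = (noisy - (1 - a_t).sqrt() * eps_hat) / a_t.sqrt()    # one-step estimate of the clean image
    cond_hat = reward_model(x0_hat)                                # condition recovered from the estimate
    return F.mse_loss(cond_hat, condition)                         # consistency with the input condition

# Toy usage with stand-in callables (illustrative only)
imgs = torch.randn(2, 3, 64, 64)
cond = torch.rand(2, 1, 64, 64)
acp = torch.linspace(0.999, 0.01, 1000)
t = torch.randint(0, 1000, (2,))
gen = lambda x, step, c: torch.zeros_like(x)   # pretend noise predictor
rew = lambda x: x.mean(dim=1, keepdim=True)    # pretend condition extractor (e.g. a tiny seg head)
loss = consistency_loss(gen, rew, imgs, cond, acp, t)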
Pinyoanuntapong, Ekkasit; Saleem, Muhammad Usama; Wang, Pu; Lee, Minwoo; Das, Srijan; Chen, Chen
BAMM: Bidirectional Autoregressive Motion Model Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Pinyoanuntapong2024b,
title = {BAMM: Bidirectional Autoregressive Motion Model},
author = {Ekkasit Pinyoanuntapong and Muhammad Usama Saleem and Pu Wang and Minwoo Lee and Srijan Das and Chen Chen},
url = {https://arxiv.org/pdf/2403.19435.pdf
https://exitudio.github.io/BAMM-page/},
year = {2024},
date = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Generating human motion from text has been dominated by denoising motion models either through diffusion or generative masking process. However, these models face great limitations in usability by requiring prior knowledge of the motion length. Conversely, autoregressive motion models address this limitation by adaptively predicting motion endpoints, at the cost of degraded generation quality and editing capabilities. To address these challenges, we propose Bidirectional Autoregressive Motion Model (BAMM), a novel text-to-motion generation framework. BAMM consists of two key components: (1) a motion tokenizer that transforms 3D human motion into discrete tokens in latent space, and (2) a masked self-attention transformer that autoregressively predicts randomly masked tokens via a hybrid attention masking strategy. By unifying generative masked modeling and autoregressive modeling, BAMM captures rich and bidirectional dependencies among motion tokens, while learning the probabilistic mapping from textual inputs to motion outputs with dynamically-adjusted motion sequence length. This feature enables BAMM to simultaneously achieve high-quality motion generation with enhanced usability and built-in motion editability. Extensive experiments on HumanML3D and KIT-ML datasets demonstrate that BAMM surpasses current state-of-the-art methods in both qualitative and quantitative measures.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Khalid, Umar; Iqbal, Hasan; Farooq, Azib; Hua, Jing; Chen, Chen
3DEgo: 3D Editing on the Go! Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Khalid2024,
title = {3DEgo: 3D Editing on the Go!},
author = {Umar Khalid and Hasan Iqbal and Azib Farooq and Jing Hua and Chen Chen},
url = {https://arxiv.org/pdf/2407.10102
https://3dego.github.io/},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {We introduce 3DEgo to address a novel problem of directly synthesizing photorealistic 3D scenes from monocular videos guided by textual prompts. Conventional methods construct a text-conditioned 3D scene through a three-stage process, involving pose estimation using Structure-from-Motion (SfM) libraries like COLMAP, initializing the 3D model with unedited images, and iteratively updating the dataset with edited images to achieve a 3D scene with text fidelity. Our framework streamlines the conventional multi-stage 3D editing process into a single-stage workflow by overcoming the reliance on COLMAP and eliminating the cost of model initialization. We apply a diffusion model to edit video frames prior to 3D scene creation by incorporating our designed noise blender module for enhancing multi-view editing consistency, a step that does not require additional training or fine-tuning of T2I diffusion models. 3DEgo utilizes 3D Gaussian Splatting to create 3D scenes from the multi-view consistent edited frames, capitalizing on the inherent temporal continuity and explicit point cloud data. 3DEgo demonstrates remarkable editing precision, speed, and adaptability across a variety of video sources, as validated by extensive evaluations on six datasets, including our own prepared GS25 dataset.
},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Khalid, Umar; Iqbal, Hasan; Tayyab, Muhammad; Karim, Md Nazmul; Hua, Jing; Chen, Chen
LatentEditor: Text Driven Local Editing of 3D Scenes Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Khalid2024b,
title = {LatentEditor: Text Driven Local Editing of 3D Scenes},
author = {Umar Khalid and Hasan Iqbal and Muhammad Tayyab and Md Nazmul Karim and Jing Hua and Chen Chen},
url = {https://arxiv.org/pdf/2312.09313.pdf
https://latenteditor.github.io/},
year = {2024},
date = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {While neural fields have made significant strides in view synthesis and scene reconstruction, editing them poses a formidable challenge due to their implicit encoding of geometry and texture information from multi-view inputs. In this paper, we introduce LatentEditor, an innovative framework designed to empower users with the ability to perform precise and locally controlled editing of neural fields using text prompts. Leveraging denoising diffusion models, we successfully embed real-world scenes into the latent space, resulting in a faster and more adaptable NeRF backbone for editing compared to traditional methods. To enhance editing precision, we introduce a delta score to calculate the 2D mask in the latent space that serves as a guide for local modifications while preserving irrelevant regions. Our novel pixel-level scoring approach harnesses the power of InstructPix2Pix (IP2P) to discern the disparity between IP2P conditional and unconditional noise predictions in the latent space. The edited latents conditioned on the 2D masks are then iteratively updated in the training set to achieve 3D local editing. Our approach achieves faster editing speeds and superior output quality compared to existing 3D editing models, bridging the gap between textual instructions and high-quality 3D scene editing in latent space.
},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Karim, Md Nazmul; Iqbal, Hasan; Khalid, Umar; Chen, Chen; Hua, Jing
Free-Editor: Zero-shot Text-driven 3D Scene Editing Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Karim2024,
title = {Free-Editor: Zero-shot Text-driven 3D Scene Editing},
author = {Md Nazmul Karim and Hasan Iqbal and Umar Khalid and Chen Chen and Jing Hua},
url = {https://arxiv.org/pdf/2312.13663.pdf
https://free-editor.github.io/},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Text-to-Image (T2I) diffusion models have gained popularity recently due to their multipurpose and easy-to-use nature, e.g. image and video generation as well as editing. However, training a diffusion model specifically for 3D scene editing is not straightforward due to the lack of large-scale datasets. To date, editing 3D scenes requires either re-training the model to adapt to various 3D edited scenes or design-specific methods for each special editing type. Furthermore, state-of-the-art (SOTA) methods require multiple synchronized edited images from the same scene to facilitate the scene editing. Due to the current limitations of T2I models, it is very challenging to apply consistent editing effects to multiple images, i.e. multi-view inconsistency in editing. This in turn compromises the desired 3D scene editing performance if these images are used. In our work, we propose a novel training-free 3D scene editing technique, FREE-EDITOR, which allows users to edit 3D scenes without further re-training the model during test time. Our proposed method successfully avoids the multi-view style inconsistency issue in SOTA methods with the help of a “single-view editing” scheme. Specifically, we show that editing a particular 3D scene can be performed by only modifying a single view. To this end, we introduce an Edit Transformer that enforces intra-view consistency and inter-view style transfer by utilizing self- and cross-attention, respectively. Since it is no longer required to re-train the model and edit every view in a scene, the editing time, as well as memory resources, are reduced significantly, e.g., the runtime being ∼ 20× faster than SOTA. We have conducted extensive experiments on a wide range of benchmark datasets and achieve diverse editing capabilities with our proposed technique.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Fang, Xiang; Xiong, Zeyu; Fang, Wanlong; Qu, Xiaoye; Chen, Chen; Dong, Jianfeng; Tang, Keke; Zhou, Pan; Cheng, Yu; Liu, Daizong
Rethinking Weakly-supervised Video Temporal Grounding From a Game Perspective Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Fang2024,
title = {Rethinking Weakly-supervised Video Temporal Grounding From a Game Perspective},
author = {Xiang Fang and Zeyu Xiong and Wanlong Fang and Xiaoye Qu and Chen Chen and Jianfeng Dong and Keke Tang and Pan Zhou and Yu Cheng and Daizong Liu},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ECCV2024_Grounding_camera.pdf
https://eccv2024.ecva.net/virtual/2024/poster/1833},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {This paper addresses the challenging task of weakly-supervised video temporal grounding. Existing approaches are generally based on the moment candidate selection pipeline that utilizes contrastive learning and reconstruction paradigm for scoring the pre-defined moments. Although they have achieved significant progress, we argue that their current frameworks have overlooked two indispensable issues: (1) Coarse-grained cross-modal learning: previous methods solely capture the global video-level alignment with the query, failing to model the detailed consistency between video frames and query words for accurately grounding the moment boundaries. (2) Complex moment candidates: the performance of these methods severely relies on the quality of moment candidates, which are also time-consuming and complicated for selection. To this end, in this paper, we make the first attempt to tackle this task from a novel game perspective, which effectively learns the uncertain relationship between each frame-word pair with diverse granularity and flexible combination for fine-grained cross-modal interaction. Specifically, we creatively model each video frame and query word as game players with multivariate cooperative game theory to learn their contribution to the cross-modal similarity score. By quantifying the trend of frame-word cooperation within a coalition via the game-theoretic interaction, we are able to value all uncertain but possible correspondence between frames and words. At last, instead of using moment proposals, we utilize the learned query-guided frame-wise scores for fine-grained moment boundary grounding. Experiments show that our method achieves superior performance on both Charades-STA and ActivityNet Caption datasets.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Pillai, Manu S; Rizve, Mamshad Nayeem; Shah, Mubarak
GAReT: Cross-view Video Geolocalization with Adapters and Auto-Regressive Transformers Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Pillai2024,
title = {GAReT: Cross-view Video Geolocalization with Adapters and Auto-Regressive Transformers},
author = {Manu S Pillai and Mamshad Nayeem Rizve and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/07875-supp.pdf
https://arxiv.org/abs/2408.02840
https://github.com/manupillai308/GAReT},
doi = {https://doi.org/10.48550/arXiv.2408.02840},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Cross-view video geo-localization (CVGL) aims to derive GPS trajectories from street-view videos by aligning them with aerial-view images. Despite their promising performance, current CVGL methods face significant challenges. These methods use camera and odometry data, typically absent in real-world scenarios. They utilize multiple adjacent frames and various encoders for feature extraction, resulting in high computational costs. Moreover, these approaches independently predict each street-view frame's location, resulting in temporally inconsistent GPS trajectories. To address these challenges, in this work, we propose GAReT, a fully transformer-based method for CVGL that does not require camera and odometry data. We introduce GeoAdapter, a transformer-adapter module designed to efficiently aggregate image-level representations and adapt them for video inputs. Specifically, we train a transformer encoder on video frames and aerial images, then freeze the encoder to optimize the GeoAdapter module to obtain video-level representation. To address temporally inconsistent trajectories, we introduce TransRetriever, an encoder-decoder transformer model that predicts GPS locations of street-view frames by encoding top-k nearest neighbor predictions per frame and auto-regressively decoding the best neighbor based on the previous frame's predictions. Our method's effectiveness is validated through extensive experiments, demonstrating state-of-the-art performance on benchmark datasets.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
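The GAReT abstract describes auto-regressively decoding the best GPS neighbour per frame from top-k retrieval candidates. A greatly simplified, purely illustrative NumPy sketch of that idea is below; the real TransRetriever scores neighbours with an encoder-decoder transformer, whereas this toy version just picks the candidate closest to the previous frame's choice.

import numpy as np

def greedy_trajectory(candidates):
    """candidates: (T, K, 2) array of top-K candidate GPS locations per
    street-view frame, assumed ranked by retrieval score. Picks, per frame,
    the candidate nearest to the previous pick to keep the trajectory
    temporally consistent."""
    traj = [candidates[0, 0]]  # start from the top-ranked candidate of frame 0
    for t in range(1, len(candidates)):
        d = np.linalg.norm(candidates[t] - traj[-1], axis=1)
        traj.append(candidates[t, d.argmin()])
    return np.stack(traj)

# Toy usage: 8 frames, 5 candidates each
print(greedy_trajectory(np.random.rand(8, 5, 2)))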
Kulkarni, Parth Parag; Nayak, Gaurav Kumar; Shah, Mubarak
CityGuessr: City-Level Video Geo-Localization on a Global Scale Conference
Workshop on Computer Vision For Videogames (CV2), 2024.
Abstract | Tags: ECCV | Links:
@conference{Kulkarni2024,
title = {CityGuessr: City-Level Video Geo-Localization on a Global Scale},
author = {Parth Parag Kulkarni and Gaurav Kumar Nayak and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CityGuessr.pdf
https://parthpk.github.io/cityguessr-webpage/
},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {Workshop on Computer Vision For Videogames (CV2)},
abstract = {Video geolocalization is a crucial problem in current times. Given just a video, ascertaining where it was captured from can have a plethora of advantages. The problem of worldwide geolocalization has been tackled before, but only using the image modality. Its video counterpart remains relatively unexplored. Meanwhile, video geolocalization has also garnered some attention in the recent past, but the existing methods are all restricted to specific regions. This motivates us to explore the problem of video geolocalization at a global scale. Hence, we propose a novel problem of worldwide video geolocalization with the objective of hierarchically predicting the correct city, state/province, country, and continent, given a video. However, no large scale video datasets that have extensive worldwide coverage exist, to train models for solving this problem. To this end, we introduce a new dataset, “CityGuessr68k” comprising of 68,269 videos from 166 cities all over the world. We also propose a novel baseline approach to this problem, by designing a transformer-based architecture comprising of an elegant “Self-Cross Attention” module for incorporating scenes as well as a “TextLabel Alignment” strategy for distilling knowledge from textlabels in feature space. To further enhance our location prediction, we also utilize soft-scene labels. Finally we demonstrate the performance of our method on our new dataset as well as Mapillary (MSLS) [38].},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Karim, Nazmul; Arafat, Abdullah Al; Khalid, Umar; Guo, Zhishan; Rahnavard, Nazanin
Augmented Neural Fine-tuning for Efficient Backdoor Purification Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Abstract | Tags: ECCV | Links:
@conference{Karim2024b,
title = {Augmented Neural Fine-tuning for Efficient Backdoor Purification},
author = {Nazmul Karim and Abdullah Al Arafat and Umar Khalid and Zhishan Guo and Nazanin Rahnavard},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2024_ECCV_NFT.pdf
https://arxiv.org/pdf/2407.10052
https://github.com/nazmul-karim170/NFT-Augmented-Backdoor-Purification},
year = {2024},
date = {2024-09-29},
urldate = {2024-09-29},
publisher = {The 18th European Conference on Computer Vision ECCV 2024},
abstract = {Recent studies have revealed the vulnerability of deep neural networks (DNNs) to various backdoor attacks, where the behavior of DNNs can be compromised by utilizing certain types of triggers or poisoning mechanisms. State-of-the-art (SOTA) defenses employ too sophisticated mechanisms that require either a computationally expensive adversarial search module for reverse-engineering the trigger distribution or an over-sensitive hyper-parameter selection module. Moreover, they offer sub-par performance in challenging scenarios, e.g., limited validation data and strong attacks. In this paper, we propose—Neural mask Fine-Tuning (NFT)—with an aim to optimally re-organize the neuron activities in a way that the effect of the backdoor is removed. Utilizing a simple data augmentation like MixUp, NFT relaxes the trigger synthesis process and eliminates the requirement of the adversarial search module. Our study further reveals that direct weight fine-tuning under limited validation data results in poor post-purification clean test accuracy, primarily due to overfitting issue. To overcome this, we propose to fine-tune neural masks instead of model weights. In addition, a mask regularizer
has been devised to further mitigate the model drift during the purification process. The distinct characteristics of NFT render it highly efficient in both runtime and sample usage, as it can remove the backdoor even when a single sample is available from each class. We validate the effectiveness of NFT through extensive experiments covering the tasks of image classification, object detection, video action recognition, 3D point cloud, and natural language processing. We evaluate our method against 14 different attacks (LIRA, WaNet, etc.) on 11 benchmark data sets (ImageNet, UCF101, Pascal VOC, ModelNet, OpenSubtitles2012, etc.). },
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Peng, Qucheng; Zheng, Ce; Chen, Chen
A Dual-Augmentor Framework for Domain Generalization in 3D Human Pose Estimation Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Peng2024,
title = {A Dual-Augmentor Framework for Domain Generalization in 3D Human Pose Estimation},
author = {Qucheng Peng and Ce Zheng and Chen Chen},
url = {https://arxiv.org/pdf/2403.11310.pdf
https://github.com/davidpengucf/DAF-DG
},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {3D human pose data collected in controlled laboratory settings present challenges for pose estimators that generalize across diverse scenarios. To address this, domain generalization is employed. Current methodologies in domain generalization for 3D human pose estimation typically utilize adversarial training to generate synthetic poses for training. Nonetheless, these approaches exhibit several limitations. First, the lack of prior information about the target domain complicates the application of suitable augmentation through a single pose augmentor, affecting generalization on target domains. Moreover, adversarial training’s discriminator tends to enforce similarity between source and synthesized poses, impeding the exploration of out-of-source distributions. Furthermore, the pose estimator’s optimization is not exposed to domain shifts, limiting its overall generalization ability.
To address these limitations, we propose a novel framework featuring two pose augmentors: the weak and the strong augmentors. Our framework employs differential strategies for generation and discrimination processes, facilitating the preservation of knowledge related to source poses and the exploration of out-of-source distributions without prior information about target poses. Besides, we leverage meta-optimization to simulate domain shifts in the optimization process of the pose estimator, thereby improving its generalization ability. Our proposed approach significantly outperforms existing methods, as demonstrated through comprehensive experiments on various benchmark datasets.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Liu, Xianpeng; Zheng, Ce; Qian, Ming; Xue, Nan; Chen, Chen; Zhang, Zhebin; Li, Chen; Wu, Tianfu
Multi-View Attentive Contextualization for Multi-View 3D Object Detection Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Tags: CVPR
@conference{Liu2024,
title = {Multi-View Attentive Contextualization for Multi-View 3D Object Detection},
author = {Xianpeng Liu and Ce Zheng and Ming Qian and Nan Xue and Chen Chen and Zhebin Zhang and Chen Li and Tianfu Wu},
year = {2024},
date = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Yuan, Tongtong; Zhang, Xuange; Liu, Kun; Liu, Bo; Chen, Chen; Jin, Jian; Jiao, Zhenzhen
Towards Surveillance Video-and-Language Understanding: New Dataset, Baselines, and Challenges Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Yuan2024,
title = {Towards Surveillance Video-and-Language Understanding: New Dataset, Baselines, and Challenges},
author = {Tongtong Yuan and Xuange Zhang and Kun Liu and Bo Liu and Chen Chen and Jian Jin and Zhenzhen Jiao},
url = {https://arxiv.org/pdf/2309.13925.pdf
https://xuange923.github.io/Surveillance-Video-Understanding
https://github.com/Xuange923/Surveillance-Video-Understanding},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Surveillance videos are an essential component of daily life with various critical applications, particularly in public security. However, current surveillance video tasks mainly focus on classifying and localizing anomalous events. Existing methods are limited to detecting and classifying the predefined events with unsatisfactory semantic understanding, although they have obtained considerable performance. To address this issue, we propose a new research direction of surveillance video-and-language understanding, and construct the first multimodal surveillance video dataset. We manually annotate the real-world surveillance dataset UCF-Crime with fine-grained event content and timing. Our newly annotated dataset, UCA (UCF-Crime Annotation), contains 23,542 sentences, with an average length of 20 words, and its annotated videos are as long as 110.7 hours. Furthermore, we benchmark SOTA models for four multimodal tasks on this newly created dataset, which serve as new baselines for surveillance video-and-language understanding. Through our experiments, we find that mainstream models used in previously publicly available datasets perform poorly on surveillance video, which demonstrates the new challenges in surveillance video-and-language understanding. To validate the effectiveness of our UCA, we conducted experiments on multimodal anomaly detection. The results demonstrate that our multimodal surveillance learning can improve the performance of conventional anomaly detection tasks. All the experiments highlight the necessity of constructing this dataset to advance surveillance AI.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Pinyoanuntapong, Ekkasit; Wang, Pu; Lee, Minwoo; Chen, Chen
MMM: Generative Masked Motion Model Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Pinyoanuntapong2024,
title = {MMM: Generative Masked Motion Model},
author = {Ekkasit Pinyoanuntapong and Pu Wang and Minwoo Lee and Chen Chen},
url = {https://arxiv.org/pdf/2312.03596.pdf
https://exitudio.github.io/MMM-page/
https://github.com/exitudio/MMM/},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Recent advances in text-to-motion generation using diffusion and autoregressive models have shown promising results. However, these models often suffer from a trade-off between real-time performance, high fidelity, and motion editability. To address this gap, we introduce MMM, a novel yet simple motion generation paradigm based on Masked Motion Model. MMM consists of two key components: (1) a motion tokenizer that transforms 3D human motion into a sequence of discrete tokens in latent space, and (2) a conditional masked motion transformer that learns to predict randomly masked motion tokens, conditioned on the pre-computed text tokens. By attending to motion and text tokens in all directions, MMM explicitly captures inherent dependency among motion tokens and semantic mapping between motion and text tokens. During inference, this allows parallel and iterative decoding of multiple motion tokens that are highly consistent with fine-grained text descriptions, therefore simultaneously achieving high-fidelity and high-speed motion generation. In addition, MMM has innate motion editability. By simply placing mask tokens in the place that needs editing, MMM automatically fills the gaps while guaranteeing smooth transitions between editing and non-editing parts. Extensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM surpasses current leading methods in generating high-quality motion (evidenced by superior FID scores of 0.08 and 0.429), while offering advanced editing features such as body-part modification, motion in-betweening, and the synthesis of long motion sequences. In addition, MMM is two orders of magnitude faster on a single mid-range GPU than editable motion diffusion models.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
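The masked-token objective described in the MMM abstract (randomly mask discrete motion tokens and predict them conditioned on pre-computed text tokens, with attention in all directions) can be sketched roughly as below. This is a hedged illustration, not the released code: the codebook size, the 512-d text features, and the tiny transformer are assumed placeholders, and the motion tokenizer itself is taken as given.

import torch
import torch.nn as nn

VOCAB, MASK_ID, D = 1024, 1024, 256   # assumed codebook size, mask id, model width

class MaskedMotionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.motion_emb = nn.Embedding(VOCAB + 1, D)          # +1 slot for the [MASK] token
        self.text_proj = nn.Linear(512, D)                    # assumes 512-d pre-computed text tokens
        layer = nn.TransformerEncoderLayer(D, nhead=8, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=4)
        self.head = nn.Linear(D, VOCAB)

    def forward(self, motion_ids, text_tokens):
        x = torch.cat([self.text_proj(text_tokens), self.motion_emb(motion_ids)], dim=1)
        h = self.encoder(x)                                   # bidirectional attention over text + motion
        return self.head(h[:, text_tokens.size(1):])          # logits only for motion positions

model = MaskedMotionModel()
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)

motion_ids = torch.randint(0, VOCAB, (8, 64))                 # output of a separate motion tokenizer
text_tokens = torch.randn(8, 16, 512)                         # pre-computed text features

mask = torch.rand(motion_ids.shape) < 0.5                     # random masking ratio per step
inputs = motion_ids.masked_fill(mask, MASK_ID)
logits = model(inputs, text_tokens)
loss = nn.functional.cross_entropy(logits[mask], motion_ids[mask])
opt.zero_grad()
loss.backward()
opt.step()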
Wang, Xinshun; Fang, Zhongbin; Li, Xia; Li, Xiangtai; Chen, Chen; Liu, Mengyuan
Skeleton-in-Context: Unified Skeleton Sequence Modeling with In-Context Learning Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Wang2024,
title = {Skeleton-in-Context: Unified Skeleton Sequence Modeling with In-Context Learning},
author = {Xinshun Wang and Zhongbin Fang and Xia Li and Xiangtai Li and Chen Chen and Mengyuan Liu},
url = {https://arxiv.org/pdf/2312.03703.pdf
https://bradleywang0416.github.io/skeletonincontext/
https://github.com/fanglaosi/Skeleton-in-Context},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {In-context learning provides a new perspective for multi-task modeling for vision and NLP. Under this setting, the model can perceive tasks from prompts and accomplish them without any extra task-specific head predictions or model finetuning. However, skeleton sequence modeling via in-context learning remains unexplored. Directly applying existing in-context models from other areas onto skeleton sequences fails due to the inter-frame and cross-task pose similarity that makes it outstandingly hard to perceive the task correctly from a subtle context.
To address this challenge, we propose Skeleton-in-Context (SiC), an effective framework for in-context skeleton sequence modeling. Our SiC is able to handle multiple skeleton-based tasks simultaneously after a single training process and accomplish each task from context according to the given prompt. It can further generalize to new, unseen tasks according to customized prompts. To facilitate context perception, we additionally propose a task-unified prompt, which adaptively learns tasks of different natures, such as partial joint-level generation, sequence-level prediction, or 2D-to-3D motion prediction. We conduct extensive experiments to evaluate the effectiveness of our SiC on multiple tasks, including motion prediction, pose estimation, joint completion, and future pose estimation. We also evaluate its generalization capability on unseen tasks such as motion-in-between. These experiments show that our model achieves state-of-the-art multi-task performance and even outperforms single-task methods on certain tasks.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Chen, Tongjia; Yu, Hongshan; Yang, Zhengeng; Li, Zechuan; Sun, Wei; Chen, Chen
OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for General Video Recognition Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Chen2024,
title = {OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for General Video Recognition},
author = {Tongjia Chen and Hongshan Yu and Zhengeng Yang and Zechuan Li and Wei Sun and Chen Chen},
url = {https://arxiv.org/pdf/2312.00096.pdf
https://tomchen-ctj.github.io/OST/
https://github.com/tomchen-ctj/OST},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Due to the resource-intensive nature of training vision-language models on expansive video data, a majority of studies have centered on adapting pre-trained image-language models to the video domain. Dominant pipelines propose to tackle the visual discrepancies with additional temporal learners while overlooking the substantial discrepancy for web-scaled descriptive narratives and concise action category names, leading to less distinct semantic space and potential performance limitations. In this work, we prioritize the refinement of text knowledge to facilitate generalizable video recognition. To address the limitations of the less distinct semantic space of category names, we prompt a large language model (LLM) to augment action class names into Spatio-Temporal Descriptors thus bridging the textual discrepancy and serving as a knowledge base for general recognition. Moreover, to assign the best descriptors with different video instances, we propose Optimal Descriptor Solver, forming the video recognition problem as solving the optimal matching flow across frame-level representations and descriptors. Comprehensive evaluations in zero-shot, few-shot, and fully supervised video recognition highlight the effectiveness of our approach. Our best model achieves a state-of-the-art zero-shot accuracy of 75.1% on Kinetics-600.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
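The abstract frames descriptor assignment as an optimal matching flow between frame-level features and spatio-temporal descriptors. A generic entropic optimal-transport (Sinkhorn) routine of that flavor is sketched below; the shapes, uniform marginals, and the final scoring rule are assumptions for illustration, not the paper's Optimal Descriptor Solver.

import torch

def sinkhorn(cost, n_iters=50, eps=0.05):
    """Entropic optimal transport between uniform marginals; returns a transport plan."""
    K = torch.exp(-cost / eps)                       # (n_frames, n_descriptors)
    u = torch.ones(cost.size(0)) / cost.size(0)      # uniform frame marginal
    v = torch.ones(cost.size(1)) / cost.size(1)      # uniform descriptor marginal
    a, b = torch.ones_like(u), torch.ones_like(v)
    for _ in range(n_iters):                         # alternating scaling updates
        a = u / (K @ b)
        b = v / (K.T @ a)
    return torch.diag(a) @ K @ torch.diag(b)

# Hypothetical inputs: 8 frame embeddings matched against 12 descriptor embeddings.
frames = torch.nn.functional.normalize(torch.randn(8, 512), dim=-1)
descriptors = torch.nn.functional.normalize(torch.randn(12, 512), dim=-1)
cost = 1.0 - frames @ descriptors.T                  # cosine distance as the matching cost
plan = sinkhorn(cost)                                # soft assignment of descriptors to frames
video_score = (plan * (frames @ descriptors.T)).sum()   # similarity weighted by the matching flow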
Rizve, Mamshad Nayeem; Fei, Fan; Unnikrishnan, Jayakrishnan; Tran, Son; Yao, Benjamin Z.; Zeng, Belinda; Shah, Mubarak; Chilimbi, Trishul
VidLA: Video-Language Alignment at Scale Conference
2024.
Abstract | Tags: CVPR | Links:
@conference{Rizve2024,
title = {VidLA: Video-Language Alignment at Scale},
author = {Mamshad Nayeem Rizve and Fan Fei and Jayakrishnan Unnikrishnan and Son Tran and Benjamin Z. Yao and Belinda Zeng and Mubarak Shah and Trishul Chilimbi},
url = {https://arxiv.org/abs/2403.14870},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
abstract = {In this paper, we propose VidLA, an approach for video-language alignment at scale. There are two major limitations of previous video-language alignment approaches. First, they do not capture both short-range and long-range temporal dependencies and typically employ complex hierarchical deep network architectures that are hard to integrate with existing pretrained image-text foundation models. To effectively address this limitation, we instead keep the network architecture simple and use a set of data tokens that operate at different temporal resolutions in a hierarchical manner, accounting for the temporally hierarchical nature of videos. By employing a simple two-tower architecture, we are able to initialize our video-language model with pretrained image-text foundation models, thereby boosting the final performance. Second, existing video-language alignment works struggle due to the lack of semantically aligned large-scale training data. To overcome it, we leverage recent LLMs to curate the largest video-language dataset to date with better visual grounding. Furthermore, unlike existing video-text datasets which only contain short clips, our dataset is enriched with video clips of varying durations to aid our temporally hierarchical data tokens in extracting better representations at varying temporal scales. Overall, empirical results show that our proposed approach surpasses state-of-the-art methods on multiple retrieval benchmarks, especially on longer videos, and performs competitively on classification benchmarks.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Ristea, Nicolae Catalin; Croitoru, Florinel Alin; Ionescu, Radu Tudor; Popescu, Marius; Khan, Fahad; Shah, Mubarak
Self-Distilled Masked Auto-Encoders are Efficient Video Anomaly Detectors Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Ristea2024,
title = {Self-Distilled Masked Auto-Encoders are Efficient Video Anomaly Detectors},
author = {Nicolae Catalin Ristea and Florinel Alin Croitoru and Radu Tudor Ionescu and Marius Popescu and Fahad Khan and Mubarak Shah},
url = {https://arxiv.org/abs/2306.12041
https://github.com/ristea/aed-mae},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {We propose an efficient abnormal event detection model based on a lightweight masked auto-encoder (AE) applied at the video frame level. The novelty of the proposed model is threefold. First, we introduce an approach to weight tokens based on motion gradients, thus shifting the focus from the static background scene to the foreground objects. Second, we integrate a teacher decoder and a student decoder into our architecture, leveraging the discrepancy between the outputs given by the two decoders to improve anomaly detection. Third, we generate synthetic abnormal events to augment the training videos, and task the masked AE model to jointly reconstruct the original frames (without anomalies) and the corresponding pixel-level anomaly maps. Our design leads to an efficient and effective model, as demonstrated by the extensive experiments carried out on four benchmarks: Avenue, ShanghaiTech, UBnormal and UCSD Ped2. The empirical results show that our model achieves an excellent trade-off between speed and accuracy, obtaining competitive AUC scores, while processing 1655 FPS. Hence, our model is between 8 and 70 times faster than competing methods. We also conduct an ablation study to justify our design. },
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
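One concrete reading of the "weight tokens based on motion gradients" idea in the abstract above is to weight a per-patch reconstruction loss by the temporal gradient magnitude of each patch, so static background contributes little to the objective. The sketch below is a hypothetical illustration with made-up shapes, not the authors' implementation.

import torch

def motion_gradient_weights(prev_frame, frame, patch=16):
    # Absolute temporal gradient, averaged inside each non-overlapping patch.
    diff = (frame - prev_frame).abs().mean(dim=1, keepdim=True)        # (B, 1, H, W)
    w = torch.nn.functional.avg_pool2d(diff, patch)                    # (B, 1, H/p, W/p)
    w = w.flatten(1)                                                   # one weight per token
    return w / (w.sum(dim=1, keepdim=True) + 1e-6)                     # normalize per frame

B, C, H, W = 2, 3, 224, 224
prev_frame, frame = torch.rand(B, C, H, W), torch.rand(B, C, H, W)
weights = motion_gradient_weights(prev_frame, frame)                   # (B, 196) token weights

recon = torch.rand(B, 196, 768)          # decoder output per token (placeholder)
target = torch.rand(B, 196, 768)         # patchified ground-truth frame (placeholder)
per_token = ((recon - target) ** 2).mean(dim=-1)                       # (B, 196)
loss = (weights * per_token).sum(dim=1).mean()                         # motion-weighted reconstruction loss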
Thawakar, Omkar Chakradhar; Naseer, Muzammal; Anwer, Rao Muhammad; Khan, Salman; Felsberg, Michael; Shah, Mubarak; Khan, Fahad
Composed Video Retrieval via Enriched Context and Discriminative Embeddings Conference
2024.
Abstract | Tags: CVPR | Links:
@conference{Thawakar2024,
title = {Composed Video Retrieval via Enriched Context and Discriminative Embeddings},
author = {Omkar Chakradhar Thawakar and Muzammal Naseer and Rao Muhammad Anwer and Salman Khan and Michael Felsberg and Mubarak Shah and Fahad Khan },
url = {https://arxiv.org/abs/2403.16997
https://github.com/OmkarThawakar/composed-video-retrieval},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
abstract = {Composed video retrieval (CoVR) is a challenging problem in computer vision which has recently highlighted the integration of modification text with visual queries for more sophisticated video search in large databases. Existing works predominantly rely on visual queries combined with modification text to distinguish relevant videos. However, such a strategy struggles to fully preserve the rich query-specific context in retrieved target videos and only represents the target video using visual embedding. We introduce a novel CoVR framework that leverages detailed language descriptions to explicitly encode query-specific contextual information and learns discriminative embeddings of vision only, text only and vision-text for better alignment to accurately retrieve matched target videos. Our proposed framework can be flexibly employed for both composed video (CoVR) and image (CoIR) retrieval tasks. Experiments on three datasets show that our approach obtains state-of-the-art performance for both CoVR and zero-shot CoIR tasks, achieving gains as high as around 7% in terms of recall@K=1 score.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Dutta, Aritra; Das, Srijan; Nielsen, Jacob; Chakraborty, Rajatsubhra; Shah, Mubarak
Multiview Aerial Visual RECognition (MAVREC) Dataset: Can Multi-view Improve Aerial Visual Perception? Conference
2024.
Abstract | Tags: CVPR | Links:
@conference{Dutta2024,
title = {Multiview Aerial Visual RECognition (MAVREC) Dataset: Can Multi-view Improve Aerial Visual Perception?},
author = {Aritra Dutta and Srijan Das and Jacob Nielsen and Rajatsubhra Chakraborty and Mubarak Shah},
url = {https://arxiv.org/abs/2312.04548
https://mavrec.github.io/},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
abstract = {Despite the commercial abundance of UAVs, aerial data acquisition remains challenging, and the existing Asia and North America-centric open-source UAV datasets are small-scale or low-resolution and lack diversity in scene contextuality. Additionally, the color content of the scenes, solar-zenith angle, and population density of different geographies influence the data diversity. These two factors conjointly render suboptimal aerial-visual perception of the deep neural network (DNN) models trained primarily on the ground-view data, including the open-world foundational models.
To pave the way for a transformative era of aerial detection, we present Multiview Aerial Visual RECognition or MAVREC, a video dataset where we record synchronized scenes from different perspectives -- ground camera and drone-mounted camera. MAVREC consists of around 2.5 hours of industry-standard 2.7K resolution video sequences, more than 0.5 million frames, and 1.1 million annotated bounding boxes. This makes MAVREC the largest ground and aerial-view dataset, and the fourth largest among all drone-based datasets across all modalities and tasks. Through our extensive benchmarking on MAVREC, we recognize that augmenting object detectors with ground-view images from the corresponding geographical location is a superior pre-training strategy for aerial detection. Building on this strategy, we benchmark MAVREC with a curriculum-based semi-supervised object detection approach that leverages labeled (ground and aerial) and unlabeled (only aerial) images to enhance the aerial detection.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Azad, Shehreen; Rawat, Yogesh Singh
Activity-Biometrics: Person Identification from Daily Activities Conference
2024.
Abstract | Tags: CVPR | Links:
@conference{Azad2024,
title = {Activity-Biometrics: Person Identification from Daily Activities},
author = {Shehreen Azad and Yogesh Singh Rawat},
url = {https://arxiv.org/abs/2403.17360
https://github.com/sacrcv/Activity-Biometrics/
},
year = {2024},
date = {2024-06-17},
abstract = {In this work, we study a novel problem which focuses on person identification while performing daily activities. Learning biometric features from RGB videos is challenging due to spatio-temporal complexity and presence of appearance biases such as clothing color and background. We propose ABNet, a novel framework which leverages disentanglement of biometric and non-biometric features to perform effective person identification from daily activities. ABNet relies on a bias-less teacher to learn biometric features from RGB videos and explicitly disentangle non-biometric features with the help of biometric distortion. In addition, ABNet also exploits activity prior for biometrics which is enabled by joint biometric and activity learning. We perform comprehensive evaluation of the proposed approach across five different datasets which are derived from existing activity recognition benchmarks. Furthermore, we extensively compare ABNet with existing works in person identification and demonstrate its effectiveness for activity-based biometrics across all five datasets.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Garcia, Gustavo; Aparcedo, Alejandro; Nayak, Gaurav Kumar; Ahmed, Tanvir; Shah, Mubarak; Li, Mengjie
Generalized Deep Learning Model for Photovoltaic Module Segmentation from Satellite and Aerial Imagery Journal Article
In: Solar Energy, vol. 274, 2024.
@article{Garcia2024,
title = {Generalized Deep Learning Model for Photovoltaic Module Segmentation from Satellite and Aerial Imagery},
author = {Gustavo Garcia and Alejandro Aparcedo and Gaurav Kumar Nayak and Tanvir Ahmed and Mubarak Shah and Mengjie Li },
url = {https://www.sciencedirect.com/science/article/pii/S0038092X24002330},
doi = {https://doi.org/10.1016/j.solener.2024.112539},
year = {2024},
date = {2024-05-15},
journal = {Solar Energy},
volume = {274},
abstract = {As solar photovoltaic (PV) has emerged as a dominant player in the energy market, there has been an exponential surge in solar deployment and investment within this sector. With the rapid growth of solar energy adoption, accurate and efficient detection of PV panels has become crucial for effective solar energy mapping and planning. This paper presents the application of the Mask2Former model for segmenting PV panels from a diverse, multi-resolution dataset of satellite and aerial imagery. Our primary objective is to harness Mask2Former’s deep learning capabilities to achieve precise segmentation of PV panels in real-world scenarios. We fine-tune the pre-existing Mask2Former model on a carefully curated multi-resolution dataset and a crowdsourced dataset of satellite and aerial images, showcasing its superiority over other deep learning models like U-Net and DeepLabv3+. Most notably, Mask2Former establishes a new state-of-the-art in semantic segmentation by achieving over 95% IoU scores. Our research contributes significantly to the advancement of solar energy mapping and sets a benchmark for future studies in this field.},
keywords = {REU},
pubstate = {published},
tppubtype = {article}
}
Frakes, Ethan; Khalid, Umar; Chen, Chen
Efficient and consistent zero-shot video generation with diffusion models Conference
SPIE Defense+ Commercial Sensing, 2024.
@conference{Frakes2024,
title = {Efficient and consistent zero-shot video generation with diffusion models},
author = {Ethan Frakes and Umar Khalid and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Report-Ethan-Frakes.pdf
https://spie.org/defense-commercial-sensing/presentation/Efficient-and-consistent-zero-shot-video-generation-with-diffusion-models/13034-8#_=_},
year = {2024},
date = {2024-04-22},
urldate = {2024-04-22},
publisher = {SPIE Defense+ Commercial Sensing},
abstract = {Recent diffusion-based generative models employ methods such as one-shot fine-tuning an image diffusion model for video generation. However, this leads to long video generation times and suboptimal efficiency. To resolve this long generation time, zero-shot text-to-video models eliminate the fine-tuning method entirely and can generate novel videos from a text prompt alone. While the zero-shot generation method greatly reduces generation time, many models rely on inefficient cross-frame attention processors, hindering the diffusion model’s utilization for real-time video generation. We address this issue by introducing more efficient attention processors to a video diffusion model. Specifically, we use attention processors (i.e. xFormers, FlashAttention, and HyperAttention) that are highly optimized for efficiency and hardware parallelization. We then apply these processors to a video generator and test with both older diffusion models such as Stable Diffusion 1.5 and newer, high-quality models such as Stable Diffusion XL. Our results show that using efficient attention processors alone can reduce generation time by around 25%, while not resulting in any change in video quality. Combined with the use of higher quality models, this use of efficient attention processors in zero-shot generation presents a substantial efficiency and quality increase, greatly expanding the video diffusion model’s application to real-time video generation.},
keywords = {REU},
pubstate = {published},
tppubtype = {conference}
}
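The core intervention described above, swapping a diffusion model's attention processors for optimized implementations, looks roughly like the following with Hugging Face diffusers. This is a hedged sketch on an image pipeline for brevity (the paper applies the same processors inside a video generator), and exact method names can differ between diffusers versions.

import torch
from diffusers import StableDiffusionPipeline
from diffusers.models.attention_processor import AttnProcessor2_0

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Option 1: PyTorch 2.x scaled_dot_product_attention (uses FlashAttention kernels when available).
pipe.unet.set_attn_processor(AttnProcessor2_0())

# Option 2: xFormers memory-efficient attention (requires the xformers package).
# pipe.enable_xformers_memory_efficient_attention()

image = pipe("a corgi surfing a wave", num_inference_steps=25).images[0]
image.save("corgi.png")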
2023
Cepeda, Vicente Vivanco; Nayak, Gaurav Kumar; Shah, Mubarak
GeoCLIP: Clip-Inspired Alignment between Locations and Images for Effective Worldwide Geo-localization Conference
Thirty-seventh Conference on Neural Information Processing Systems, 2023.
Abstract | Tags: | Links:
@conference{Cepeda2023,
title = {GeoCLIP: Clip-Inspired Alignment between Locations and Images for Effective Worldwide Geo-localization},
author = {Vicente Vivanco Cepeda and Gaurav Kumar Nayak and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/GeoCLIP_camera_ready_paper.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/GeoCLIP_camera_ready_supplementary.pdf
https://vicentevivan.github.io/GeoCLIP/},
year = {2023},
date = {2023-12-11},
publisher = {Thirty-seventh Conference on Neural Information Processing Systems},
abstract = {Worldwide Geo-localization aims to pinpoint the precise location of images taken anywhere on Earth. This task has considerable challenges due to immense variation in geographic landscapes. The image-to-image retrieval-based approaches fail to solve this problem on a global scale as it is not feasible to construct a large gallery of images covering the entire world. Instead, existing approaches divide the globe into discrete geographic cells, transforming the problem into a classification task. However, their performance is limited by the predefined classes and often results in inaccurate localizations when an image’s location significantly deviates from its class center. To overcome these limitations, we propose GeoCLIP, a novel CLIP-inspired Image-to-GPS retrieval approach that enforces alignment between the image and its corresponding GPS locations. GeoCLIP’s location encoder models the Earth as a continuous function by employing positional encoding through random Fourier features and constructing a hierarchical representation that captures information at varying resolutions to yield a semantically rich high-dimensional feature suitable to use even beyond geo-localization. To the best of our knowledge, this is the first work employing GPS encoding for geo-localization. We demonstrate the efficacy of our method via extensive experiments and ablations on benchmark datasets. We achieve competitive performance with just 20% of training data, highlighting its effectiveness even in limited-data settings. Furthermore, we qualitatively demonstrate geo-localization using a text query by leveraging CLIP backbone of our image encoder.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
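The location encoder described in the abstract treats GPS coordinates as a continuous signal via random Fourier features. A bare-bones sketch of that encoding step is below; the projection scale, MLP sizes, and the single (non-hierarchical) resolution are assumptions for illustration, not the released GeoCLIP architecture.

import math
import torch
import torch.nn as nn

class RFFLocationEncoder(nn.Module):
    def __init__(self, num_features=256, sigma=1.0, out_dim=512):
        super().__init__()
        # Fixed random projection matrix used to build the Fourier features.
        self.register_buffer("B", torch.randn(2, num_features) * sigma)
        self.mlp = nn.Sequential(
            nn.Linear(2 * num_features, 512), nn.ReLU(), nn.Linear(512, out_dim)
        )

    def forward(self, latlon):
        # latlon: (batch, 2) in degrees; rescale roughly to [-pi, pi].
        x = latlon / 180.0 * math.pi
        proj = x @ self.B                                        # (batch, num_features)
        feats = torch.cat([torch.sin(proj), torch.cos(proj)], dim=-1)
        return nn.functional.normalize(self.mlp(feats), dim=-1)  # unit-norm location embedding

encoder = RFFLocationEncoder()
gps = torch.tensor([[28.6024, -81.2001], [48.8566, 2.3522]])    # (lat, lon) pairs
location_embeddings = encoder(gps)                               # to be aligned with image embeddings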
Modi, Rajat; Vineet, Vibhav; Rawat, Yogesh Singh
On Occlusions in Video Action Detection: Benchmark Datasets And Training Recipes Conference
NeurIPS 2023., 2023.
Abstract | Tags: NeurIPS | Links:
@conference{Modi2023,
title = {On Occlusions in Video Action Detection: Benchmark Datasets And Training Recipes},
author = {Rajat Modi and Vibhav Vineet and Yogesh Singh Rawat},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/nips_23.pdf},
year = {2023},
date = {2023-12-10},
publisher = {NeurIPS 2023.},
abstract = {This paper explores the impact of occlusions in video action detection. We facilitate this study by introducing five new benchmark datasets namely O-UCF and O-JHMDB consisting of synthetically controlled static/dynamic occlusions, OVIS-UCF and OVIS-JHMDB consisting of occlusions with realistic motions and Real-OUCF for occlusions in realistic-world scenarios. We formally confirm an intuitive expectation: existing models suffer a lot as occlusion severity is increased and exhibit different behaviours when occluders are static vs when they are moving. We discover several curious phenomena emerging in neural nets: 1) transformers can naturally outperform CNN models which might have even used occlusion as a form of data augmentation during training 2) incorporating symbolic-components like capsules to such backbones allows them to bind to occluders never even seen during training and 3) Islands of agreement (similar to the ones hypothesized in Hinton et al.'s GLOM) can emerge in realistic images/videos without instance-level supervision, distillation or contrastive-based objectives (e.g. video-textual training). Such emergent properties allow us to derive simple yet effective training recipes which lead to robust occlusion models inductively satisfying the first two stages of the binding mechanism (grouping/segregation). Models leveraging these recipes outperform existing video action-detectors under occlusion by 32.3% on O-UCF, 32.7% on O-JHMDB & 2.6% on Real-OUCF in terms of the vMAP metric.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Kini, Jyoti; Khan, Fahad Shahbaz; Khan, Salman; Shah, Mubarak
CT-VOS: Cutout Prediction and Tagging for Self-Supervised Video Object Segmentation Journal Article
In: Computer Vision and Image Understanding, 2023.
Tags: CVIU, Video Object Segmentation
@article{Kini2023c,
title = {CT-VOS: Cutout Prediction and Tagging for Self-Supervised Video Object Segmentation},
author = {Jyoti Kini and Fahad Shahbaz Khan and Salman Khan and Mubarak Shah},
year = {2023},
date = {2023-10-09},
journal = {Computer Vision and Image Understanding},
keywords = {CVIU, Video Object Segmentation},
pubstate = {published},
tppubtype = {article}
}
Hanif, Asif; Naseer, Muzammal; Khan, Salman; Shah, Mubarak; Khan, Fahad Shahbaz
Frequency Domain Adversarial Training for Robust Volumetric Medical Segmentation Conference
The 26th International Conference on Medical Image Computing and Computer Assisted Intervention, MICCAI 2023, 2023.
Tags: MICCAI | Links:
@conference{nokey,
title = {Frequency Domain Adversarial Training for Robust Volumetric Medical Segmentation},
author = {Asif Hanif and Muzammal Naseer and Salman Khan and Mubarak Shah and Fahad Shahbaz Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Frequency-Domain-Adversarial-Training-for-Robust-Volumetric-Medical-Segmentation.pdf
https://github.com/asif-hanif/vafa},
doi = {https://doi.org/10.48550/arXiv.2307.07269},
year = {2023},
date = {2023-10-08},
publisher = {The 26th International Conference on Medical Image Computing and Computer Assisted Intervention, MICCAI 2023},
keywords = {MICCAI},
pubstate = {published},
tppubtype = {conference}
}
Li, Ming; Wu, Jie; Wang, Xionghui; Chen, Chen; Qin, Jie; Xiao, Xuefeng; Wang, Rui; Zheng, Min; Pan, Xin
AlignDet: Aligning Pre-training and Fine-tuning in Object Detection Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Tags: ICCV | Links:
@conference{Li2023,
title = {AlignDet: Aligning Pre-training and Fine-tuning in Object Detection},
author = {Ming Li and Jie Wu and Xionghui Wang and Chen Chen and Jie Qin and Xuefeng Xiao and Rui Wang and Min Zheng and Xin Pan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2307.11077.pdf
https://arxiv.org/abs/2307.11077
https://github.com/liming-ai/AlignDet
https://openreview.net/forum?id=8PA2nX9v_r2
https://liming-ai.github.io/AlignDet/},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Peng, Qucheng; Zheng, Ce; Chen, Chen
Source-free Domain Adaptive Human Pose Estimation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{Peng2023,
title = {Source-free Domain Adaptive Human Pose Estimation},
author = {Qucheng Peng and Ce Zheng and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.03202.pdf
https://arxiv.org/abs/2308.03202
https://github.com/davidpengucf/SFDAHPE},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Human Pose Estimation (HPE) is widely used in various fields, including motion analysis, healthcare, and virtual reality. However, the great expenses of labeled real-world datasets present a significant challenge for HPE. To overcome this, one approach is to train HPE models on synthetic datasets and then perform domain adaptation (DA) on realworld data. Unfortunately, existing DA methods for HPE neglect data privacy and security by using both source and target data in the adaptation process. To this end, we propose a new task, named source-free domain adaptive HPE, which aims to address the challenges of cross-domain learning of HPE without access to source data during the adaptation process. We further propose a novel framework that consists of three models: source model, intermediate model, and target model, which explores the task from both source-protect and target-relevant perspectives. The source-protect module preserves source information more effectively while resisting noise, and the target-relevant module reduces the sparsity of spatial representations by building a novel spatial probability space, and pose-specific contrastive learning and information maximization are proposed on the basis of this space. Comprehensive experiments on several domain adaptive HPE benchmarks show that the proposed method outperforms existing approaches by a considerable margin.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Sun, Guangyu; Mendieta, Matias; Chen, Chen
FedPerfix: Towards Partial Model Personalization of Vision Transformers in Federated Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {FedPerfix: Towards Partial Model Personalization of Vision Transformers in Federated Learning},
author = {Guangyu Sun and Matias Mendieta and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.09160.pdf
https://arxiv.org/abs/2308.09160
https://github.com/imguangyu/FedPerfix},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {We propose and analyze a general framework of federated learning with partial model personalization. Compared with full model personalization, partial model personalization relies on domain knowledge to select a small portion of the model to personalize, thus imposing a much smaller on-device memory footprint. We propose two federated optimization algorithms for training partially personalized models, where the shared and personal parameters are updated either simultaneously or alternately on each device, but only the shared parameters are communicated and aggregated at the server. We give convergence analyses of both algorithms for minimizing smooth nonconvex functions, providing theoretical support of them for training deep learning models. Our experiments on real-world image and text datasets demonstrate that (a) partial model personalization can obtain most of the benefit of full model personalization with a small fraction of personalized parameters, and, (b) the alternating update algorithm often outperforms the simultaneous update algorithm.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
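The partial-personalization recipe sketched in the abstract (keep a small set of personal parameters local, aggregate only the shared ones at the server) can be illustrated with a minimal federated-averaging sketch. The Net module, the "head"-based predicate, and the three-client setup are hypothetical; this is not the FedPerfix code.

import copy
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(10, 32)   # shared across clients
        self.head = nn.Linear(32, 2)        # personalized, never aggregated
    def forward(self, x):
        return self.head(torch.relu(self.backbone(x)))

def is_personal(name: str) -> bool:
    return name.startswith("head")

def average_shared(clients):
    """Server step: average only the shared parameters across client models."""
    avg = copy.deepcopy(clients[0].state_dict())
    for name in avg:
        if is_personal(name):
            continue                        # personal parameters are never sent to the server
        avg[name] = torch.stack([c.state_dict()[name] for c in clients]).mean(dim=0)
    return avg

def broadcast_shared(clients, shared):
    """Clients receive the averaged shared weights and keep their personal ones."""
    for c in clients:
        state = c.state_dict()
        for name, value in shared.items():
            if not is_personal(name):
                state[name] = value.clone()
        c.load_state_dict(state)

clients = [Net() for _ in range(3)]
# ... each client would run local training on its own data here ...
broadcast_shared(clients, average_shared(clients))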
Luo, Jun; Mendieta, Matias; Chen, Chen
PGFed: Personalize Each Client's Global Objective for Federated Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {PGFed: Personalize Each Client's Global Objective for Federated Learning},
author = {Jun Luo and Matias Mendieta and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2212.01448.pdf
https://github.com/ljaiverson/pgfed},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {The mediocre performance of conventional federated learning (FL) over heterogeneous data has been facilitating personalized FL solutions, where, unlike conventional FL which trains a single global consensus model, different models are allowed for different clients. However, in most existing personalized FL algorithms, the collaborative knowledge across the federation was only implicitly passed to the clients in ways such as model aggregation or regularization. We observed that this implicit knowledge transfer fails to maximize the potential value of each client's empirical risk toward other clients. Based on our observation, in this work, we propose Personalized Global Federated Learning (PGFed), a novel personalized FL framework that enables each client to personalize its own global objective by explicitly and adaptively aggregating the empirical risks of itself and other clients. To avoid massive (O(N2)) communication overhead and potential privacy leakage, each client's risk is estimated through a first-order approximation for other clients' adaptive risk aggregation. On top of PGFed, we develop a momentum upgrade, dubbed PGFedMo, to more efficiently utilize clients' empirical risks. Our extensive experiments under different federated settings with benchmark datasets show consistent improvements of PGFed over the compared state-of-the-art alternatives.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Dang, Andong; Yang, Taojiannan; Chen, Chen
A Large-scale Study of Spatiotemporal Representation Learning with a New Benchmark on Action Recognition Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {A Large-scale Study of Spatiotemporal Representation Learning with a New Benchmark on Action Recognition},
author = {Andong Dang and Taojiannan Yang and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.13505.pdf
https://arxiv.org/abs/2303.13505
https://github.com/AndongDeng/BEAR},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {The goal of building a benchmark (suite of datasets) is to provide a unified protocol for fair evaluation and thus facilitate the evolution of a specific area. Nonetheless, we point out that existing protocols of action recognition could yield partial evaluations due to several limitations. To comprehensively probe the effectiveness of spatiotemporal representation learning, we introduce BEAR, a new BEnchmark on video Action Recognition. BEAR is a collection of 18 video datasets grouped into 5 categories (anomaly, gesture, daily, sports, and instructional), which covers a diverse set of real-world applications. With BEAR, we thoroughly evaluate 6 common spatiotemporal models pre-trained by both supervised and self-supervised learning. We also report transfer performance via standard finetuning, few-shot finetuning, and unsupervised domain adaptation. Our observation suggests that current state-of-the-art cannot solidly guarantee high performance on datasets close to real-world applications, and we hope BEAR can serve as a fair and challenging evaluation benchmark to gain insights on building next-generation spatiotemporal learners. Our dataset, code, and models are released at: https://github.com/AndongDeng/BEAR},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Mendieta, Matias; Chen, Chen
Towards Geospatial Foundation Models via Continual Pretraining Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {Towards Geospatial Foundation Models via Continual Pretraining},
author = {Matias Mendieta and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2302.04476.pdf
https://arxiv.org/abs/2302.04476
https://github.com/mmendiet/GFM},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Geospatial technologies are becoming increasingly essential in our world for a wide range of applications, including agriculture, urban planning, and disaster response. To help improve the applicability and performance of deep learning models on these geospatial tasks, various works have begun investigating foundation models for this domain. Researchers have explored two prominent approaches for introducing such models in geospatial applications, but both have drawbacks in terms of limited performance benefit or prohibitive training cost. Therefore, in this work, we propose a novel paradigm for building highly effective geospatial foundation models with minimal resource cost and carbon impact. We first construct a compact yet diverse dataset from multiple sources to promote feature diversity, which we term GeoPile. Then, we investigate the potential of continual pretraining from large-scale ImageNet-22k models and propose a multi-objective continual pretraining paradigm, which leverages the strong representations of ImageNet while simultaneously providing the freedom to learn valuable in-domain features. Our approach outperforms previous state-of-the-art geospatial pretraining methods in an extensive evaluation on seven downstream datasets covering various tasks such as change detection, classification, multi-label classification, semantic segmentation, and super-resolution.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Chen, Hao; Qu, Chenyuan; Zhang, Yu; Chen, Chen; Jiao, Jianbo
Multi-view Self-supervised Disentanglement for General Image Denoising Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {Multi-view Self-supervised Disentanglement for General Image Denoising},
author = {Hao Chen and Chenyuan Qu and Yu Zhang and Chen Chen and Jianbo Jiao},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICCV2023_MeD_Final_Version.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICCV2023_MeD_Supplymentary_Final_Version.pdf
https://chqwer2.github.io/MeD/
https://github.com/chqwer2/Multi-view-Self-supervised-Disentanglement-Denoising},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {With its significant performance improvements, the deep learning paradigm has become a standard tool for modern image denoisers. While promising performance has been shown on seen noise distributions, existing approaches often suffer from generalisation to unseen noise types or general and real noise. It is understandable as the model is designed to learn paired mapping (e.g. from a noisy image to its clean version). In this paper, we instead propose to learn to disentangle the noisy image, under the intuitive assumption that different corrupted versions of the same clean image share a common latent space. A self-supervised learning framework is proposed to achieve the goal, without looking at the latent clean image. By taking two different corrupted versions of the same image as input, the proposed Multi-view Self-supervised Disentanglement (MeD) approach learns to disentangle the latent clean features from the corruptions and recover the clean image consequently. Extensive experimental analysis on both synthetic and real noise shows the superiority of the proposed method over prior self-supervised approaches, especially on unseen novel noise types. On real noise, the proposed method even outperforms its supervised counterparts by over 3 dB.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
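The disentanglement objective sketched in this abstract, two corruptions of the same image sharing a clean latent while corruption-specific latents can be swapped between views, can be caricatured with the toy training step below. The tiny conv encoders and decoder, the additive Gaussian corruption, and the unweighted loss terms are all assumptions for illustration, not the MeD implementation.

import torch
import torch.nn as nn

enc_clean = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.Conv2d(32, 16, 3, padding=1))
enc_noise = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.Conv2d(32, 16, 3, padding=1))
decoder = nn.Sequential(nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(), nn.Conv2d(32, 3, 3, padding=1))
opt = torch.optim.Adam([*enc_clean.parameters(), *enc_noise.parameters(), *decoder.parameters()], 1e-4)

img = torch.rand(4, 3, 64, 64)                 # underlying clean image (never used in the loss)
view1 = img + 0.1 * torch.randn_like(img)      # two independent corruptions of the same image
view2 = img + 0.1 * torch.randn_like(img)

c1, c2 = enc_clean(view1), enc_clean(view2)    # "clean" latents, should agree across views
n1, n2 = enc_noise(view1), enc_noise(view2)    # corruption-specific latents
loss = (
    ((c1 - c2) ** 2).mean()                                            # shared clean latent
    + ((decoder(torch.cat([c2, n1], dim=1)) - view1) ** 2).mean()      # cross-reconstruct view 1
    + ((decoder(torch.cat([c1, n2], dim=1)) - view2) ** 2).mean()      # cross-reconstruct view 2
)
opt.zero_grad()
loss.backward()
opt.step()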
Li, Lijun; Tian, Linrui; Zhang, Xindi; Wang, Qi; Zhang, Bang; Bo, Liefeng; Liu, Mengyuan; Chen, Chen
RenderIH: A large-scale synthetic dataset for 3D interacting hand pose estimation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Tags: ICCV
@conference{nokey,
title = {RenderIH: A large-scale synthetic dataset for 3D interacting hand pose estimation},
author = {Lijun Li and Linrui Tian and Xindi Zhang and Qi Wang and Bang Zhang and Liefeng Bo and Mengyuan Liu and Chen Chen},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Vahidian, Saeed; Kadaveru, Sreevatsank; Baek, Woonjoon; Wang, Weijia; Kungurtsev, Vyacheslav; Chen, Chen; Shah, Mubarak; Lin, Bill
When Do Curricula Work in Federated Learning? Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{Vahidian2023b,
title = {When Do Curricula Work in Federated Learning? },
author = {Saeed Vahidian and Sreevatsank Kadaveru and Woonjoon Baek and Weijia Wang and Vyacheslav Kungurtsev and Chen Chen and Mubarak Shah and Bill Lin},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2212.12712.pdf
https://arxiv.org/abs/2212.12712},
doi = {https://doi.org/10.48550/arXiv.2212.12712},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {An oft-cited open problem of federated learning is the existence of data heterogeneity at the clients. One pathway to understanding the drastic accuracy drop in federated learning is by scrutinizing the behavior of the clients' deep models on data with different levels of "difficulty", which has been left unaddressed. In this paper, we investigate a different and rarely studied dimension of FL: ordered learning. Specifically, we aim to investigate how ordered learning principles can contribute to alleviating the heterogeneity effects in FL. We present theoretical analysis and conduct extensive empirical studies on the efficacy of orderings spanning three kinds of learning: curriculum, anti-curriculum, and random curriculum. We find that curriculum learning largely alleviates non-IIDness. Interestingly, the more disparate the data distributions across clients the more they benefit from ordered learning. We provide analysis explaining this phenomenon, specifically indicating how curriculum training appears to make the objective landscape progressively less convex, suggesting fast converging iterations at the beginning of the training procedure. We derive quantitative results of convergence for both convex and nonconvex objectives by modeling the curriculum training on federated devices as local SGD with locally biased stochastic gradients. Also, inspired by ordered learning, we propose a novel client selection technique that benefits from the real-world disparity in the clients. Our proposed approach to client selection has a synergic effect when applied together with ordered learning in FL.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
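A minimal way to picture the "ordered learning" studied above is a client that scores its samples by the current model's loss and visits them from easy to hard (reversing the order gives the anti-curriculum; shuffling gives the random curriculum). The linear model and synthetic data below are placeholders, not the paper's setup.

import torch
import torch.nn as nn

model = nn.Linear(10, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.CrossEntropyLoss(reduction="none")

x = torch.randn(256, 10)                           # one client's local data (stand-in)
y = torch.randint(0, 2, (256,))

with torch.no_grad():
    difficulty = loss_fn(model(x), y)              # per-sample difficulty scores
order = torch.argsort(difficulty)                  # easy -> hard (curriculum ordering)

for idx in order.split(32):                        # local mini-batches in curriculum order
    batch_loss = loss_fn(model(x[idx]), y[idx]).mean()
    opt.zero_grad()
    batch_loss.backward()
    opt.step()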
Samarasinghe, Sarinda; Rizve, Mamshad Nayeem; Kardan, Navid; Shah, Mubarak
CDFSL-V: Cross-Domain Few-Shot Learning for Videos Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{Samarasinghe2023,
title = {CDFSL-V: Cross-Domain Few-Shot Learning for Videos},
author = {Sarinda Samarasinghe and Mamshad Nayeem Rizve and Navid Kardan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CDFSL_Video_Combined_Final.pdf
https://sarinda251.github.io/CDFSL-V-site/
https://www.youtube.com/watch?v=RdlEzfW013o},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Few-shot video action recognition is an effective approach to recognizing new categories with only a few labeled examples, thereby reducing the challenges associated with collecting and annotating large-scale video datasets. Existing methods in video action recognition rely on large labeled datasets from the same domain. However, this setup is not realistic as novel categories may come from different data domains that may have different spatial and temporal characteristics. This dissimilarity between the source and target domains can pose a significant challenge, rendering traditional few-shot action recognition techniques ineffective. To address this issue, in this work, we propose a novel cross-domain few-shot video action recognition method that leverages self-supervised learning and curriculum learning to balance the information from the source and target domains. To be particular, our method employs a masked autoencoder-based self-supervised training objective to learn from both source and target data in a self-supervised manner. Then a progressive curriculum balances learning the discriminative information from the source dataset with the generic information learned from the target domain. Initially, our curriculum utilizes supervised learning to learn class discriminative features from the source data. As the training progresses, we transition to learning target-domain-specific features. We propose a progressive curriculum to encourage the emergence of rich features in the target domain based on class discriminative supervised features in the source domain. We evaluate our method on several challenging benchmark datasets and demonstrate that our approach outperforms existing cross-domain few-shot learning techniques.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Sirnam, Swetha; Rizve, Mamshad Nayeem; Kuhne, Hilde; Shah, Mubarak
Preserving Modality Structure Improves Multi-Modal Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {Preserving Modality Structure Improves Multi-Modal Learning },
author = {Swetha Sirnam and Mamshad Nayeem Rizve and Hilde Kuhne and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.13077.pdf
https://arxiv.org/abs/2308.13077
https://github.com/Swetha5/Multi_Sinkhorn_Knopp
https://swetha5.github.io/MultiSK/
https://youtu.be/1CrGkUATy50
},
doi = {https://doi.org/10.48550/arXiv.2308.13077},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Self-supervised learning on large-scale multi-modal datasets allows learning semantically meaningful embeddings in a joint multi-modal representation space without relying on human annotations. These joint embeddings enable zero-shot cross-modal tasks like retrieval and classification. However, these methods often struggle to generalize well on out-of-domain data as they ignore the semantic structure present in modality-specific embeddings. In this context, we propose a novel Semantic-Structure-Preserving Consistency approach to improve generalizability by preserving the modality-specific relationships in the joint embedding space. To capture modality-specific semantic relationships between samples, we propose to learn multiple anchors and represent the multifaceted relationship between samples with respect to their relationship with these anchors. To assign multiple anchors to each sample, we propose a novel Multi-Assignment Sinkhorn-Knopp algorithm. Our experimentation demonstrates that our proposed approach learns semantically meaningful anchors in a self-supervised manner. Furthermore, our evaluation on MSR-VTT and YouCook2 datasets demonstrates that our proposed multi-anchor assignment based solution achieves state-of-the-art performance and generalizes to both in- and out-of-domain datasets. Code: https://github.com/Swetha5/Multi_Sinkhorn_Knopp},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Liu, Daochang; Li, Qiyue; Dinh, Anh-Dung; Jiang, Tingting; Shah, Mubarak; Xu, Chang
Diffusion Action Segmentation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{Liu2023b,
title = {Diffusion Action Segmentation},
author = {Daochang Liu and Qiyue Li and Anh-Dung Dinh and Tingting Jiang and Mubarak Shah and Chang Xu},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.17959.pdf
https://arxiv.org/abs/2303.17959
https://finspire13.github.io/DiffAct-Project-Page/
https://github.com/Finspire13/DiffAct
https://youtu.be/o_Jp8shth7U
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Slides.pptx},
doi = { https://doi.org/10.48550/arXiv.2303.17959},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Temporal action segmentation is crucial for understanding long-form videos. Previous works on this task commonly adopt an iterative refinement paradigm by using multi-stage models. We propose a novel framework via denoising diffusion models, which nonetheless shares the same inherent spirit of such iterative refinement. In this framework, action predictions are iteratively generated from random noise with input video features as conditions. To enhance the modeling of three striking characteristics of human actions, including the position prior, the boundary ambiguity, and the relational dependency, we devise a unified masking strategy for the conditioning inputs in our framework. Extensive experiments on three benchmark datasets, i.e., GTEA, 50Salads, and Breakfast, are performed and the proposed method achieves superior or comparable results to state-of-the-art methods, showing the effectiveness of a generative approach for action segmentation.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
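The abstract describes generating frame-wise action predictions by iteratively denoising them from random noise, conditioned on video features. A stripped-down training step in that spirit is below; the noise schedule, the small 1-D conv denoiser, and the feature shapes are assumptions for illustration rather than the DiffAct architecture.

import torch
import torch.nn as nn

T_STEPS, N_CLASSES, D = 1000, 11, 128
betas = torch.linspace(1e-4, 0.02, T_STEPS)
alphas_bar = torch.cumprod(1.0 - betas, dim=0)               # standard DDPM-style schedule

denoiser = nn.Sequential(nn.Conv1d(N_CLASSES + D, 256, 3, padding=1), nn.ReLU(),
                         nn.Conv1d(256, N_CLASSES, 3, padding=1))
opt = torch.optim.Adam(denoiser.parameters(), lr=1e-4)

feats = torch.randn(2, D, 500)                               # per-frame video features (condition)
labels = torch.randint(0, N_CLASSES, (2, 500))               # frame-wise action labels
x0 = nn.functional.one_hot(labels, N_CLASSES).permute(0, 2, 1).float()   # (B, C, T)

t = torch.randint(0, T_STEPS, (1,)).item()                   # random diffusion step
noise = torch.randn_like(x0)
xt = alphas_bar[t].sqrt() * x0 + (1 - alphas_bar[t]).sqrt() * noise      # diffused label sequence
pred_logits = denoiser(torch.cat([xt, feats], dim=1))        # predict the clean label sequence
loss = nn.functional.cross_entropy(pred_logits, labels)
opt.zero_grad()
loss.backward()
opt.step()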
Fioresi, Joseph; Dave, Ishan; Shah, Mubarak
TeD-SPAD: Temporal Distinctiveness for Self-supervised Privacy-preservation for video Anomaly Detection Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Tags: ICCV | Links:
@conference{Fioresi2023,
title = {TeD-SPAD: Temporal Distinctiveness for Self-supervised Privacy-preservation for video Anomaly Detection},
author = {Joseph Fioresi and Ishan Dave and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.11072.pdf
https://arxiv.org/abs/2308.11072
https://github.com/UCF-CRCV/TeD-SPAD
https://joefioresi718.github.io/TeD-SPAD_webpage/
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/TeDSPAD_ICCV_poster.pdf
https://youtu.be/3a9qeJUD1GU},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Wasim, Syed Talal; Khattak, Muhammad Uzair; Naseer, Muzammal; Khan, Salman; Shah, Mubarak; Khan, Fahad Shahbaz
Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition },
author = {Syed Talal Wasim and Muhammad Uzair Khattak and Muzammal Naseer and Salman Khan and Mubarak Shah and Fahad Shahbaz Khan },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2307.06947.pdf
https://arxiv.org/abs/2307.06947
https://talalwasim.github.io/Video-FocalNets/
https://github.com/TalalWasim/Video-FocalNets
https://talalwasim.github.io/Video-FocalNets/#BibTeX},
doi = { https://doi.org/10.48550/arXiv.2307.06947},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Recent video recognition models utilize Transformer models for long-range spatio-temporal context modeling. Video transformer designs are based on self-attention that can model global context at a high computational cost. In comparison, convolutional designs for videos offer an efficient alternative but lack long-range dependency modeling. Towards achieving the best of both designs, this work proposes Video-FocalNet, an effective and efficient architecture for video recognition that models both local and global contexts. Video-FocalNet is based on a spatio-temporal focal modulation architecture that reverses the interaction and aggregation steps of self-attention for better efficiency. Further, the aggregation step and the interaction step are both implemented using efficient convolution and element-wise multiplication operations that are computationally less expensive than their self-attention counterparts on video representations. We extensively explore the design space of focal modulation-based spatio-temporal context modeling and demonstrate our parallel spatial and temporal encoding design to be the optimal choice. Video-FocalNets perform favorably against the state-of-the-art transformer-based models for video recognition on three large-scale datasets (Kinetics-400, Kinetics-600, and SS-v2) at a lower computational cost. Our code/models are publicly released.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
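The Video-FocalNets abstract describes replacing self-attention with focal modulation: context is first aggregated with inexpensive convolutions and then used to modulate a per-location query by element-wise multiplication. The toy 2D block below, written in PyTorch, shows only that interaction-after-aggregation pattern; the paper's spatio-temporal, multi-level design and all layer sizes here are assumptions, not the released model.

    import torch
    import torch.nn as nn

    class TinyFocalModulation(nn.Module):
        def __init__(self, dim):
            super().__init__()
            self.query = nn.Conv2d(dim, dim, 1)                           # per-pixel query
            self.context = nn.Conv2d(dim, dim, 7, padding=3, groups=dim)  # cheap local aggregation
            self.gate = nn.Conv2d(dim, dim, 1)
            self.proj = nn.Conv2d(dim, dim, 1)

        def forward(self, x):                                    # x: (B, C, H, W)
            modulator = self.context(x) * torch.sigmoid(self.gate(x))
            return self.proj(self.query(x) * modulator)          # element-wise modulation

    out = TinyFocalModulation(32)(torch.randn(2, 32, 14, 14))    # same shape as the input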
Jain, Nishant; Behl, Harkirat; Rawat, Yogesh Singh; Vineet, Vibhav
Efficiently Robustify Pre-Trained Models Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{nokey,
title = {Efficiently Robustify Pre-Trained Models},
author = {Nishant Jain and Harkirat Behl and Yogesh Singh Rawat and Vibhav Vineet},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICCV23_Robust_Learning.pdf},
year = {2023},
date = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {A recent trend in deep learning has been towards training large-scale models with high parameter counts on big datasets. However, the robustness of such large-scale models in real-world settings is still a less-explored topic. In this work, we first benchmark the performance of these models under different perturbations and datasets representing real-world shifts, and highlight their degrading performance under these shifts. We then discuss how existing robustification schemes based on complete model fine-tuning might not be a scalable option for very large networks and can also cause them to forget some of the desired characteristics. Finally, we propose a simple and cost-effective method to solve this problem, inspired by the knowledge transfer literature. It involves robustifying smaller models at a lower computation cost and then using them as teachers to tune a fraction of these large-scale networks, reducing the overall computational overhead. We evaluate our proposed method under various vision perturbations, including the ImageNet-C, R, S, and A datasets, as well as in transfer learning and zero-shot evaluation setups on different datasets. Benchmark results show that our method is able to induce robustness in these large-scale models efficiently, requiring significantly less time, and also preserves the transfer learning and zero-shot properties of the original model, which none of the existing methods achieve.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
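The abstract above proposes robustifying a small model first and then using it as a teacher to tune only a fraction of a large pre-trained network. A minimal sketch of such a distillation step is given below; the temperature, the choice of which student block to unfreeze, and the attribute names (student.blocks) are illustrative assumptions rather than the paper's recipe.

    import torch
    import torch.nn.functional as F

    def distill_step(student, teacher, images, optimizer, tau=2.0):
        """One update of the large student toward the small, robustified teacher."""
        with torch.no_grad():
            t_logits = teacher(images)
        s_logits = student(images)
        loss = F.kl_div(F.log_softmax(s_logits / tau, dim=-1),
                        F.softmax(t_logits / tau, dim=-1),
                        reduction="batchmean") * tau * tau
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()

    # Tune only a small fraction of the large model, e.g. its last block:
    # for p in student.parameters(): p.requires_grad_(False)
    # for p in student.blocks[-1].parameters(): p.requires_grad_(True)
    # optimizer = torch.optim.AdamW((p for p in student.parameters() if p.requires_grad), lr=1e-4)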
Zhou, Yifei; Li, Zilu; Shrivastava, Abhinav; Zhao, Hengshuang; Torralba, Antonio; Tian, Taipeng; Lim, Ser-Nam
BT^2 : Backward-compatible Training with Basis Transformation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{Zhou2023,
title = {BT^2 : Backward-compatible Training with Basis Transformation},
author = {Yifei Zhou and Zilu Li and Abhinav Shrivastava and Hengshuang Zhao and Antonio Torralba and Taipeng Tian and Ser-Nam Lim},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2211.03989v3.pdf
https://arxiv.org/abs/2211.03989v3},
doi = {https://doi.org/10.48550/arXiv.2211.03989},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Modern retrieval systems often require recomputing the representation of every piece of data in the gallery when updating to a better representation model. This process is known as backfilling and can be especially costly in the real world where the gallery often contains billions of samples. Recently, researchers have proposed the idea of Backward Compatible Training (BCT) where the new representation model can be trained with an auxiliary loss to make it backward compatible with the old representation. In this way, the new representation can be directly compared with the old representation, in principle avoiding the need for any backfilling. However, follow-up work shows that there is an inherent tradeoff where a backward compatible representation model cannot simultaneously maintain the performance of the new model itself. This paper reports our ``not-so-surprising'' finding that adding extra dimensions to the representation can help here. However, we also found that naively increasing the dimension of the representation did not work. To deal with this, we propose Backward-compatible Training with a novel Basis Transformation (BT2). A basis transformation (BT) is basically a learnable set of parameters that applies an orthonormal transformation. Such a transformation possesses an important property whereby the original information contained in its input is retained in its output. We show in this paper how a BT can be utilized to add only the necessary amount of additional dimensions. We empirically verify the advantage of BT2 over other state-of-the-art methods in a wide range of settings. We then further extend BT2 to other challenging yet more practical settings, including a significant change in model architecture (CNN to Transformers), modality change, and even a series of updates in the model architecture mimicking the evolution of deep learning models.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
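The key property the BT^2 abstract leans on is that an orthonormal basis transformation preserves the information in the old embedding while extra dimensions are added for the new model. The numpy lines below only demonstrate that property on a fixed random orthonormal matrix; the learnable transform, the training losses, and the dimensions used are assumptions for illustration.

    import numpy as np

    rng = np.random.default_rng(0)
    d_old, d_extra = 128, 32
    Q, _ = np.linalg.qr(rng.normal(size=(d_old + d_extra, d_old + d_extra)))  # orthonormal basis

    old_emb = rng.normal(size=d_old)
    padded = np.concatenate([old_emb, np.zeros(d_extra)])   # old gallery feature, zero-padded
    new_emb = Q @ padded                                     # feature in the extended space

    recovered = (Q.T @ new_emb)[:d_old]                      # orthonormality makes this exact
    print(np.allclose(recovered, old_emb))                   # True: no information is lost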
Chen, Xi; Li, Shuang; Lim, Ser-Nam; Torralba, Antonio; Zhao, Hengshuang
Open-vocabulary Panoptic Segmentation with Embedding Modulation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{Chen2023b,
title = {Open-vocabulary Panoptic Segmentation with Embedding Modulation},
author = {Xi Chen and Shuang Li and Ser-Nam Lim and Antonio Torralba and Hengshuang Zhao},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.11324.pdf
https://arxiv.org/abs/2303.11324
https://opsnet-page.github.io/},
doi = {https://doi.org/10.48550/arXiv.2303.11324},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Open-vocabulary image segmentation is attracting increasing attention due to its critical applications in the real world. Traditional closed-vocabulary segmentation methods are not able to characterize novel objects, whereas several recent open-vocabulary attempts obtain unsatisfactory results, i.e., notable performance reduction on the closed vocabulary and massive demand for extra data. To this end, we propose OPSNet, an omnipotent and data-efficient framework for Open-vocabulary Panoptic Segmentation. Specifically, the exquisitely designed Embedding Modulation module, together with several meticulous components, enables adequate embedding enhancement and information exchange between the segmentation model and the visual-linguistic well-aligned CLIP encoder, resulting in superior segmentation performance under both open- and closed-vocabulary settings with much less need for additional data. Extensive experimental evaluations are conducted across multiple datasets (e.g., COCO, ADE20K, Cityscapes, and PascalContext) under various circumstances, where the proposed OPSNet achieves state-of-the-art results, which demonstrates the effectiveness and generality of the proposed approach. The code and trained models will be made publicly available.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Hammoud, Hasan Abed Al Kader; Prabhu, Ameya; Lim, Ser-Nam; Torr, Philip; Bibi, Adel; Ghanem, Bernard
Towards a True Evaluation of Rapid Adaptation in Online Continual Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
Abstract | Tags: ICCV | Links:
@conference{Hammoud2023,
title = {Towards a True Evaluation of Rapid Adaptation in Online Continual Learning},
author = {Hasan Abed Al Kader Hammoud and Ameya Prabhu and Ser-Nam Lim and Philip Torr and Adel Bibi and Bernard Ghanem},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2305.09275.pdf
https://arxiv.org/abs/2305.09275
https://github.com/drimpossible/EvalOCL},
doi = {https://doi.org/10.48550/arXiv.2305.09275},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {We revisit the common practice of evaluating adaptation of Online Continual Learning (OCL) algorithms through the metric of online accuracy, which measures the accuracy of the model on the immediate next few samples. However, we show that this metric is unreliable, as even vacuous blind classifiers, which do not use input images for prediction, can achieve unrealistically high online accuracy by exploiting spurious label correlations in the data stream. Our study reveals that existing OCL algorithms can also achieve high online accuracy, but perform poorly in retaining useful information, suggesting that they unintentionally learn spurious label correlations. To address this issue, we propose a novel metric for measuring adaptation based on the accuracy on the near-future samples, where spurious correlations are removed. We benchmark existing OCL approaches using our proposed metric on large-scale datasets under various computational budgets and find that better generalization can be achieved by retaining and reusing past seen information. We believe that our proposed metric can aid in the development of truly adaptive OCL methods. We provide code to reproduce our results at https://github.com/drimpossible/EvalOCL.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
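The proposed fix in the abstract above is to score adaptation on near-future samples rather than the immediately next ones, so that spurious label correlations in the stream no longer reward blind classifiers. A small sketch of such a metric is below; the offset and window sizes, and the predict(t, x) interface, are assumptions, not the benchmark's exact protocol.

    import numpy as np

    def near_future_accuracy(predict, stream_x, stream_y, offset=100, window=50):
        """Accuracy of the model state at time t on samples offset steps ahead."""
        correct, total = 0, 0
        for t in range(len(stream_x) - offset - window):
            future_x = stream_x[t + offset : t + offset + window]
            future_y = stream_y[t + offset : t + offset + window]
            correct += (predict(t, future_x) == future_y).sum()
            total += window
        return correct / max(total, 1)

    blind = lambda t, xs: np.zeros(len(xs), dtype=int)        # a label-agnostic baseline
    xs, ys = np.random.randn(1000, 8), np.random.randint(0, 3, size=1000)
    print(near_future_accuracy(blind, xs, ys))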
Thawakar, Omkar; Anwer, Rao Muhammad; Laaksonen, Jorma; Reiner, Orly; Shah, Mubarak; Khan, Fahad Shahbaz
3D Mitochondria Instance Segmentation with Spatio-Temporal Transformers Conference
Lecture Notes in Computer Science, vol. 14227, Medical Image Computing and Computer Assisted Intervention – MICCAI 2023, 2023, ISBN: 978-3-031-43993-3.
Abstract | Tags: | Links:
@conference{nokey,
title = {3D Mitochondria Instance Segmentation with Spatio-Temporal Transformers},
author = {Omkar Thawakar and Rao Muhammad Anwer and Jorma Laaksonen and Orly Reiner and Mubarak Shah and Fahad Shahbaz Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.12073.pdf
https://github.com/OmkarThawakar/STT-UNET
https://arxiv.org/pdf/2303.12073.pdf
https://link.springer.com/chapter/10.1007/978-3-031-43993-3_59
},
doi = {https://doi.org/10.1007/978-3-031-43993-3_59},
isbn = {978-3-031-43993-3},
year = {2023},
date = {2023-10-01},
booktitle = {Lecture Notes in Computer Science},
journal = {arXiv:2303.12073},
volume = {14227},
publisher = {Medical Image Computing and Computer Assisted Intervention – MICCAI 2023},
abstract = {Accurate 3D mitochondria instance segmentation in electron microscopy (EM) is a challenging problem and serves as a prerequisite to empirically analyze their distributions and morphology. Most existing approaches employ 3D convolutions to obtain representative features. However, these convolution-based approaches struggle to effectively capture long-range dependencies in the volume mitochondria data, due to their limited local receptive field. To address this, we propose a hybrid encoder-decoder framework based on a split spatio-temporal attention module that efficiently computes spatial and temporal self-attentions in parallel, which are later fused through a deformable convolution. Further, we introduce a semantic foreground-background adversarial loss during training that aids in delineating the region of mitochondria instances from the background clutter. Our extensive experiments on three benchmarks, Lucchi, MitoEM-R and MitoEM-H, reveal the benefits of the proposed contributions achieving state-of-the-art results on all three datasets. Our code and models are available at https://github.com/OmkarThawakar/STT-UNET.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Kini, Jyoti; Fleischer, Sarah; Dave, Ishan; Shah, Mubarak
Ensemble Modeling for Multimodal Visual Action Recognition Workshop
22nd International Conference on Image Analysis and Processing Workshops - Multimodal Action Recognition on the MECCANO Dataset, 2023.
Tags: ICIAPW, REU, Video Action Recognition | Links:
@workshop{Kini2023b,
title = {Ensemble Modeling for Multimodal Visual Action Recognition},
author = {Jyoti Kini and Sarah Fleischer and Ishan Dave and Mubarak Shah},
url = {https://arxiv.org/pdf/2308.05430.pdf
https://www.crcv.ucf.edu/research/projects/ensemble-modeling-for-multimodal-visual-action-recognition/},
year = {2023},
date = {2023-09-11},
urldate = {2023-09-11},
booktitle = {22nd International Conference on Image Analysis and Processing Workshops - Multimodal Action Recognition on the MECCANO Dataset},
keywords = {ICIAPW, REU, Video Action Recognition},
pubstate = {published},
tppubtype = {workshop}
}
Zhu, Sijie; Yang, Linjie; Chen, Chen; Shah, Mubarak; Shen, Xiaohui; Wang, Heng
R2Former: Unified retrieval and ranking Transformer for Place Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Abstract | Tags: CVPR | Links:
@conference{Zhu2023,
title = {R2Former: Unified retrieval and ranking Transformer for Place Recognition},
author = {Sijie Zhu and Linjie Yang and Chen Chen and Mubarak Shah and Xiaohui Shen and Heng Wang},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CVPR_2023_PlaceRecognitionFinal.pdf
https://arxiv.org/pdf/2304.03410.pdf
https://github.com/Jeff-Zilence/R2Former},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Visual Place Recognition (VPR) estimates the location of query images by matching them with images in a reference database. Conventional methods generally adopt aggregated CNN features for global retrieval and RANSAC-based geometric verification for reranking. However, RANSAC only considers geometric information but ignores other possible information that could be useful for reranking, e.g. local feature correlation, and attention values. In this paper, we propose a unified place recognition framework that handles both retrieval and reranking with a novel transformer model, named R2Former. The proposed reranking module takes feature correlation, attention value, and xy coordinates into account, and learns to determine whether the image pair is from the same location. The whole pipeline is end-to-end trainable and the reranking module alone can also be adopted on other CNN or transformer backbones as a generic component. Remarkably, R2Former significantly outperforms state-of-the-art methods on major VPR datasets with much less inference time and memory consumption. It also achieves the state-of-the-art on the holdout MSLS challenge set and could serve as a simple yet strong solution for real-world large-scale applications. Experiments also show vision transformer tokens are comparable and sometimes better than CNN local features on local matching. The code will be publicly available.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Gupta, Rohit; Roy, Anirban; Kim, Sujeong; Christensen, Claire; Grindal, Todd; Gerard, Sarah Nixon; Cincebeaux, Madeline; Divakaran, Ajay; Shah, Mubarak
Class Prototypes based Contrastive Learning for Classifying Multi-Label and Fine-Grained Educational Videos Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Abstract | Tags: CVPR | Links:
@conference{Gupta2023b,
title = {Class Prototypes based Contrastive Learning for Classifying Multi-Label and Fine-Grained Educational Videos},
author = {Rohit Gupta and Anirban Roy and Sujeong Kim and Claire Christensen and Todd Grindal and Sarah Nixon Gerard and Madeline Cincebeaux and Ajay Divakaran and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Rohit_SRI_CVPR2023_Multi_Modal_Multi_Label_Contrastive_Learning_Camera_Ready-4.pdf
https://www.rohitg.xyz/MMContrast/
https://nusci.csl.sri.com/project/APPROVE},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {The recent growth in the consumption of online media by children during early childhood necessitates data-driven tools enabling educators to filter out appropriate educational content for young learners. This paper presents an approach for detecting educational content in online videos. We focus on two widely used educational content classes: literacy and math. For each class, we choose prominent codes (sub-classes) based on the Common Core Standards. For example, literacy codes include ‘letter names’, ‘letter sounds’, and math codes include ‘counting’, ‘sorting’. We pose this as a fine-grained multilabel classification problem as videos can contain multiple types of educational content and the content classes can get visually similar (e.g., ‘letter names’ vs ‘letter sounds’). We propose a novel class prototypes based supervised contrastive learning approach that can handle fine-grained samples associated with multiple labels. We learn a class prototype for each class and a loss function is employed to minimize the distances between a class prototype and the samples from the class. Similarly, distances between a class prototype and the samples from other classes are maximized. As the alignment between visual and audio cues is crucial for effective comprehension, we consider a multimodal transformer network to capture the interaction between visual and audio cues in videos while learning the embedding for videos. For evaluation, we present a dataset, APPROVE, employing educational videos from YouTube labeled with fine-grained education classes by education researchers. APPROVE consists of 193 hours of expert-annotated videos with 19 classes. The proposed approach outperforms strong baselines on APPROVE and other benchmarks such as Youtube-8M, and COIN. The dataset is available at https://nusci.csl.sri.com/project/APPROVE.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
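The approach summarized above learns one prototype per class and pulls a sample's embedding toward the prototypes of all of its labels while pushing it away from the rest. A compact PyTorch rendering of that idea for multi-label batches is sketched below; the temperature, the normalization, and the exact loss form are assumptions and do not reproduce the paper's multimodal transformer or training details.

    import torch
    import torch.nn.functional as F

    def prototype_contrastive_loss(z, prototypes, labels, tau=0.1):
        """z: (B, D) embeddings; prototypes: (C, D); labels: (B, C) multi-hot."""
        z = F.normalize(z, dim=-1)
        p = F.normalize(prototypes, dim=-1)
        logits = z @ p.t() / tau                         # cosine similarity to every prototype
        log_prob = F.log_softmax(logits, dim=-1)
        pos_count = labels.sum(dim=-1).clamp(min=1)      # a sample may carry several labels
        loss = -(labels * log_prob).sum(dim=-1) / pos_count
        return loss.mean()

    B, C, D = 4, 19, 256
    loss = prototype_contrastive_loss(torch.randn(B, D),
                                      torch.randn(C, D, requires_grad=True),
                                      torch.randint(0, 2, (B, C)).float())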
Dave, Ishan Rajendrakumar; Rizve, Mamshad Nayeem; Chen, Chen; Shah, Mubarak
TimeBalance: Temporally-Invariant and Temporally-Distinctive Video Representations for Semi-Supervised Action Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Abstract | Tags: CVPR | Links:
@conference{Dave2023,
title = {TimeBalance: Temporally-Invariant and Temporally-Distinctive Video Representations for Semi-Supervised Action Recognition},
author = {Ishan Rajendrakumar Dave and Mamshad Nayeem Rizve and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/TimeBalance_CVPR23_arxiv.pdf
https://daveishan.github.io/timebalance_webpage/
https://github.com/DAVEISHAN/TimeBalance},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Semi-Supervised Learning can be more beneficial for the video domain compared to images because of its higher annotation cost and dimensionality. Besides, any video understanding task requires reasoning over both spatial and temporal dimensions. In order to learn both the static and motion related features for the semi-supervised action recognition task, existing methods rely on hard input inductive biases like using two-modalities (RGB and Optical-flow) or two-stream of different playback rates. Instead of utilizing unlabeled videos through diverse input streams, we rely on self-supervised video representations, particularly, we utilize temporally-invariant and temporally-distinctive representations. We observe that these representations complement each other depending on the nature of the action. Based on this observation, we propose a student-teacher semi-supervised learning framework, TimeBalance, where we distill the knowledge from a temporally-invariant and a temporally-distinctive teacher. Depending on the nature of the unlabeled video, we dynamically combine the knowledge of these two teachers based on a novel temporal similarity-based reweighting scheme. Our method achieves state-of-the-art performance on three action recognition benchmarks: UCF101, HMDB51, and Kinetics400. Code: https://github.com/DAVEISHAN/TimeBalance.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
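TimeBalance, per the abstract above, dynamically mixes the knowledge of a temporally-invariant and a temporally-distinctive teacher for each video. The fragment below sketches only the mixing and the distillation loss, with the per-video weight coming from some similarity score in [0, 1]; the actual reweighting scheme and teacher models are not reproduced, and the names here are assumptions.

    import torch
    import torch.nn.functional as F

    def mixed_teacher_targets(inv_logits, dis_logits, similarity, tau=1.0):
        """similarity in [0, 1]: higher values lean on the temporally-invariant teacher."""
        w = similarity.view(-1, 1)
        p_inv = F.softmax(inv_logits / tau, dim=-1)
        p_dis = F.softmax(dis_logits / tau, dim=-1)
        return w * p_inv + (1.0 - w) * p_dis

    def distillation_loss(student_logits, target_probs, tau=1.0):
        return F.kl_div(F.log_softmax(student_logits / tau, dim=-1),
                        target_probs, reduction="batchmean") * tau * tau

    targets = mixed_teacher_targets(torch.randn(8, 101), torch.randn(8, 101), torch.rand(8))
    loss = distillation_loss(torch.randn(8, 101), targets)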
Rizve, Mamshad Nayeem; Mittal, Gaurav; Yu, Ye; Hall, Matthew; Sajeev, Sandra; Shah, Mubarak; Chen, Mei
PivoTAL: Prior-Driven Supervision for Weakly-Supervised Temporal Action Localization Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Tags: CVPR | Links:
@conference{Rizve2023,
title = {PivoTAL: Prior-Driven Supervision for Weakly-Supervised Temporal Action Localization},
author = {Mamshad Nayeem Rizve and Gaurav Mittal and Ye Yu and Matthew Hall and Sandra Sajeev and Mubarak Shah and Mei Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR_2023.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR_2023_Supplemental_Material.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR2023_Poster.pdf
https://www.youtube.com/watch?v=6kAoQjXfzio},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Urooj, Aisha; Kuehne, Hilde; Wu, Bo; Chheu, Kim; Bousselham, Walid; Gan, Chuang; Lobo, Niels; Shah, Mubarak
Learning Situation Hyper-Graphs for Video Question Answering Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Abstract | Tags: CVPR, REU | Links:
@conference{Urooj2023,
title = {Learning Situation Hyper-Graphs for Video Question Answering},
author = {Aisha Urooj and Hilde Kuehne and Bo Wu and Kim Chheu and Walid Bousselham and Chuang Gan and Niels Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2023072364-4.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/SHG_VQA_CVPR2023_cam_ready_supp.pdf
},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Answering questions about complex situations in videos requires not only capturing the presence of actors, objects, and their relations but also the evolution of these relationships over time. A situation hyper-graph is a representation that describes situations as scene sub-graphs for video frames and hyper-edges for connected sub-graphs and has been proposed to capture all such information in a compact structured form. In this work, we propose an architecture for Video Question Answering (VQA) that enables answering questions related to video content by predicting situation hyper-graphs, coined Situation Hyper-Graph based Video Question Answering (SHG-VQA). To this end, we train a situation hyper-graph decoder to implicitly identify graph representations with actions and object/human-object relationships from the input video clip, and to use cross-attention between the predicted situation hyper-graphs and the question embedding to predict the correct answer. The proposed method is trained in an end-to-end manner and optimized by a VQA loss with the cross-entropy function and a Hungarian matching loss for the situation graph prediction. The effectiveness of the proposed architecture is extensively evaluated on two challenging benchmarks: AGQA and STAR. Our results show that learning the underlying situation hypergraphs helps the system to significantly improve its performance for novel challenges of video question-answering tasks.},
keywords = {CVPR, REU},
pubstate = {published},
tppubtype = {conference}
}
Bhunia, Ankan Kumar; Khan, Salman; Cholakkal, Hisham; Anwer, Rao Muhammad; Laaksonen, Jorma Tapio; Shah, Mubarak; Khan, Fahad
Person Image Synthesis via Denoising Diffusion Model Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Abstract | Tags: CVPR | Links:
@conference{Bhunia2023,
title = {Person Image Synthesis via Denoising Diffusion Model},
author = {Ankan Kumar Bhunia and Salman Khan and Hisham Cholakkal and Rao Muhammad Anwer and Jorma Tapio Laaksonen and Mubarak Shah and Fahad Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/person_image_synthesis_via_den-Camera-ready-PDF.pdf
https://lnkd.in/d-8v3r8B
https://lnkd.in/dGPTjvge
https://lnkd.in/dxcGQsUX
https://github.com/ankanbhunia/PIDM},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {The pose-guided person image generation task requires synthesizing photorealistic images of humans in arbitrary poses. The existing approaches use generative adversarial networks that do not necessarily maintain realistic textures or need dense correspondences that struggle to handle complex deformations and severe occlusions. In this work, we show how denoising diffusion models can be applied for high-fidelity person image synthesis with strong sample diversity and enhanced mode coverage of the learnt data distribution. Our proposed Person Image Diffusion Model (PIDM) disintegrates the complex transfer problem into a series of simpler forward-backward denoising steps. This helps in learning plausible source-to-target transformation trajectories that result in faithful textures and undistorted appearance details. We introduce a ‘texture diffusion module’ based on cross-attention to accurately model the correspondences between appearance and pose information available in source and target images. Further, we propose ‘disentangled classifier-free guidance’ to ensure close resemblance between the conditional inputs and the synthesized output in terms of both pose and appearance information. Our extensive results on two large-scale benchmarks and a user study demonstrate the photorealism of our proposed approach under challenging scenarios. We also show how our generated images can help in downstream tasks. Code is available at https://github.com/ankanbhunia/PIDM.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Wasim, Syed Talal; Naseer, Muzammal; Khan, Salman; Khan, Fahad; Shah, Mubarak
Vita-CLIP: Video and text adaptive CLIP via Multimodal Prompting Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Abstract | Tags: CVPR | Links:
@conference{Wasim2023,
title = {Vita-CLIP: Video and text adaptive CLIP via Multimodal Prompting},
author = {Syed Talal Wasim and Muzammal Naseer and Salman Khan and Fahad Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/vita_clip_video_and_text_adapt-Camera-ready-PDF.pdf
},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Adopting contrastive image-text pretrained models like CLIP towards video classification has gained attention due to its cost-effectiveness and competitive performance. However, recent works in this area face a trade-off. Finetuning the pretrained model to achieve strong supervised performance results in low zero-shot generalization. Similarly, freezing the backbone to retain zero-shot capability causes a significant drop in supervised accuracy. Because of this, recent works in the literature typically train separate models for supervised and zero-shot action recognition. In this work, we propose a multimodal prompt learning scheme that works to balance the supervised and zero-shot performance under a single unified training. Our prompting approach on the vision side caters for three aspects: 1) Global video-level prompts to model the data distribution; 2) Local frame-level prompts to provide per-frame discriminative conditioning; and 3) a summary prompt to extract a condensed video representation. Additionally, we define a prompting scheme on the text side to augment the textual context. Through this prompting scheme, we can achieve state-of-the-art zero-shot performance on Kinetics-600, HMDB51 and UCF101 while remaining competitive in the supervised setting. By keeping the pretrained backbone frozen, we optimize a much lower number of parameters and retain the existing general representation which helps achieve the strong zero-shot performance. Our codes and models will be publicly released.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Clark, Brandon Eric; Kerrigan, Alec; Kulkarni, Parth Parag; Cepeda, Vicente Vivanco; Shah, Mubarak
Where We Are and What We're Looking At: Query Based Worldwide Image Geo-localization Using Hierarchies and Scenes Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Abstract | Tags: CVPR | Links:
@conference{Clark2023,
title = {Where We Are and What We're Looking At: Query Based Worldwide Image Geo-localization Using Hierarchies and Scenes},
author = {Brandon Eric Clark and Alec Kerrigan and Parth Parag Kulkarni and Vicente Vivanco Cepeda and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Camera-Ready-Full-Paper.pdf
https://github.com/AHKerrigan/GeoGuessNet
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CVPR23-Poster_THU-PM-246-1.pdf
https://www.youtube.com/watch?v=fp3hZGbwPqk},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Determining the exact latitude and longitude that a photo was taken is a useful and widely applicable task, yet it remains exceptionally difficult despite the accelerated progress of other computer vision tasks. Most previous approaches have opted to learn single representations of query images, which are then classified at different levels of geographic granularity. These approaches fail to exploit the different visual cues that give context to different hierarchies, such as the country, state, and city level. To this end, we introduce an end-to-end transformer-based architecture that exploits the relationship between different geographic levels (which we refer to as hierarchies) and the corresponding visual scene information in an image through hierarchical cross-attention. We achieve this by learning a query for each geographic hierarchy and scene type. Furthermore, we learn a separate representation for different environmental scenes, as different scenes in the same location are often defined by completely different visual features. We achieve state of the art accuracy on 4 standard geo-localization datasets: Im2GPS, Im2GPS3k, YFCC4k, and YFCC26k, as well as qualitatively demonstrate how our method learns different representations for different visual hierarchies and scenes, which has not been demonstrated in the previous methods. The previous testing datasets mentioned above mostly consist of iconic landmarks or images taken from social media, which makes the dataset a simple memory task, or makes it biased towards certain places. To address this issue we introduce a much harder testing dataset, Google-World-Streets-15k, comprised of images taken from Google Streetview covering the whole planet and present state of the art results. Our code can be found at https://github.com/AHKerrigan/GeoGuessNet.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Rana, Aayush; Rawat, Yogesh
Hybrid Active Learning via Deep Clustering for Video Action Detection Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Tags: CVPR | Links:
@conference{Rana2023,
title = {Hybrid Active Learning via Deep Clustering for Video Action Detection},
author = {Aayush Rana and Yogesh Rawat},
url = {https://www.crcv.ucf.edu/research/projects/hybrid-active-learning-via-deep-clustering-for-video-action-detection/},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Chantry, Madeline; Biyani, Naman; Kamtam, Prudvi; Vyas, Shruti; Palangi, Hamid; Vineet, Vibhav; Rawat, Yogesh
A Large-scale Robustness Analysis of Video Action Recognition Models Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Tags: CVPR | Links:
@conference{Chantry2023,
title = {A Large-scale Robustness Analysis of Video Action Recognition Models},
author = {Madeline Chantry and Naman Biyani and Prudvi Kamtam and Shruti Vyas and Hamid Palangi and Vibhav Vineet and Yogesh Rawat},
url = {https://sites.google.com/view/videorobustnessbenchmark/home
https://www.crcv.ucf.edu/research/projects/ucf101-ds-action-recognition-for-real-world-distribution-shifts/
https://github.com/Maddy12/ActionRecognitionRobustnessEval
https://youtu.be/pv2AJ_t-v90
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PosterCVPR2023.png},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Zhu, Sijie; Lin, Zhe; Cohen, Scott; Kuen, Jason; Zhang, Zhifei; Chen, Chen
TopNet: Transformer-based Object Placement Network for Image Compositing Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Tags: CVPR | Links:
@conference{Zhu2023b,
title = {TopNet: Transformer-based Object Placement Network for Image Compositing },
author = {Sijie Zhu and Zhe Lin and Scott Cohen and Jason Kuen and Zhifei Zhang and Chen Chen},
url = {https://arxiv.org/pdf/2304.03372.pdf},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Ce; Mendieta, Matias; Yang, Taojiannan; Qi, Guo-Jun; Chen, Chen
FeatER: An Efficient Network for Human Reconstruction via Feature Map-Based TransformER Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Tags: CVPR | Links:
@conference{Zheng2023,
title = {FeatER: An Efficient Network for Human Reconstruction via Feature Map-Based TransformER},
author = {Ce Zheng and Matias Mendieta and Taojiannan Yang and Guo-Jun Qi and Chen Chen},
url = {https://arxiv.org/pdf/2205.15448.pdf
https://zczcwh.github.io/feater_page/
https://github.com/zczcwh/FeatER},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Ce; Liu, Xianpeng; Qi, Guo-Jun; Chen, Chen
POTTER: Pooling Attention Transformer for Efficient Human Mesh Recovery Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Tags: CVPR | Links:
@conference{Zheng2023b,
title = {POTTER: Pooling Attention Transformer for Efficient Human Mesh Recovery},
author = {Ce Zheng and Xianpeng Liu and Guo-Jun Qi and Chen Chen},
url = {https://arxiv.org/pdf/2303.13357.pdf
https://zczcwh.github.io/potter_page/
https://github.com/zczcwh/POTTER},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Zhao, Qitao; Zheng, Ce; Liu, Mengyuan; Wang, Pichao; Chen, Chen
PoseFormerV2: Exploring Frequency Domain for Efficient and Robust 3D Human Pose Estimation Conference
IEEE Computer Vision and Pattern Recognition, 2023.
Tags: CVPR | Links:
@conference{nokey,
title = {PoseFormerV2: Exploring Frequency Domain for Efficient and Robust 3D Human Pose Estimation},
author = {Qitao Zhao and Ce Zheng and Mengyuan Liu and Pichao Wang and Chen Chen},
url = {https://arxiv.org/pdf/2303.17472.pdf
https://qitaozhao.github.io/PoseFormerV2
https://github.com/QitaoZhao/PoseFormerV2},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Cui, Xuanming; Aparcedo, Alejandro; Jang, Young Kyun; Lim, Ser-Nam
On the Robustness of Large Multimodal Models Against Image Adversarial Attacks Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference, 2023.
Abstract | Tags: CVPR, REU | Links:
@conference{Cui2023,
title = {On the Robustness of Large Multimodal Models Against Image Adversarial Attacks},
author = {Xuanming Cui and Alejandro Aparcedo and Young Kyun Jang and Ser-Nam Lim},
url = {https://arxiv.org/pdf/2312.03777
https://arxiv.org/abs/2312.03777},
doi = {https://doi.org/10.48550/arXiv.2312.03777},
year = {2023},
date = {2023-06-18},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference},
abstract = {Recent advances in instruction tuning have led to the development of State-of-the-Art Large Multimodal Models (LMMs). Given the novelty of these models, the impact of visual adversarial attacks on LMMs has not been thoroughly examined. We conduct a comprehensive study of the robustness of various LMMs against different adversarial attacks, evaluated across tasks including image classification, image captioning, and Visual Question Answer (VQA). We find that in general LMMs are not robust to visual adversarial inputs. However, our findings suggest that context provided to the model via prompts, such as questions in a QA pair helps to mitigate the effects of visual adversarial inputs. Notably, the LMMs evaluated demonstrated remarkable resilience to such attacks on the ScienceQA task with only an 8.10% drop in performance compared to their visual counterparts which dropped 99.73%. We also propose a new approach to real-world image classification which we term query decomposition. By incorporating existence queries into our input prompt we observe diminished attack effectiveness and improvements in image classification accuracy. This research highlights a previously under-explored facet of LMM robustness and sets the stage for future work aimed at strengthening the resilience of multimodal systems in adversarial environments.},
keywords = {CVPR, REU},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Ce; Wu, Wenhan; Chen, Chen; Yang, Taojiannan; Zhu, Sijie; Shen, Ju; Kehtarnavaz, Nasser; Shah, Mubarak
Deep Learning-Based Human Pose Estimation: A Survey Journal Article
In: ACM Computing Surveys, 2023.
@article{Zheng2023c,
title = {Deep Learning-Based Human Pose Estimation: A Survey},
author = {Ce Zheng and Wenhan Wu and Chen Chen and Taojiannan Yang and Sijie Zhu and Ju Shen and Nasser Kehtarnavaz and Mubarak Shah},
editor = {Albert Y H Zomaya},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/3603618.pdf
https://github.com/zczcwh/DL-HPE},
doi = {10.1145/3603618},
year = {2023},
date = {2023-06-09},
journal = {ACM Computing Surveys},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Kini, Jyoti; Mian, Ajmal; Shah, Mubarak
3DMODT: Attention-Guided Affinities for Joint Detection & Tracking in 3D Point Clouds Conference
IEEE International Conference on Robotics and Automation, 2023.
Tags: ICRA | Links:
@conference{Kini2023,
title = {3DMODT: Attention-Guided Affinities for Joint Detection & Tracking in 3D Point Clouds},
author = {Jyoti Kini and Ajmal Mian and Mubarak Shah},
url = {https://arxiv.org/pdf/2211.00746.pdf},
year = {2023},
date = {2023-05-29},
urldate = {2023-05-29},
booktitle = {IEEE International Conference on Robotics and Automation},
keywords = {ICRA},
pubstate = {published},
tppubtype = {conference}
}
Sangam, Tushar; Dave, Ishan Rajendrakumar; Sultani, Waqas; Shah, Mubarak
TransVisDrone: Spatio-Temporal Transformer for Vision-based Drone-to-Drone Detection in Aerial Videos Conference
IEEE International Conference on Robotics and Automation, 2023.
Tags: ICRA | Links:
@conference{Sangam2023,
title = {TransVisDrone: Spatio-Temporal Transformer for Vision-based Drone-to-Drone Detection in Aerial Videos},
author = {Tushar Sangam and Ishan Rajendrakumar Dave and Waqas Sultani and Mubarak Shah},
url = {https://arxiv.org/pdf/2210.08423.pdf},
year = {2023},
date = {2023-05-29},
booktitle = {IEEE International Conference on Robotics and Automation},
keywords = {ICRA},
pubstate = {published},
tppubtype = {conference}
}
Yang, Taojiannan; Zhu, Yi; Xie, Yusheng; Zhang, Aston; Chen, Chen; Li, Mu
AIM: Adapting Image Models for Efficient Video Understanding Conference
Eleventh International Conference on Learning Representations (ICLR), 2023.
Abstract | Tags: ICLR
@conference{Yang2023,
title = {AIM: Adapting Image Models for Efficient Video Understanding},
author = {Taojiannan Yang and Yi Zhu and Yusheng Xie and Aston Zhang and Chen Chen and Mu Li},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
publisher = {Eleventh International Conference on Learning Representations (ICLR)},
abstract = {Recent vision transformer based video models mostly follow the ``image pre-training then finetuning" paradigm and have achieved great success on multiple video benchmarks. However, full finetuning such a video model could be computationally expensive and unnecessary, given the pre-trained image transformer models have demonstrated exceptional transferability. In this work, we propose a novel method to Adapt pre-trained Image Models (AIM) for efficient video understanding. By freezing the pre-trained image model and adding a few lightweight Adapters, we introduce spatial adaptation, temporal adaptation and joint adaptation to gradually equip an image model with spatiotemporal reasoning capability. We show that our proposed AIM can achieve competitive or even better performance than prior arts with substantially fewer tunable parameters on four video action recognition benchmarks. Thanks to its simplicity, our method is also generally applicable to different image pre-trained models, which has the potential to leverage more powerful image foundation models in the future. },
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Yang, Peiyu; Akhtar, Naveed; Wen, Zeyi; Shah, Mubarak; Mian, Ajmal
Re-calibrating Feature Attributions for Model Interpretation Conference
Eleventh International Conference on Learning Representations (ICLR), notable top 25%, 2023.
Tags: ICLR
@conference{nokey,
title = {Re-calibrating Feature Attributions for Model Interpretation},
author = {Peiyu Yang and Naveed Akhtar and Zeyi Wen and Mubarak Shah and Ajmal Mian},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
publisher = {Eleventh International Conference on Learning Representations (ICLR), notable top 25%},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Beetham, James; Kardan, Navid; Mian, Ajmal; Shah, Mubarak
Dual Student Networks for Data-Free Model Stealing Conference
Eleventh International Conference on Learning Representations, 2023.
Tags: ICLR | Links:
@conference{Beetham2023b,
title = {Dual Student Networks for Data-Free Model Stealing},
author = {James Beetham and Navid Kardan and Ajmal Mian and Mubarak Shah},
url = {https://arxiv.org/abs/2309.10058},
year = {2023},
date = {2023-05-01},
booktitle = {Eleventh International Conference on Learning Representations},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Barbalau, Antonio; Ionescu, Radu Tudor; Georgescu, Mariana-Iuliana; Dueholm, Jacob; Ramachandra, Bharathkumar; Nasrollahi, Kamal; Khan, Fahad Shahbaz; Moeslund, Thomas B.; Shah, Mubarak
SSMTL++: Revisiting Self-Supervised Multi-Task Learning for Video Anomaly Detection Journal Article
In: Computer Vision and Image Understanding, 2023.
Tags: CVIU | Links:
@article{Barbalau2023,
title = {SSMTL++: Revisiting Self-Supervised Multi-Task Learning for Video Anomaly Detection},
author = {Antonio Barbalau and Radu Tudor Ionescu and Mariana-Iuliana Georgescu and Jacob Dueholm and Bharathkumar Ramachandra and Kamal Nasrollahi and Fahad Shahbaz Khan and Thomas B. Moeslund and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/SSMTL.pdf},
year = {2023},
date = {2023-02-11},
urldate = {2023-02-11},
journal = {Computer Vision and Image Understanding},
keywords = {CVIU},
pubstate = {published},
tppubtype = {article}
}
Vahidian, Saeed; Morafah, Mahdi; Wang, Weijia; Kungurtsev, Vyacheslav; Chen, Chen; Shah, Mubarak; Lin, Bill
Efficient Distribution Similarity Identification in Clustered Federated Learning via Principal Angles Between Client Data Subspaces Conference
37th AAAI Conference on Artificial Intelligence, 2023.
Tags: AAAI | Links:
@conference{Vahidian2023,
title = {Efficient Distribution Similarity Identification in Clustered Federated Learning via Principal Angles Between Client Data Subspaces},
author = {Saeed Vahidian and Mahdi Morafah and Weijia Wang and Vyacheslav Kungurtsev and Chen Chen and Mubarak Shah and Bill Lin},
url = {https://arxiv.org/abs/2209.10526},
year = {2023},
date = {2023-02-07},
urldate = {2023-02-07},
publisher = {37th AAAI Conference on Artificial Intelligence},
keywords = {AAAI},
pubstate = {published},
tppubtype = {conference}
}
Zhong, Xian; Li, Zipeng; Chen, Shuqin; Jiang, Kui; Chen, Chen; Ye, Mang
Refined Semantic Enhancement Towards Frequency Diffusion for Video Captioning Conference
37th AAAI Conference on Artificial Intelligence, 2023.
Tags: AAAI | Links:
@conference{Zhong2023,
title = {Refined Semantic Enhancement Towards Frequency Diffusion for Video Captioning},
author = {Xian Zhong and Zipeng Li and Shuqin Chen and Kui Jiang and Chen Chen and Mang Ye},
url = {https://arxiv.org/abs/2211.15076},
year = {2023},
date = {2023-02-07},
publisher = {37th AAAI Conference on Artificial Intelligence},
keywords = {AAAI},
pubstate = {published},
tppubtype = {conference}
}
Liu, Mengyuan; Meng, Fanyang; Chen, Chen; Wu, Songtao
Novel Motion Patterns Matter for Practical Skeleton-based Action Recognition Conference
37th AAAI Conference on Artificial Intelligence, 2023.
Tags: AAAI
@conference{Liu2023,
title = {Novel Motion Patterns Matter for Practical Skeleton-based Action Recognition},
author = {Mengyuan Liu and Fanyang Meng and Chen Chen and Songtao Wu},
year = {2023},
date = {2023-02-07},
publisher = {37th AAAI Conference on Artificial Intelligence},
keywords = {AAAI},
pubstate = {published},
tppubtype = {conference}
}
Gupta, Rohit; Akhtar, Naveed; Mian, Ajmal; Shah, Mubarak
Contrastive Self-Supervised Learning Leads to Higher Adversarial Susceptibility Conference
37th AAAI Conference on Artificial Intelligence, 2023.
Tags: AAAI | Links:
@conference{Gupta2023,
title = {Contrastive Self-Supervised Learning Leads to Higher Adversarial Susceptibility},
author = {Rohit Gupta and Naveed Akhtar and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2207.10862.pdf},
year = {2023},
date = {2023-02-07},
publisher = {37th AAAI Conference on Artificial Intelligence},
keywords = {AAAI},
pubstate = {published},
tppubtype = {conference}
}
2022
Rana, Aayush; Rawat, Yogesh
Are all Frames Equal? Active Sparse Labeling for Video Action Detection Conference
36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022.
Abstract | Tags: NeurIPS | Links:
@conference{nokey,
title = {Are all Frames Equal? Active Sparse Labeling for Video Action Detection },
author = {Aayush Rana and Yogesh Rawat},
url = {https://www.crcv.ucf.edu/research/projects/active-sparse-labeling-for-video-action-detection/
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/neurips_poster_ASL_upload.png
https://github.com/aayushjr/ASL-video },
year = {2022},
date = {2022-11-28},
urldate = {2022-11-28},
publisher = {36th Conference on Neural Information Processing Systems (NeurIPS 2022)},
abstract = {Video action detection requires annotations at every frame, which drastically increases the labeling cost. In this work, we focus on efficient labeling of videos for action detection to minimize this cost. We propose active sparse labeling (ASL), a novel active learning strategy for video action detection. We propose a novel frame-level scoring mechanism aimed at selecting the most informative frames in a video. We also introduce a novel loss formulation which enables training of action detection model with these sparsely selected frames. We evaluated the proposed approach on two different action detection benchmark datasets, UCF-101-24 and J-HMDB-21, and observed that active sparse labeling can be very effective in saving annotation costs. We demonstrate that the proposed approach performs better than random selection, outperforming all other baselines, with performance comparable to supervised approach using merely 10% annotations.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
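The active sparse labeling idea above hinges on a frame-level scoring mechanism that picks the most informative frames of a video for annotation. The sketch below uses plain per-frame prediction entropy as a stand-in score and selects a small budget of frames; the paper's actual scoring function and the loss for training with sparse frames are not shown, and the budget is an assumption.

    import torch

    def select_frames_for_labeling(frame_probs, budget=5):
        """frame_probs: (T, C) per-frame class probabilities from the current detector."""
        entropy = -(frame_probs.clamp_min(1e-8).log() * frame_probs).sum(dim=-1)   # (T,)
        return torch.topk(entropy, k=min(budget, frame_probs.shape[0])).indices    # frames to annotate

    frame_probs = torch.softmax(torch.randn(120, 24), dim=-1)
    print(select_frames_for_labeling(frame_probs))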
Schiappa, Madeline Chantry; Vyas, Shruti; Palangi, Hamid; Rawat, Yogesh; Vineet, Vibhav
Robustness Analysis of Video-Language Models Against Visual and Language Perturbations Conference
36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022.
Abstract | Tags: NeurIPS | Links:
@conference{Schiappa2022,
title = {Robustness Analysis of Video-Language Models Against Visual and Language Perturbations},
author = {Madeline Chantry Schiappa and Shruti Vyas and Hamid Palangi and Yogesh Rawat and Vibhav Vineet},
url = {https://sites.google.com/view/videolanguagerobustness/home
https://openreview.net/forum?id=A79jAS4MeW9
https://github.com/Maddy12/VideoLanguageModelRobustness/tree/master},
year = {2022},
date = {2022-11-28},
publisher = {36th Conference on Neural Information Processing Systems (NeurIPS 2022)},
abstract = {Joint visual and language modeling on large-scale datasets has recently shown good progress in multi-modal tasks when compared to single modal learning. However, robustness of these approaches against real-world perturbations has not been studied. In this work, we perform the first extensive robustness study of video-language models against various real-world perturbations. We focus on text-to-video retrieval and propose two large-scale benchmark datasets, MSRVTT-P and YouCook2-P, which utilize 90 different visual and 35 different text perturbations. The study reveals some interesting initial findings from the studied models: 1) models are more robust when text is perturbed versus when video is perturbed, 2) models that are pre-trained are more robust than those trained from scratch, 3) models attend more to scene and objects rather than motion and action. We hope this study will serve as a benchmark and guide future research in robust video-language learning. The benchmark introduced in this study along with the code and datasets is available at https://bit.ly/3CNOly4.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Xu, Ziwei; Rawat, Yogesh; Wong, Yongkang; Kankanhalli, Mohan; Shah, Mubarak
Don’t Pour Cereal into Coffee: Differentiable Temporal Logic for Temporal Action Segmentation Conference
36th Conference on Neural Information Processing Systems (NeurIPS 2022), 2022.
Abstract | Tags: NeurIPS | Links:
@conference{Xu2022,
title = {Don’t Pour Cereal into Coffee: Differentiable Temporal Logic for Temporal Action Segmentation},
author = {Ziwei Xu and Yogesh Rawat and Yongkang Wong and Mohan Kankanhalli and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ziwei_neurips2022.pdf
https://diff-tl.github.io/
https://github.com/ZiweiXU/DTL-action-segmentation},
year = {2022},
date = {2022-11-09},
urldate = {2022-11-09},
publisher = {36th Conference on Neural Information Processing Systems (NeurIPS 2022)},
abstract = {We propose Differentiable Temporal Logic (DTL), a model-agnostic framework that introduces temporal constraints to deep networks. DTL treats the outputs of a network as a truth assignment of a temporal logic formula, and computes a temporal logic loss reflecting the consistency between the output and the constraints. We propose a comprehensive set of constraints, which are implicit in data annotations, and incorporate them with deep networks via DTL. We evaluate the effectiveness of DTL on the temporal action segmentation task and observe improved performance and reduced logical errors in the output of different task models. Furthermore, we provide an extensive analysis to visualize the desirable effects of DTL.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
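Illustrative note (a minimal sketch, not the paper's exact operators): a temporal-logic constraint can be made differentiable by treating per-frame action probabilities as soft truth values and turning temporal operators into (sub)differentiable reductions over time. The precedence constraint and the function names below are assumptions made for this sketch.

import torch

def eventually(p_t):
    # Soft truth value of "eventually phi", with p_t in [0, 1] per frame.
    return p_t.max()

def always(p_t):
    # Soft truth value of "always phi".
    return p_t.min()

def precedence_loss(p_a, p_b):
    # Penalize frames where action B is predicted although action A has not yet
    # occurred, i.e. a soft version of the constraint "A precedes B".
    a_seen = torch.cummax(p_a, dim=0).values      # running "A has occurred by time t"
    return (p_b * (1.0 - a_seen)).mean()

# Toy usage: per-frame probabilities for two actions over a 50-frame clip.
logits = torch.randn(50, 2, requires_grad=True)
p = torch.sigmoid(logits)
loss = precedence_loss(p[:, 0], p[:, 1]) + (1.0 - eventually(p[:, 0]))
loss.backward()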
Vyas, Shruti; Chen, Chen; Shah, Mubarak
GAMa: Cross-view Video Geo-localization Conference
European Conference on Computer Vision, 2022.
Abstract | Tags: ECCV | Links:
@conference{Vyas2022,
title = {GAMa: Cross-view Video Geo-localization},
author = {Shruti Vyas and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1512.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1512-supp.pdf
https://youtu.be/KSHuer_VXJo},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {The existing work in cross-view geo-localization is based on images where a ground panorama is matched to an aerial image. In this work, we focus on ground videos instead of images, which provide additional contextual cues that are important for this task. There are no existing datasets for this problem, therefore we propose the GAMa dataset, a large-scale dataset with ground videos and corresponding aerial images. We also propose a novel approach to solve this problem. At clip level, a short video clip is matched with the corresponding aerial image and is later used to get video-level geo-localization of a long video. Moreover, we propose a hierarchical approach to further improve the clip-level geo-localization. On this challenging dataset, with unaligned images and limited field of view, our proposed method achieves a Top-1 recall rate of 19.4% and 45.1% @1.0 mile. Code & dataset are available at this link.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Wang, Wenxuan; Chen, Chen; Wang, Jing; Zha, Sen; Zhang, Yan; Li, Jiangyun
Med-DANet: Dynamic Architecture Network for Efficient Medical Volumetric Segmentation Conference
European Conference on Computer Vision, 2022.
Abstract | Tags: ECCV | Links:
@conference{Wang2022,
title = {Med-DANet: Dynamic Architecture Network for Efficient Medical Volumetric Segmentation},
author = {Wenxuan Wang and Chen Chen and Jing Wang and Sen Zha and Yan Zhang and Jiangyun Li},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2206.06575.pdf},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {For 3D medical image (e.g. CT and MRI) segmentation, the difficulty of segmenting each slice in a clinical case varies greatly. Previous research on volumetric medical image segmentation in a slice-by-slice manner conventionally uses the identical 2D deep neural network to segment all the slices of the same case, ignoring the data heterogeneity among image slices. In this paper, we focus on multi-modal 3D MRI brain tumor segmentation and propose a dynamic architecture network named Med-DANet based on adaptive model selection to achieve an effective accuracy and efficiency trade-off. For each slice of the input 3D MRI volume, our proposed method learns a slice-specific decision by the Decision Network to dynamically select a suitable model from the predefined Model Bank for the subsequent 2D segmentation task. Extensive experimental results on both BraTS 2019 and 2020 datasets show that our proposed method achieves comparable or better results than previous state-of-the-art methods for 3D MRI brain tumor segmentation with much less model complexity. Compared with the state-of-the-art 3D method TransBTS, the proposed framework improves the model efficiency by up to 3.5× without sacrificing the accuracy. Our code will be publicly available at https://github.com/Wenxuan-1119/Med-DANet.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
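Illustrative note (a toy sketch of the general idea, not the released Med-DANet code): per-slice dynamic model selection can be pictured as a small decision network that routes each 2D slice to one member of a model bank. SliceRouter, segment_volume and the placeholder bank below are hypothetical.

import torch
import torch.nn as nn

class SliceRouter(nn.Module):
    # Tiny decision network: looks at one 2D slice and scores the candidate models.
    def __init__(self, in_ch, num_models):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_ch, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, num_models))

    def forward(self, x):
        return self.net(x)  # logits over the model bank

def segment_volume(volume, router, model_bank):
    # Route every slice of a (S, C, H, W) volume to the model chosen for it.
    outputs = []
    for s in range(volume.shape[0]):
        slice_ = volume[s:s + 1]                      # keep a batch dimension
        idx = router(slice_).argmax(dim=1).item()     # hard choice at inference time
        outputs.append(model_bank[idx](slice_))
    return torch.cat(outputs, dim=0)

# Toy usage: 8 slices of a 4-modality MRI volume, a bank of two placeholder segmenters.
bank = [nn.Conv2d(4, 3, 1), nn.Conv2d(4, 3, 3, padding=1)]
router = SliceRouter(in_ch=4, num_models=len(bank))
masks = segment_volume(torch.randn(8, 4, 64, 64), router, bank)   # (8, 3, 64, 64)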
Zhu, Sijie; Lin, Zhe; Cohen, Scott; Kuen, Jason; Zhang, Zhifei; Chen, Chen
GALA: Toward Geometry-and-Lighting-Aware Object Search for Compositing Conference
European Conference on Computer Vision, 2022.
Abstract | Tags: ECCV | Links:
@conference{Zhu2022,
title = {GALA: Toward Geometry-and-Lighting-Aware Object Search for Compositing},
author = {Sijie Zhu and Zhe Lin and Scott Cohen and Jason Kuen and Zhifei Zhang and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2204.00125.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/GALA_supplementary.pdf},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {Compositing-aware object search aims to find the most compatible objects for compositing given a background image and a query bounding box. Previous works focus on learning compatibility between the foreground object and background, but fail to learn other important factors from large-scale data, i.e. geometry and lighting. To move a step further, this paper proposes GALA (Geometry-and-Lighting-Aware), a generic foreground object search method with discriminative modeling on geometry and lighting compatibility for open-world image compositing. Remarkably, it achieves state-of-the-art results on the CAIS dataset and generalizes well on large-scale open-world datasets, i.e. Pixabay and Open Images. In addition, our method can effectively handle non-box scenarios, where users only provide background images without any input bounding box. A web demo (see supplementary materials) is built to showcase applications of the proposed method for compositing-aware search and automatic location/scale prediction for the foreground object.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Khan, Aisha Urooj; Kuehne, Hilde; Gan, Chuang; Lobo, Niels Da Vitoria; Shah, Mubarak
Weakly Supervised Grounding for VQA in Vision-Language Transformers Conference
European Conference on Computer Vision, 2022.
Abstract | Tags: ECCV | Links:
@conference{Khan2022,
title = {Weakly Supervised Grounding for VQA in Vision-Language Transformers},
author = {Aisha Urooj Khan and Hilde Kuehne and Chuang Gan and Niels Da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1011.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1011-supp.pdf
https://github.com/aurooj/WSG-VQA-VLTransformers
https://youtu.be/dekmVb6lq3I},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {Transformers for visual-language representation learning have been getting a lot of interest and have shown tremendous performance on visual question answering (VQA) and grounding. However, most systems that show good performance on those tasks still rely on pre-trained object detectors during training, which limits their applicability to the object classes available for those detectors. To mitigate this limitation, this paper focuses on the problem of weakly supervised grounding in the context of visual question answering in transformers. Our approach leverages capsules by transforming each visual token into a capsule representation in the visual encoder; it then uses activations from language self-attention layers as a text-guided selection module to mask those capsules before they are forwarded to the next layer. We evaluate our approach on the challenging GQA and VQA-HAT datasets for VQA grounding. Our experiments show that, while removing the information of masked objects from standard transformer architectures leads to a significant drop in performance, the integration of capsules significantly improves the grounding ability of such systems and provides new state-of-the-art results compared to other approaches in the field.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Rizve, Mamshad Nayeem; Kardan, Navid; Khan, Salman; Khan, Fahad Shahbaz; Shah, Mubarak
OpenLDN: Learning to Discover Novel Classes for Open-World Semi-Supervised Learning Conference
European Conference on Computer Vision, 2022.
Abstract | Tags: ECCV | Links:
@conference{Rizve2022,
title = {OpenLDN: Learning to Discover Novel Classes for Open-World Semi-Supervised Learning},
author = {Mamshad Nayeem Rizve and Navid Kardan and Salman Khan and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/6665.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/6665-supp.pdf
https://github.com/nayeemrizve/OpenLDN
https://youtu.be/p2lYqvklcjA},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {Semi-supervised learning (SSL) is one of the dominant approaches to address the annotation bottleneck of supervised learning. Recent SSL methods can effectively leverage a large repository of unlabeled data to improve performance while relying on a small set of labeled data. One common assumption in most SSL methods is that the labeled and unlabeled data are from the same data distribution. However, this is hardly the case in many real-world scenarios, which limits their applicability. In this work, instead, we attempt to solve the challenging open-world SSL problem that does not make such an assumption. In the open-world SSL problem, the objective is to recognize samples of known classes, and simultaneously detect and cluster samples belonging to novel classes present in unlabeled data. This work introduces OpenLDN that utilizes a pairwise similarity loss to discover novel classes. Using a bi-level optimization rule this pairwise similarity loss exploits the information available in the labeled set to implicitly cluster novel class samples, while simultaneously recognizing samples from known classes. After discovering novel classes, OpenLDN transforms the open-world SSL problem into a standard SSL problem to achieve additional performance gains using existing SSL methods. Our extensive experiments demonstrate that OpenLDN outperforms the current state-of-the-art methods on multiple popular classification benchmarks while providing a better accuracy/training time trade-off. Code: https://github.com/nayeemrizve/OpenLDN},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
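Illustrative note (a generic sketch, not the OpenLDN implementation): a pairwise similarity loss of the kind mentioned above can be written as a binary cross-entropy on the dot product of two samples' class posteriors, where the pairwise target says whether the two samples are believed to share a class. How the targets are obtained is left abstract here and the names are hypothetical.

import torch
import torch.nn.functional as F

def pairwise_similarity_loss(p_i, p_j, target_sim):
    # p_i, p_j: (N, C) class posteriors for two batches of paired samples.
    # target_sim: (N,) in {0, 1}; 1 if the pair is believed to share a class.
    sim = (p_i * p_j).sum(dim=1).clamp(1e-6, 1.0 - 1e-6)
    return F.binary_cross_entropy(sim, target_sim)

# Toy usage: 16 pairs, 10 (known + novel) classes, half the pairs marked similar.
p_i = torch.softmax(torch.randn(16, 10, requires_grad=True), dim=1)
p_j = torch.softmax(torch.randn(16, 10, requires_grad=True), dim=1)
target = (torch.arange(16) < 8).float()
loss = pairwise_similarity_loss(p_i, p_j, target)
loss.backward()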
Rizve, Mamshad Nayeem; Kardan, Navid; Shah, Mubarak
Towards Realistic Semi-Supervised Learning Conference
European Conference on Computer Vision, 2022.
Abstract | Tags: ECCV | Links:
@conference{Rizve2022b,
title = {Towards Realistic Semi-Supervised Learning},
author = {Mamshad Nayeem Rizve and Navid Kardan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7402.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7402-supp.pdf
https://github.com/nayeemrizve/TRSSL
https://youtu.be/mE7GeQ35WyY},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {European Conference on Computer Vision},
abstract = {Deep learning is pushing the state-of-the-art in many computer vision applications. However, it relies on large annotated data repositories, and capturing the unconstrained nature of the real-world data is yet to be solved. Semi-supervised learning (SSL) complements the annotated training data with a large corpus of unlabeled data to reduce annotation cost. The standard SSL approach assumes unlabeled data are from the same distribution as annotated data. Recently, a more realistic SSL problem, called open-world SSL, was introduced, where the unannotated data might contain samples from unknown classes. In this paper, we propose a novel pseudo-label based approach to tackle SSL in the open-world setting. At the core of our method, we utilize sample uncertainty and incorporate prior knowledge about class distribution to generate reliable class-distribution-aware pseudo-labels for unlabeled data belonging to both known and unknown classes. Our extensive experimentation showcases the effectiveness of our approach on several benchmark datasets, where it substantially outperforms the existing state-of-the-art on seven diverse datasets including CIFAR-100 (∼17%), ImageNet-100 (∼5%), and Tiny ImageNet (∼9%). We also highlight the flexibility of our approach in solving the novel class discovery task, demonstrate its stability in dealing with imbalanced data, and complement our approach with a technique to estimate the number of novel classes. Code: https://github.com/nayeemrizve/TRSSL},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
Kumar, Aakash; Kini, Jyoti; Mian, Ajmal; Shah, Mubarak
Self Supervised Learning for Multiple Object Tracking in 3D Point Clouds Conference
2022 IEEE/RSJ International Conference on Intelligent Robots and Systems, 2022.
Abstract | Tags: IROS | Links:
@conference{Kumar2022,
title = {Self Supervised Learning for Multiple Object Tracking in 3D Point Clouds},
author = {Aakash Kumar and Jyoti Kini and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/camera_ready_paper.pdf},
year = {2022},
date = {2022-10-23},
urldate = {2022-10-23},
booktitle = {2022 IEEE/RSJ International Conference on Intelligent Robots and Systems},
abstract = {Multiple object tracking in 3D point clouds has applications in mobile robots and autonomous driving. This is a challenging problem due to the sparse nature of the point clouds and the added difficulty of annotation in 3D for supervised learning. To overcome these challenges, we propose a neural network architecture that learns effective object features and their affinities in a self supervised fashion for multiple object tracking in 3D point clouds captured with LiDAR sensors. For self supervision, we use two approaches. First, we generate two augmented LiDAR frames from a single real frame by applying translation, rotation and cutout to the objects. Second, we synthesize a LiDAR frame using CAD models or primitive geometric shapes and then apply the above three augmentations to them. Hence, the ground truth object locations and associations are known in both frames for self supervision. This removes the need to annotate object associations in real data, and additionally the need for training data collection and annotation for object detection in synthetic data. To the best of our knowledge, this is the first self supervised multiple object tracking method for 3D data. Our model achieves state of the art results.},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
Arif, Maliha; Yong, Calvin; Mahalanobis, Abhijit; Rahnavard, Nazanin
Background-Tolerant Object Classification with Embedded Segmentation Mask for Infrared and Color Imagery Conference
IEEE International Conference on Image Processing, 2022.
Abstract | Tags: ICIP | Links:
@conference{Arif2022,
title = {Background-Tolerant Object Classification with Embedded Segmentation Mask for Infrared and Color Imagery},
author = {Maliha Arif and Calvin Yong and Abhijit Mahalanobis and Nazanin Rahnavard},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Final_ICIP2022_MA_submission.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/bg_poster_ICIP2022_Single.jpg},
year = {2022},
date = {2022-10-16},
urldate = {2022-10-16},
booktitle = {IEEE International Conference on Image Processing},
abstract = {Even though convolutional neural networks (CNNs) can classify objects in images very accurately, it is well known that the attention of the network may not always be on the semantically important regions of the scene. It has been observed that networks often learn background textures, which are not relevant to the object of interest. In turn, this makes the networks susceptible to variations and changes in the background, which may negatively affect their performance.
We propose a new three-step training procedure called split training to reduce this bias in CNNs for object recognition using Infrared imagery and Color (RGB) data. Our split training procedure has three steps. First, a baseline model is trained to recognize objects in images without background, and the activations produced by the higher layers are observed. Next, a second network is trained using Mean Square Error (MSE) loss to produce the same activations, but in response to the objects embedded in background. This forces the second network to ignore the background while focusing on the object of interest. Finally, with the layers producing the activations frozen, the rest of the second network is trained using cross-entropy loss to classify the objects in images with background. Our training method outperforms the traditional training procedure in both a simple CNN architecture and deep CNNs like VGG and DenseNet, and learns to mimic human vision, which focuses more on shape and structure than background, with higher accuracy.
Index Terms— infrared imagery, background invariant learning, grad-CAM, split training, MS-COCO},
keywords = {ICIP},
pubstate = {published},
tppubtype = {conference}
}
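Illustrative note (a compressed sketch under assumptions, not the authors' code): the three-step split-training recipe described above can be outlined as (1) train a baseline on background-free images, (2) train a second feature extractor to reproduce the baseline's activations from images with background via an MSE loss, and (3) freeze those features and train the classifier head with cross-entropy. The tiny network and the assumption that the two loaders yield aligned with/without-background pairs are made up for this sketch.

import torch
import torch.nn as nn
import torch.nn.functional as F

def make_net(num_classes=10):
    # Tiny stand-in network: feature extractor + linear classifier head.
    feats = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
                          nn.AdaptiveAvgPool2d(4), nn.Flatten())
    return feats, nn.Linear(32 * 16, num_classes)

def split_training(loader_no_bg, loader_with_bg, num_classes=10, epochs=1):
    # Step 1: baseline trained on background-free images with cross-entropy.
    f1, h1 = make_net(num_classes)
    opt = torch.optim.Adam(list(f1.parameters()) + list(h1.parameters()))
    for _ in range(epochs):
        for x, y in loader_no_bg:
            opt.zero_grad(); F.cross_entropy(h1(f1(x)), y).backward(); opt.step()

    # Step 2: second network reproduces the baseline's activations, but from
    # images that contain background (assumes the loaders yield aligned pairs).
    f2, h2 = make_net(num_classes)
    opt = torch.optim.Adam(f2.parameters())
    for _ in range(epochs):
        for (x_bg, _), (x_clean, _) in zip(loader_with_bg, loader_no_bg):
            with torch.no_grad():
                target = f1(x_clean)
            opt.zero_grad(); F.mse_loss(f2(x_bg), target).backward(); opt.step()

    # Step 3: freeze the matched features and train only the classifier head.
    for p in f2.parameters():
        p.requires_grad_(False)
    opt = torch.optim.Adam(h2.parameters())
    for _ in range(epochs):
        for x_bg, y in loader_with_bg:
            opt.zero_grad(); F.cross_entropy(h2(f2(x_bg)), y).backward(); opt.step()
    return f2, h2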
Pillai, Manu S; Bhattacharya, Abhijeet; Baweja, Tanmay; Gupta, Rohit; Shah, Mubarak
DEEPSAR: Vessel Detection In SAR Imagery With Noisy Labels Conference
IEEE International Conference on Image Processing, 2022.
Tags: ICIP | Links:
@conference{Pillai2023,
title = {DEEPSAR: Vessel Detection In SAR Imagery With Noisy Labels},
author = {Manu S Pillai and Abhijeet Bhattacharya and Tanmay Baweja and Rohit Gupta and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICIP_Submission.pdf},
year = {2022},
date = {2022-10-16},
urldate = {2023-10-08},
publisher = {IEEE International Conference on Image Processing},
keywords = {ICIP},
pubstate = {published},
tppubtype = {conference}
}
Kini, Jyoti; Shah, Mubarak
Tag-Based Attention Guided Bottom-Up Approach for Video Instance Segmentation Conference
26th International Conference on Pattern Recognition, 2022.
Tags: ICPR, Video Instance Segmentation | Links:
@conference{Kini2022b,
title = {Tag-Based Attention Guided Bottom-Up Approach for Video Instance Segmentation},
author = {Jyoti Kini and Mubarak Shah },
url = {https://arxiv.org/pdf/2204.10765.pdf},
year = {2022},
date = {2022-08-21},
urldate = {2022-08-21},
booktitle = {26th International Conference on Pattern Recognition},
issue = {arxiv:2204.10765},
keywords = {ICPR, Video Instance Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Beetham, James; Kardan, Navid; Mian, Ajmal; Shah, Mubarak
Detecting Compromised Architecture/Weights of a Deep Model Conference
26th International Conference on Pattern Recognition, 2022.
Tags: ICPR | Links:
@conference{Beetham2022,
title = {Detecting Compromised Architecture/Weights of a Deep Model},
author = {James Beetham and Navid Kardan and Ajmal Mian and Mubarak Shah},
url = {https://ieeexplore.ieee.org/abstract/document/9956280},
year = {2022},
date = {2022-08-21},
urldate = {2022-08-21},
booktitle = {26th International Conference on Pattern Recognition},
keywords = {ICPR},
pubstate = {published},
tppubtype = {conference}
}
Ristea, Nicolae-Catalin; Madan, Neelu; Ionescu, Radu Tudor; Nasrollahi, Kamal; Khan, Fahad Shahbaz; Moeslund, Thomas B.; Shah, Mubarak
Self-Supervised Predictive Convolutional Attentive Block for Anomaly Detection Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Tags: Anomaly Detection, CVPR, Self-Supervised Learning, Self-Supervision | Links:
@conference{nokey,
title = {Self-Supervised Predictive Convolutional Attentive Block for Anomaly Detection},
author = {Nicolae-Catalin Ristea and Neelu Madan and Radu Tudor Ionescu and Kamal Nasrollahi and Fahad Shahbaz Khan and Thomas B. Moeslund and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/SSPCAB_camera-arxiv.pdf},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {Anomaly Detection, CVPR, Self-Supervised Learning, Self-Supervision},
pubstate = {published},
tppubtype = {conference}
}
Karim, Nazmul; Rizve, Mamshad Nayeem; Rahnavard, Nazanin; Mian, Ajmal; Shah, Mubarak
UNICON: Combating Label Noise Through Uniform Selection and Contrastive Learning Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Tags: Contrastive Learning, CVPR, Noisy Labels, Semi-supervised learning | Links:
@conference{nokey,
title = {UNICON: Combating Label Noise Through Uniform Selection and Contrastive Learning},
author = {Nazmul Karim and Mamshad Nayeem Rizve and Nazanin Rahnavard and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/07363.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/07363-supp.pdf
https://github.com/nazmul-karim170/unicon-noisy-label},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {Contrastive Learning, CVPR, Noisy Labels, Semi-supervised learning},
pubstate = {published},
tppubtype = {conference}
}
Acsintoae, Andra; Florescu, Andrei; Georgescu, Mariana-Iuliana; Mare, Tudor; Sumedrea, Paul; Ionescu, Radu Tudor; Khan, Fahad Shahbaz; Shah, Mubarak
UBnormal: New Benchmark for Supervised Open-Set Video Anomaly Detection Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Tags: Anomaly Detection, CVPR, Dataset | Links:
@conference{nokey,
title = {UBnormal: New Benchmark for Supervised Open-Set Video Anomaly Detection},
author = {Andra Acsintoae and Andrei Florescu and Mariana-Iuliana Georgescu and Tudor Mare and Paul Sumedrea and Radu Tudor Ionescu and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/04315.pdf
https://github.com/lilygeorgescu/UBnormal},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {Anomaly Detection, CVPR, Dataset},
pubstate = {published},
tppubtype = {conference}
}
Dave, Ishan Rajendrakumar; Chen, Chen; Shah, Mubarak
SPAct: Self-supervised Privacy Preservation for Action Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Tags: Action Recognition, CVPR, Privacy Preservation | Links:
@conference{nokey,
title = {SPAct: Self-supervised Privacy Preservation for Action Recognition},
author = {Ishan Rajendrakumar Dave and Chen Chen and Mubarak Shah},
url = {https://arxiv.org/pdf/2203.15205.pdf
https://github.com/DAVEISHAN/SPAct
https://www.youtube.com/watch?v=_PAlMT7ozts},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {Action Recognition, CVPR, Privacy Preservation},
pubstate = {published},
tppubtype = {conference}
}
Kumar, Akash; Rawat, Yogesh Singh
End-to-End Semi-Supervised Learning for Video Action Detection Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Tags: CVPR, Semi-supervised learning | Links:
@conference{nokey,
title = {End-to-End Semi-Supervised Learning for Video Action Detection},
author = {Akash Kumar and Yogesh Singh Rawat},
url = {https://arxiv.org/pdf/2203.04251.pdf},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR, Semi-supervised learning},
pubstate = {published},
tppubtype = {conference}
}
Mendieta, Matias; Yang, Taojiannan; Wang, Pu; Lee, Minwoo; Ding, Zhengming; Chen, Chen
Local Learning Matters: Rethinking Data Heterogeneity in Federated Learning Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Tags: CVPR, Federated Learning | Links:
@conference{nokey,
title = {Local Learning Matters: Rethinking Data Heterogeneity in Federated Learning},
author = {Matias Mendieta and Taojiannan Yang and Pu Wang and Minwoo Lee and Zhengming Ding and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/11405.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/11405_supp.pdf},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {CVPR, Federated Learning},
pubstate = {published},
tppubtype = {conference}
}
Zhu, Sijie; Shah, Mubarak; Chen, Chen
TransGeo: Transformer Is All You Need for Cross-view Image Geo-localization Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Tags: Cross-View, CVPR, Geo-Localization, Transformers | Links:
@conference{nokey,
title = {TransGeo: Transformer Is All You Need for Cross-view Image Geo-localization},
author = {Sijie Zhu and Mubarak Shah and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/11695.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/11695-supp.pdf},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {Cross-View, CVPR, Geo-Localization, Transformers},
pubstate = {published},
tppubtype = {conference}
}
Cao, Jiale; Pang, Yenwai; Anwer, Rao Muhammad; Cholakkal, Hisham; Xie, Jin; Shah, Mubarak; Khan, Fahad Shahbaz
PSTR: End-to-End One-Step Person Search With Transformers Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Abstract | Tags: CVPR, Re-Identification, Transformers, Visual Search | Links:
@conference{nokey,
title = {PSTR: End-to-End One-Step Person Search With Transformers},
author = {Jiale Cao and Yenwai Pang and Rao Muhammad Anwer and Hisham Cholakkal and Jin Xie and Mubarak Shah and Fahad Shahbaz Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/05237-2.pdf
https://github.com/JialeCao001/PSTR},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {We propose a novel one-step transformer-based person search framework, PSTR, that jointly performs person detection and re-identification (re-id) in a single architecture. PSTR comprises a person search-specialized (PSS) module that contains a detection encoder-decoder for person detection along with a discriminative re-id decoder for person re-id. The discriminative re-id decoder utilizes a multi-level supervision scheme with a shared decoder for discriminative re-id feature learning and also comprises a part attention block to encode relationship between different parts of a person. We further introduce a simple multi-scale scheme to support re-id across person instances at different scales. PSTR jointly achieves the diverse objectives of object-level recognition (detection) and instance-level matching (re-id). To the best of our knowledge, we are the first to propose an end-to-end one-step transformer-based person search framework. Experiments are performed on two popular benchmarks: CUHK-SYSU and PRW. Our extensive ablations reveal the merits of the proposed contributions. Further, the proposed PSTR sets a new state-of-the-art on both benchmarks. On the challenging PRW benchmark, PSTR achieves a mean average precision (mAP) score of 56.5%. The source code is available at https://github.com/JialeCao001/PSTR.},
keywords = {CVPR, Re-Identification, Transformers, Visual Search},
pubstate = {published},
tppubtype = {conference}
}
Gupta, Akshita; Narayan, Sanath; Joseph, K J; Khan, Salman; Khan, Fahad Shahbaz; Shah, Mubarak
OW-DETR: Open-world Detection Transformer Conference
IEEE Computer Vision and Pattern Recognition, 2022.
Abstract | Tags: CVPR, Object Detection, Open World, Transformers | Links:
@conference{nokey,
title = {OW-DETR: Open-world Detection Transformer},
author = {Akshita Gupta and Sanath Narayan and K J Joseph and Salman Khan and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/03815.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/03815-supp.pdf
https://github.com/akshitac8/OW-DETR.},
year = {2022},
date = {2022-06-19},
urldate = {2022-06-19},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Open-world object detection (OWOD) is a challenging computer vision problem, where the task is to detect a known set of object categories while simultaneously identifying unknown objects. Additionally, the model must incrementally learn new classes that become known in the next training episodes. Distinct from standard object detection, the OWOD setting poses significant challenges for generating quality candidate proposals on potentially unknown objects, separating the unknown objects from the background and detecting diverse unknown objects. Here, we introduce a novel end-to-end transformer-based framework, OW-DETR, for open-world object detection. The proposed OW-DETR comprises three dedicated components, namely attention-driven pseudo-labeling, novelty classification and objectness scoring, to explicitly address the aforementioned OWOD challenges. Our OW-DETR explicitly encodes multi-scale contextual information, possesses less inductive bias, enables knowledge transfer from known classes to the unknown class and can better discriminate between unknown objects and background. Comprehensive experiments are performed on two benchmarks: MS-COCO and PASCAL VOC. The extensive ablations reveal the merits of our proposed contributions. Further, our model outperforms the recently introduced OWOD approach, ORE, with absolute gains ranging from 1.8% to 3.3% in terms of unknown recall on MS-COCO. In the case of incremental object detection, OW-DETR outperforms the state-of-the-art for all settings on PASCAL VOC. Our code is available at https://github.com/akshitac8/OW-DETR.},
keywords = {CVPR, Object Detection, Open World, Transformers},
pubstate = {published},
tppubtype = {conference}
}
Dave, Ishan; Gupta, Rohit; Rizve, Mamshad Nayeem; Shah, Mubarak
TCLR: Temporal Contrastive Learning for Video Representation Journal Article
In: Computer Vision and Image Understanding, vol. 219, iss. 1077-3142, pp. 103406, 2022.
Abstract | Tags: Self-Supervised Learning | Links:
@article{nokey,
title = {TCLR: Temporal Contrastive Learning for Video Representation},
author = {Ishan Dave and Rohit Gupta and Mamshad Nayeem Rizve and Mubarak Shah },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1-s2.0-S1077314222000376-main.pdf
https://github.com/DAVEISHAN/TCLR},
doi = {https://doi.org/10.1016/j.cviu.2022.103406},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
journal = {Computer Vision and Image Understanding},
volume = {219},
issue = {1077-3142},
pages = {103406},
abstract = {Contrastive learning has nearly closed the gap between supervised and self-supervised learning of image representations, and has also been explored for videos. However, prior work on contrastive learning for video data has not explored the effect of explicitly encouraging the features to be distinct across the temporal dimension. We develop a new temporal contrastive learning framework consisting of two novel losses to improve upon existing contrastive self-supervised video representation learning methods. The local–local temporal contrastive loss adds the task of discriminating between non-overlapping clips from the same video, whereas the global–local temporal contrastive loss aims to discriminate between timesteps of the feature map of an input clip in order to increase the temporal diversity of the learned features. Our proposed temporal contrastive learning framework achieves significant improvement over the state-of-the-art results in various downstream video understanding tasks such as action recognition, limited-label action classification, and nearest-neighbor video retrieval on multiple video datasets and backbones. We also demonstrate significant improvement in fine-grained action classification for visually similar classes. With the commonly used 3D ResNet-18 architecture with UCF101 pretraining, we achieve 82.4% (+5.1% increase over the previous best) top-1 accuracy on UCF101 and 52.9% (+5.4% increase) on HMDB51 action classification, and 56.2% (+11.7% increase) Top-1 Recall on UCF101 nearest neighbor video retrieval. Code released at https://github.com/DAVEISHAN/TCLR.},
keywords = {Self-Supervised Learning},
pubstate = {published},
tppubtype = {article}
}
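Illustrative note (a generic InfoNCE-style sketch of the "local-local" idea, not the released TCLR code): embeddings of two augmented views of the same clip are treated as positives, while the other non-overlapping clips of the same video serve as negatives, which pushes clips apart along the temporal dimension. Function names and the temperature value are assumptions.

import torch
import torch.nn.functional as F

def local_local_loss(z_a, z_b, temperature=0.1):
    # z_a, z_b: (N, D) embeddings of two augmented views of the same N
    # non-overlapping clips from one video. Matching rows are positives;
    # the remaining clips of the video act as temporal negatives.
    z_a, z_b = F.normalize(z_a, dim=1), F.normalize(z_b, dim=1)
    logits = z_a @ z_b.t() / temperature          # (N, N) clip-to-clip similarities
    targets = torch.arange(z_a.size(0))
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

# Toy usage: 4 non-overlapping clips per video, 128-d embeddings.
z1 = torch.randn(4, 128, requires_grad=True)
z2 = torch.randn(4, 128, requires_grad=True)
loss = local_local_loss(z1, z2)
loss.backward()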
Kini, Jyoti; Khan, Fahad Shahbaz; Khan, Salman; Shah, Mubarak
Self-Supervised Video Object Segmentation via Cutout Prediction and Tagging Technical Report
no. arXiv:2204.10846, 2022.
Tags: Self-Supervised Learning, Video Object Segmentation | Links:
@techreport{Kini2022,
title = {Self-Supervised Video Object Segmentation via Cutout Prediction and Tagging},
author = {Jyoti Kini and Fahad Shahbaz Khan and Salman Khan and Mubarak Shah},
url = {https://arxiv.org/pdf/2204.10846.pdf},
year = {2022},
date = {2022-04-24},
urldate = {2022-04-24},
number = {arXiv:2204.10846},
keywords = {Self-Supervised Learning, Video Object Segmentation},
pubstate = {published},
tppubtype = {techreport}
}
Modi, Rajat; Rana, Aayush Jung; Kumar, Akash; Tirupattur, Praveen; Vyas, Shruti; Rawat, Yogesh Singh; Shah, Mubarak
Video Action Detection: Analysing Limitations and Challenges Conference
IEEE Computer Vision and Pattern Recognition, 2022.
@conference{Modi2022,
title = {Video Action Detection: Analysing Limitations and Challenges},
author = {Rajat Modi and Aayush Jung Rana and Akash Kumar and Praveen Tirupattur and Shruti Vyas and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2204.07892.pdf},
year = {2022},
date = {2022-04-17},
urldate = {2022-04-17},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Hassan, Shah; Jiban, MdJibanul Haque; Mahalanobis, Abhijit
Performance Evaluation of Boosted 2-stream TCRNet Conference
International Congress on Information and Communication Technology, 2022.
Tags: ICICT | Links:
@conference{nokey,
title = {Performance Evaluation of Boosted 2-stream TCRNet},
author = {Shah Hassan and MdJibanul Haque Jiban and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/paper.pdf},
year = {2022},
date = {2022-03-02},
urldate = {2022-03-02},
publisher = {International Congress on Information and Communication Technology},
keywords = {ICICT},
pubstate = {published},
tppubtype = {conference}
}
Pestana, Camilo; Akhtar, Naveed; Rahnavard, Nazanin; Shah, Mubarak; Mian, Ajmal
Transferable 3D Adversarial Textures using End-to-end Optimization Conference
IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), 2022.
Abstract | Tags: Adversarial Attacks, WACV | Links:
@conference{Pestana2022,
title = {Transferable 3D Adversarial Textures using End-to-end Optimization},
author = {Camilo Pestana and Naveed Akhtar and Nazanin Rahnavard and Mubarak Shah and Ajmal Mian},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/WACV_2022_Deceptive_Textures-1.pdf},
doi = {10.1109/WACV51458.2022.00080},
year = {2022},
date = {2022-02-15},
urldate = {2022-02-15},
pages = {727-736},
publisher = {IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
abstract = {Deep visual models are known to be vulnerable to adversarial attacks. The last few years have seen numerous techniques to compute adversarial inputs for these models. However, there are still under-explored avenues in this critical research direction. Among those is the estimation of adversarial textures for 3D models in an end-to-end optimization scheme. In this paper, we propose such a scheme to generate adversarial textures for 3D models that are highly transferable and invariant to different camera views and lighting conditions. Our method makes use of neural rendering with explicit control over the model texture and background. We ensure transferability of the adversarial textures by employing an ensemble of robust and non-robust models. Our technique utilizes 3D models as a proxy to simulate closer to real-life conditions, in contrast to conventional use of 2D images for adversarial attacks. We show the efficacy of our method with extensive experiments.},
keywords = {Adversarial Attacks, WACV},
pubstate = {published},
tppubtype = {conference}
}
Aafaq, Nayyer; Mian, Ajmal; Akhtar, Naveed; Liu, Wei; Shah, Mubarak
Dense Video Captioning with Early Linguistic Information Fusion Journal Article
In: IEEE Transactions on Multimedia, pp. 1-1, 2022.
Abstract | Tags: Adversarial Attacks | Links:
@article{nokey,
title = {Dense Video Captioning with Early Linguistic Information Fusion},
author = {Nayyer Aafaq and Ajmal Mian and Naveed Akhtar and Wei Liu and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/IEEE_TMM_Nayyer_Jan_2022_Final_Version_Manuscript.pdf},
doi = {10.1109/TMM.2022.3146005},
year = {2022},
date = {2022-01-25},
urldate = {2022-01-25},
journal = {IEEE Transactions on Multimedia},
pages = {1-1},
abstract = {Dense captioning methods generally detect events in videos first and then generate captions for the individual events. Events are localized solely based on the visual cues while ignoring the associated linguistic information and context. Whereas end-to-end learning may implicitly take guidance from language, these methods still fall short of the power of explicit modeling. In this paper, we propose a Visual-Semantic Embedding (ViSE) Framework that models the word(s)-context distributional properties over the entire semantic space and computes weights for all the n-grams such that higher weights are assigned to the more informative n-grams. The weights are accounted for in learning distributed representations of all the captions to construct a semantic space. To perform the contextualization of visual information and the constructed semantic space in a supervised manner, we design Visual-Semantic Joint Modeling Network (VSJM-Net). The learned ViSE embeddings are then temporally encoded with a Hierarchical Descriptor Transformer (HDT). For caption generation, we exploit a transformer architecture to decode the input embeddings into natural language descriptions. Experiments on the large-scale ActivityNet Captions dataset and YouCook-II dataset demonstrate the efficacy of our method.
Index Terms—Dense video captioning, event localisation, language and vision, video captioning, context modeling.},
keywords = {Adversarial Attacks},
pubstate = {published},
tppubtype = {article}
}
Kardan, Navid; Hill, Mitchell; Shah, Mubarak
Self-Joint Supervised Learning Conference
International Conference on Learning Representations (ICLR), 2022.
Abstract | Tags: ICLR | Links:
@conference{Kardan2022,
title = {Self-Joint Supervised Learning},
author = {Navid Kardan and Mitchell Hill and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Self_joint_ICLR-002.pdf
https://github.com/ndkn/Self-joint-Learning},
year = {2022},
date = {2022-01-20},
urldate = {2022-01-20},
publisher = {International Conference on Learning Representations (ICLR)},
abstract = {Supervised learning is a fundamental framework used to train machine learning systems. A supervised learning problem is often formulated using an i.i.d. assumption that restricts model attention to a single relevant signal at a time when predicting. This contrasts with the human ability to actively use related samples as reference when making decisions. We hypothesize that the restriction to a single signal for each prediction in the standard i.i.d. framework contributes to well-known drawbacks of supervised learning: making overconfident predictions and vulnerability to overfitting, adversarial attacks, and out-of-distribution data. To address these limitations, we propose a new supervised learning paradigm called self-joint learning that generalizes the standard approach by modeling the joint conditional distribution of two observed samples, where each sample is an image and its label. Rather than assuming samples are independent, our models explicitly learn the sample-to-sample relation of conditional independence. Our framework can naturally incorporate auxiliary unlabeled data to further improve the performance. Experiments on benchmark image datasets show our method offers significant improvement over standard supervised learning in terms of accuracy, robustness against adversarial attacks, out-of-distribution detection, and overconfidence mitigation. Code: github.com/ndkn/Self-joint-Learning},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
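Illustrative note (one possible toy reading of the idea, not the released code): instead of classifying one image at a time, a self-joint model can score the joint label of a pair of images over all C x C class combinations. The small architecture below is a deliberately simple placeholder made up for this sketch.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfJointClassifier(nn.Module):
    # Embeds two images and predicts their joint label as one of C*C pair classes.
    def __init__(self, num_classes=10, dim=128):
        super().__init__()
        self.encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, dim), nn.ReLU())
        self.joint_head = nn.Linear(2 * dim, num_classes * num_classes)
        self.num_classes = num_classes

    def forward(self, x1, x2):
        h = torch.cat([self.encoder(x1), self.encoder(x2)], dim=1)
        return self.joint_head(h)        # logits over all (y1, y2) combinations

def self_joint_loss(pair_logits, y1, y2, num_classes):
    # The joint target index enumerates the pair (y1, y2).
    return F.cross_entropy(pair_logits, y1 * num_classes + y2)

# Toy usage on CIFAR-sized inputs.
model = SelfJointClassifier(num_classes=10)
x1, x2 = torch.randn(8, 3, 32, 32), torch.randn(8, 3, 32, 32)
y1, y2 = torch.randint(0, 10, (8,)), torch.randint(0, 10, (8,))
loss = self_joint_loss(model(x1, x2), y1, y2, num_classes=10)
loss.backward()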
Fioresi, Joseph; Colvin, Dylan J.; Frota, Rafaela; Gupta, Rohit; Li, Mengjie; Seigneur, Hubert P.; Vyas, Shruti; Oliveira, Sofia; Shah, Mubarak; Davis, Kristopher O.
Automated Defect Detection and Localization in Photovoltaic Cells Using Semantic Segmentation of Electroluminescence Images Journal Article
In: IEEE Journal of Photovoltaics, vol. 12, no. 1, pp. 53-61, 2022.
Abstract | Tags: REU, Semantic Segmentation, Solar Cells | Links:
@article{Fioresi2022,
title = {Automated Defect Detection and Localization in Photovoltaic Cells Using Semantic Segmentation of Electroluminescence Images},
author = {Joseph Fioresi and Dylan J. Colvin and Rafaela Frota and Rohit Gupta and Mengjie Li and Hubert P. Seigneur and Shruti Vyas and Sofia Oliveira and Mubarak Shah and Kristopher O. Davis},
url = {https://ieeexplore.ieee.org/document/9650542},
doi = {10.1109/JPHOTOV.2021.3131059},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {IEEE Journal of Photovoltaics},
volume = {12},
number = {1},
pages = {53-61},
abstract = {In this article, we propose a deep learning based semantic segmentation model that identifies and segments defects in electroluminescence (EL) images of silicon photovoltaic (PV) cells. The proposed model can differentiate between cracks, contact interruptions, cell interconnect failures, and contact corrosion for both multicrystalline and monocrystalline silicon cells. Our model utilizes a segmentation Deeplabv3 model with a ResNet-50 backbone. It was trained on 17,064 EL images including 256 physically realistic simulated images of PV cells generated to deal with class imbalance. While performing semantic segmentation for five defect classes, this model achieves a weighted F1-score of 0.95, an unweighted F1-score of 0.69, a pixel-level global accuracy of 95.4%, and a mean intersection over union score of 57.3%. In addition, we introduce the UCF EL Defect dataset, a large-scale dataset consisting of 17,064 EL images, which will be publicly available for use by the PV and computer vision research communities.},
keywords = {REU, Semantic Segmentation, Solar Cells},
pubstate = {published},
tppubtype = {article}
}
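Illustrative note (a minimal inference sketch, not the trained model from the article): the abstract names a DeepLabv3 segmentation model with a ResNet-50 backbone and five defect classes, which can be instantiated with torchvision as below. Training details (the 17,064-image dataset, class rebalancing, any background channel) are not reproduced here; the input size is an assumption.

import torch
from torchvision.models.segmentation import deeplabv3_resnet50

# DeepLabv3 with a ResNet-50 backbone and one output channel per defect class.
model = deeplabv3_resnet50(num_classes=5)
model.eval()

# One grayscale EL image replicated to 3 channels (placeholder input).
x = torch.rand(1, 3, 512, 512)
with torch.no_grad():
    logits = model(x)["out"]        # (1, 5, 512, 512) per-pixel class logits
pred = logits.argmax(dim=1)         # (1, 512, 512) predicted defect map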
2021
Kerrigan, Alec; Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
Reformulating Zero-shot Action Recognition for Multi-label Actions Conference
Thirty-fifth Conference on Neural Information Processing Systems, 2021.
Tags: Action Recognition, NeurIPS, Zero-Shot Learning | Links:
@conference{Kerrigan2021,
title = {Reformulating Zero-shot Action Recognition for Multi-label Actions},
author = {Alec Kerrigan and Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/reformulating_zero_shot_action2.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ZSL-Supp.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Reformulating-Zero-shot-Action-Recognition-forMulti-label-Actions.pptx},
year = {2021},
date = {2021-12-06},
urldate = {2021-12-06},
booktitle = {Thirty-fifth Conference on Neural Information Processing Systems},
keywords = {Action Recognition, NeurIPS, Zero-Shot Learning},
pubstate = {published},
tppubtype = {conference}
}
Lei, Huan; Akhtar, Naveed; Shah, Mubarak; Mian, Ajmal
Geometric Feature Learning for 3D Meshes Journal Article
In: arXiv, 2021.
Abstract | Tags: | Links:
@article{nokey,
title = {Geometric Feature Learning for 3D Meshes},
author = {Huan Lei and Naveed Akhtar and Mubarak Shah and Ajmal Mian},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2112.01801.pdf
https://github.com/EnyaHermite/Picasso},
year = {2021},
date = {2021-12-03},
journal = {arXiv},
abstract = {Geometric feature learning for 3D meshes is central to computer graphics and highly important for numerous vision applications. However, deep learning currently lags in hierarchical modeling of heterogeneous 3D meshes due to the lack of required operations and/or their efficient implementations. In this paper, we propose a series of modular operations for effective geometric deep learning over heterogeneous 3D meshes. These operations include mesh convolutions, (un)pooling and efficient mesh decimation. We provide open source implementation of these operations, collectively termed Picasso. The mesh decimation module of Picasso is GPU-accelerated, which can process a batch of meshes on-the-fly for deep learning. Our (un)pooling operations compute features for newly-created neurons across network layers of varying resolution. Our mesh convolutions include facet2vertex, vertex2facet, and facet2facet convolutions that exploit vMF mixture and Barycentric interpolation to incorporate fuzzy modelling. Leveraging the modular operations of Picasso, we contribute a novel hierarchical neural network, PicassoNet-II, to learn highly discriminative features from 3D meshes. PicassoNet-II accepts primitive geometrics and fine textures of mesh facets as input features, while processing full scene meshes. Our network achieves highly competitive performance for shape analysis and scene parsing on a variety of benchmarks. We release Picasso and PicassoNet-II on Github.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Duarte, Kevin
Capsule Networks for Video Understanding PhD Thesis
University of Central Florida, 2021.
Tags: Action Detection, Capsule Networks, Multi-Modal Learning, Ph.D. Dissertation, Text and Video | Links:
@phdthesis{Duarte2021b,
title = {Capsule Networks for Video Understanding},
author = {Kevin Duarte},
url = {https://www.crcv.ucf.edu/people/alumni/#:~:text=Capsule%20Networks%20for%20Video%20Understanding},
year = {2021},
date = {2021-12-01},
urldate = {2021-12-01},
school = {University of Central Florida},
keywords = {Action Detection, Capsule Networks, Multi-Modal Learning, Ph.D. Dissertation, Text and Video},
pubstate = {published},
tppubtype = {phdthesis}
}
Duarte, Kevin; Chen, Brian; Shvetsova, Nina; Rouditchenko, Andrew; Thomas, Samuel; Liu, Alexander; Harwath, David; Glass, James; Kuehne, Hilde; Shah, Mubarak
Routing with Self-Attention for Multimodal Capsule Networks Unpublished
arXiv preprint arXiv:2112.00775, 2021.
Abstract | Tags: Audio, Capsule Networks, Multi-Modal Learning, Text and Video | Links:
@unpublished{nokey,
title = {Routing with Self-Attention for Multimodal Capsule Networks},
author = {Kevin Duarte and Brian Chen and Nina Shvetsova and Andrew Rouditchenko and Samuel Thomas and Alexander Liu and David Harwath and James Glass and Hilde Kuehne and Mubarak Shah},
editor = {arXiv},
url = {https://arxiv.org/pdf/2112.00775.pdf
https://arxiv.org/abs/2112.00775},
doi = { https://doi.org/10.48550/arXiv.2112.00775},
year = {2021},
date = {2021-12-01},
urldate = {2021-12-01},
abstract = {The task of multimodal learning has seen a growing interest recently as it allows for training neural architectures based on different modalities such as vision, text, and audio. One challenge in training such models is that they need to jointly learn semantic concepts and their relationships across different input representations. Capsule networks have been shown to perform well in context of capturing the relation between low-level input features and higher-level concepts. However, capsules have so far mainly been used only in small-scale fully supervised settings due to the resource demand of conventional routing algorithms. We present a new multimodal capsule network that allows us to leverage the strength of capsules in the context of a multimodal learning framework on large amounts of video data. To adapt the capsules to large-scale input data, we propose a novel routing by self-attention mechanism that selects relevant capsules which are then used to generate a final joint multimodal feature representation. This allows not only for robust training with noisy video data, but also to scale up the size of the capsule network compared to traditional routing methods while still being computationally efficient. We evaluate the proposed architecture by pretraining it on a large-scale multimodal video dataset and applying it on four datasets in two challenging downstream tasks. Results show that the proposed multimodal capsule network is not only able to improve results compared to other routing techniques, but also achieves competitive performance on the task of multimodal learning.},
howpublished = {arXiv preprint arXiv:2112.00775},
keywords = {Audio, Capsule Networks, Multi-Modal Learning, Text and Video},
pubstate = {published},
tppubtype = {unpublished}
}
Rajasegaran, Jathushan; Khan, Salman; Hayat, Munawar; Khan, Fahad Shahbaz; Shah, Mubarak
Meta-learning the Learning Trends Shared Across Tasks Conference
British Machine Vision Conference, Nov 22-25, 2021.
Tags: BMVC, Few-Shot Learning, Meta-Learning | Links:
@conference{Rajasegaran2021,
title = {Meta-learning the Learning Trends Shared Across Tasks},
author = {Jathushan Rajasegaran and Salman Khan and Munawar Hayat and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.bmvc2021-virtualconference.com/conference/papers/paper_0874.html},
year = {2021},
date = {2021-11-22},
urldate = {2021-11-22},
booktitle = {British Machine Vision Conference, Nov 22-25},
keywords = {BMVC, Few-Shot Learning, Meta-Learning},
pubstate = {published},
tppubtype = {conference}
}
Rajasegaran, Jathushan; Khan, Salman; Hayat, Munawar; Khan, Fahad Shahbaz; Shah, Mubarak
Self-supervised Knowledge Distillation for Few-shot Learning Conference
British Machine Vision Conference, Nov 22-25, 2021.
Tags: BMVC, Few-Shot Learning, Knowledge Distillation, Self-Supervision | Links:
@conference{Rajasegaran2020,
title = {Self-supervised Knowledge Distillation for Few-shot Learning},
author = {Jathushan Rajasegaran and Salman Khan and Munawar Hayat and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Self-supervised-Knowledge-Distillation-for-Few-shot-Learning.pdf
https://bmvc2021-virtualconference.com/conference/papers/paper_0820.html
https://github.com/brjathu/SKD},
year = {2021},
date = {2021-11-22},
urldate = {2021-11-22},
booktitle = {British Machine Vision Conference, Nov 22-25},
keywords = {BMVC, Few-Shot Learning, Knowledge Distillation, Self-Supervision},
pubstate = {published},
tppubtype = {conference}
}
Akhtar, Naveed; Mian, Ajmal; Kardan, Navid; Shah, Mubarak
Advances in Adversarial Attacks and Defenses in Computer Vision: A Survey Journal Article
In: IEEE Access, vol. 9, pp. 155161-155196, 2021.
Abstract | Tags: Adversarial Attacks | Links:
@article{Akhtar2021,
title = {Advances in Adversarial Attacks and Defenses in Computer Vision: A Survey},
author = {Naveed Akhtar and Ajmal Mian and Navid Kardan and Mubarak Shah},
editor = {IEEE Access},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Advances_in_Adversarial_Attacks_and_Defenses_in_Computer_Vision_A_Survey.pdf},
doi = {10.1109/ACCESS.2021.3127960},
year = {2021},
date = {2021-11-12},
urldate = {2021-11-12},
journal = {IEEE Access},
volume = {9},
pages = {155161-155196},
abstract = {Deep Learning is the most widely used tool in the contemporary field of computer vision. Its ability to accurately solve complex problems is employed in vision research to learn deep neural models for a variety of tasks, including security critical applications. However, it is now known that deep learning is vulnerable to adversarial attacks that can manipulate its predictions by introducing visually imperceptible perturbations in images and videos. Since the discovery of this phenomenon in 2013, it has attracted significant attention of researchers from multiple sub-fields of machine intelligence. In 2018, we published the first-ever review of the contributions made by the computer vision community in adversarial attacks on deep learning (and their defenses). Many of those contributions have inspired new directions in this area, which has matured significantly since witnessing the first generation methods. Hence, as a legacy sequel of our first literature survey, this review article focuses on the advances in this area since 2018. We thoroughly discuss the first generation attacks and comprehensively cover the modern attacks and their defenses appearing in the prestigious sources of computer vision and machine learning research. Besides offering the most comprehensive literature review of adversarial attacks and defenses to date, the article also provides concise definitions of technical terminologies for the non-experts. Finally, it discusses challenges and future outlook of this direction based on the literature since the advent of this research direction.},
keywords = {Adversarial Attacks},
pubstate = {published},
tppubtype = {article}
}
Xia, Haifeng; Jing, Taotao; Chen, Chen; Ding, Zhengming
Semi-supervised Domain Adaptive Retrieval via Discriminative Hashing Learning Conference
ACM Multimedia (ACM MM) (Oral), 2021.
Tags: ACM MM | Links:
@conference{nokey,
title = {Semi-supervised Domain Adaptive Retrieval via Discriminative Hashing Learning},
author = {Haifeng Xia and Taotao Jing and Chen Chen and Zhengming Ding},
url = {https://www.crcv.ucf.edu/chenchen/DHLing_MM_2021.pdf},
doi = {10.1145/3474085.3475526},
year = {2021},
date = {2021-10-20},
booktitle = {ACM Multimedia (ACM MM), 2021 (Oral)},
keywords = {ACM MM},
pubstate = {published},
tppubtype = {conference}
}
Zaeemzadeh, Alireza; Ghadar, Shabnam; Faieta, Baldo; Lin, Zhe; Rahnavard, Nazanin; Shah, Mubarak; Kalarot, Ratheesh
Face Image Retrieval with Attribute Manipulation Conference
International Conference on Computer Vision, 2021.
Abstract | Tags: ICCV, Style GAN, Visual Search | Links:
@conference{ZaeemzadehICCV2021,
title = {Face Image Retrieval with Attribute Manipulation},
author = {Alireza Zaeemzadeh and Shabnam Ghadar and Baldo Faieta and Zhe Lin and Nazanin Rahnavard and Mubarak Shah and Ratheesh Kalarot},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/06328.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/06328-supp.pdf},
year = {2021},
date = {2021-10-11},
urldate = {2021-10-11},
booktitle = {International Conference on Computer Vision},
abstract = {Current face image retrieval solutions are limited, since they treat different facial attributes the same and cannot incorporate
user’s preference for a subset of attributes in their search criteria. This paper introduces a new face image retrieval framework, where the input face query is augmented by both an adjustment vector that specifies the desired modifications
to the facial attributes, and a preference vector that assigns different levels of importance to different attributes. For example, a user can ask for retrieving images similar to a query image, but with a different hair color, and no preference for absence/presence of eyeglasses in the results. To achieve this, we propose to disentangle the semantics, corresponding to various attributes, by learning a set of sparse and orthogonal basis vectors in the latent space of StyleGAN. Such basis vectors are then employed to decompose the dissimilarity between face images in terms of dissimilarity between their attributes, assign preference to the attributes, and adjust the attributes in the query. Enforcing sparsity on the basis vectors helps us to disentangle the latent space and adjust each attribute independently from other attributes, while enforcing orthogonality facilitates preference assignment and the dissimilarity decomposition. The effectiveness of our approach is illustrated by achieving state-of-the-art results for the face image retrieval task. },
keywords = {ICCV, Style GAN, Visual Search},
pubstate = {published},
tppubtype = {conference}
}
Regmi, Krishna; Shah, Mubarak
Video Geo-Localization Employing Geo-Temporal Feature Learning and GPS Trajectory Smoothing Conference
International Conference on Computer Vision, 2021.
Abstract | Tags: Geo-Localization, ICCV, Transformers, Video Geo-localization | Links:
@conference{RegmiICCV2021,
title = {Video Geo-Localization Employing Geo-Temporal Feature Learning and GPS Trajectory Smoothing},
author = {Krishna Regmi and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/video_geolocalization_ICCV2021.pdf
https://github.com/kregmi/VTE},
year = {2021},
date = {2021-10-11},
urldate = {2021-10-11},
booktitle = {International Conference on Computer Vision},
abstract = {In this paper, we address the problem of video geolocalization by proposing a Geo-Temporal Feature Learning (GTFL) Network to simultaneously learn the discriminative features for the query video frames and the gallery images for estimating the geo-spatial trajectory of a query video. Based on a transformer encoder architecture, our
GTFL model encodes query and gallery data separately, via two dedicated branches. The proposed GPS Loss and Clip Triplet Loss exploit the geographical and temporal proximity between the frames and the clips to jointly learn the query and the gallery features. We also propose a deep learning approach to trajectory smoothing by predicting the outliers in the estimated GPS positions and learning the offsets to smooth the trajectory. We build a large dataset from four different regions of USA; New York, San Francisco, Berkeley and Bay Area using BDD driving videos as query, and by collecting corresponding Google StreetView (GSV) Images for gallery. Extensive
evaluations of proposed method on this new dataset are provided. Code and dataset details is publicly available at
https://github.com/kregmi/VTE.},
keywords = {Geo-Localization, ICCV, Transformers, Video Geo-localization},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Ce; Zhu, Sijie; Mendieta, Matias; Yang, Taojiannan; Chen, Chen; Ding, Zhengming
3D Human Pose Estimation with Spatial and Temporal Transformers Conference
International Conference on Computer Vision, 2021.
Tags: Action Recognition, Human Pose Estimation, ICCV, Pose | Links:
@conference{nokey,
title = {3D Human Pose Estimation with Spatial and Temporal Transformers},
author = {Ce Zheng and Sijie Zhu and Matias Mendieta and Taojiannan Yang and Chen Chen and Zhengming Ding},
url = {https://arxiv.org/pdf/2103.10455.pdf},
year = {2021},
date = {2021-10-11},
urldate = {2021-10-11},
booktitle = {International Conference on Computer Vision},
keywords = {Action Recognition, Human Pose Estimation, ICCV, Pose},
pubstate = {published},
tppubtype = {conference}
}
Bhunia, Ankan Kumar; Khan, Salman; Cholakkal, Hisham; Anwer, Rao Muhammad; Khan, Fahad Shahbaz; Shah, Mubarak
Handwriting Transformers Conference
International Conference on Computer Vision, 2021.
Abstract | Tags: ICCV, Transformers | Links:
@conference{BhuniaICCV2021,
title = {Handwriting Transformers},
author = {Ankan Kumar Bhunia and Salman Khan and Hisham Cholakkal and Rao Muhammad Anwer and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Handwriting_Generation_ICCV21.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Handwriting_Generation_ICCV21_supp.pdf},
year = {2021},
date = {2021-10-11},
urldate = {2021-10-11},
booktitle = {International Conference on Computer Vision},
abstract = {We propose a novel transformer-based styled handwritten text image generation approach, HWT, that strives to learn both style-content entanglement as well as global and local writing style patterns. The proposed HWT captures the long and short range relationships within the style examples through a self-attention mechanism, thereby encoding both global and local style patterns. Further, the proposed transformer-based HWT comprises an encoder-decoder attention that enables style-content entanglement by gathering the style representation of each query character. To the best of our knowledge, we are the first to introduce a transformer-based generative network for styled handwritten text generation.
Our proposed HWT generates realistic styled handwritten text images and significantly outperforms the state-of-the-art demonstrated through extensive qualitative, quantitative and human-based evaluations. The proposed HWT can handle arbitrary length of text and any desired writing style in a few-shot setting. Further, our HWT generalizes well to the challenging scenario where both words and writing style are unseen during training, generating realistic styled handwritten text images.},
keywords = {ICCV, Transformers},
pubstate = {published},
tppubtype = {conference}
}
Narayan, Sanath; Gupta, Akshita; Khan, Salman; Khan, Fahad Shahbaz; Shao, Ling; Shah, Mubarak
Discriminative Region-based Multi-Label Zero-Shot Learning Conference
International Conference on Computer Vision, 2021.
Abstract | Tags: ICCV, Transformers, Zero-Shot Learning | Links:
@conference{NarayanICCV2021,
title = {Discriminative Region-based Multi-Label Zero-Shot Learning},
author = {Sanath Narayan and Akshita Gupta and Salman Khan and Fahad Shahbaz Khan and Ling Shao and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/02617.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/02617-supp.pdf},
year = {2021},
date = {2021-10-11},
urldate = {2021-10-11},
booktitle = {International Conference on Computer Vision},
abstract = {Multi-label zero-shot learning (ZSL) is a more realistic counterpart of standard single-label ZSL since several objects can co-exist in a natural image. However, the occurrence of multiple objects complicates the reasoning and requires region-specific processing of visual features to preserve their contextual cues. We note that the best existing multi-label ZSL method takes a shared approach towards attending to region features with a common set of attention maps for all the classes. Such shared maps lead to diffused attention, which does not discriminatively focus on relevant locations when the number of classes are large. Moreover, mapping spatially-pooled visual features to the class semantics leads to inter-class feature entanglement, thus hampering the classification. Here, we propose an alternate approach towards region-based discriminability-preserving multi-label zero-shot classification. Our approach maintains the spatial resolution to preserve region-level characteristics and utilizes a bi-level attention module (BiAM) to enrich the features by incorporating both region and scene context information. The enriched region-level features are then mapped to the class semantics and only their class predictions are spatially pooled to obtain image-level predictions, thereby keeping the multi-class features
disentangled. Our approach sets a new state of the art on two large-scale multi-label zero-shot benchmarks: NUS-WIDE and Open Images. On NUS-WIDE, our approach achieves an absolute gain of 6.9% mAP for ZSL, compared to the best published results. Source code is available at <a href="https://github.com/akshitac8/BiAM" target="_blank">https://github.com/akshitac8/BiAM</a>.},
keywords = {ICCV, Transformers, Zero-Shot Learning},
pubstate = {published},
tppubtype = {conference}
}
Chen, Brian; Rouditchenko, Andrew; Duarte, Kevin; Kuehne, Hilde; Thomas, Samuel; Boggust, Angie; Panda, Rameswar; Kingsbury, Brian; Feris, Rogerio; Harwath, David; Glass, James; Picheny, Michael; Chang, Shih-Fu
Multimodal Clustering Networks for Self-supervised Learning from Unlabeled Videos Conference
International Conference on Computer Vision, 2021.
Abstract | Tags: ICCV, Multi-Modal Learning | Links:
@conference{ChenICCV2021,
title = {Multimodal Clustering Networks for Self-supervised Learning from Unlabeled Videos},
author = {Brian Chen and Andrew Rouditchenko and Kevin Duarte and Hilde Kuehne and Samuel Thomas and Angie Boggust and Rameswar Panda and Brian Kingsbury and Rogerio Feris and David Harwath and James Glass and Michael Picheny and Shih-Fu Chang},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/02965.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/02965-supp.pdf},
year = {2021},
date = {2021-10-11},
urldate = {2021-10-11},
booktitle = {International Conference on Computer Vision},
abstract = {Multimodal self-supervised learning is getting more and more attention as it allows not only to train large networks without human supervision but also to search and retrieve data across various modalities. In this context, this paper proposes a framework that, starting from a pre-trained backbone, learns a common multimodal embedding space that, in addition to sharing representations across different modalities, enforces a grouping of semantically similar instances.
To this end, we extend the concept of instance-level contrastive learning with a multimodal clustering step in the training pipeline to capture semantic similarities across modalities. The resulting embedding space enables retrieval of samples across all modalities, even from unseen datasets and different domains. To evaluate our approach, we train our model on the HowTo100M dataset and evaluate its zero-shot retrieval capabilities in two challenging domains,
namely text-to-video retrieval, and temporal action localization, showing state-of-the-art results on four different datasets. },
keywords = {ICCV, Multi-Modal Learning},
pubstate = {published},
tppubtype = {conference}
}
Swetha, Sirnam; Kuehne, Hilde; Rawat, Yogesh Singh; Shah, Mubarak
Unsupervised Discriminative Embedding for Sub-Action Learning in Complex Activities Conference
IEEE International Conference on Image Processing, 2021.
Tags: Action Recognition, ICIP, Un-supervised Learning | Links:
@conference{Swetha2021,
title = {Unsupervised Discriminative Embedding for Sub-Action Learning in Complex Activities},
author = {Sirnam Swetha and Hilde Kuehne and Yogesh Singh Rawat and Mubarak Shah },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Unsupervised-Discriminative-Embedding-for-Sub-Action-Learning-in-Complex-Activities.pdf},
year = {2021},
date = {2021-09-19},
urldate = {2021-09-19},
booktitle = {IEEE International Conference on Image Processing},
keywords = {Action Recognition, ICIP, Un-supervised Learning},
pubstate = {published},
tppubtype = {conference}
}
Jiban, Md Jibanul Haque; Hassan, Shah; Mahalanobis, Abhijit
Two-Stream Boosted TCRNET for Range-Tolerant Infra-Red Target Detection Conference
IEEE International Conference on Image Processing, 2021.
Abstract | Tags: ICIP | Links:
@conference{jiban2021icip,
title = {Two-Stream Boosted TCRNET for Range-Tolerant Infra-Red Target Detection},
author = {Md Jibanul Haque Jiban and Shah Hassan and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Two-Stream_TCRNet__IEEE_ICIP2021.pdf},
year = {2021},
date = {2021-09-19},
publisher = {IEEE International Conference on Image Processing},
abstract = {The detection of vehicular targets in infra-red imagery is a challenging task, both due to the relatively few pixels on target and the false alarms produced by the surrounding terrain clutter. It has been previously shown [1] that a relatively simple network (known as TCRNet) can outperform conventional deep CNNs for such applications by maximizing a target to clutter ratio (TCR) metric. In this paper, we introduce a new form of the network (referred to as TCRNet-2) that further improves the performance by first processing target and clutter information in two parallel channels and then combining them to optimize the TCR metric. We also show that the overall performance can be considerably improved by boosting the performance of a primary TCRNet-2 detector, with a secondary
network that enhances discrimination between targets and clutter in the false alarm space of the primary network. We analyze the performance of the proposed networks using a publicly available data set of infra-red images of targets in natural terrain. It is shown that the TCRNet-2 and its boosted version yield considerably better performance than the original TCRNet over a wide range of distances, in both day and night conditions.
Index Terms— TCRNet, Infrared, Target Detection, MWIR, Surveillance},
keywords = {ICIP},
pubstate = {published},
tppubtype = {conference}
}
Arif, Maliha; Mahalanobis, Abhijit
Few Shot Learning for Infra-Red Object Recognition Using Analytically Designed Low Level Filters for Data Representation Conference
IEEE International Conference on Image Processing, 2021.
Abstract | Tags: ICIP | Links:
@conference{ArifICIP2021,
title = {Few Shot Learning for Infra-Red Object Recognition Using Analytically Designed Low Level Filters for Data Representation},
author = {Maliha Arif and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICIP_2021__Sparse_Learning-Camera-Ready.pdf},
year = {2021},
date = {2021-09-19},
publisher = {IEEE International Conference on Image Processing},
abstract = {It is well known that deep convolutional neural networks (CNNs) generalize well over large number of classes when ample training data is available. However, training with smaller datasets does not always achieve robust performance. In such cases, we show that using analytically derived filters in the lowest layer enables a network to achieve better performance than learning from scratch using a relatively small dataset. These class-agnostic filters represent the underlying manifold of the data space, and also generalize to new or unknown classes which may occur on the same manifold. This directly enables new classes to be learned with very few images by simply fine-tuning the final few layers of the network. We illustrate the advantages of our method using the publicly available set of infra-red images of vehicular ground targets. We compare a simple CNN trained using our method with transfer learning performed using the VGG-16 network, and show that when the number of training images is limited, the proposed approach not only achieves better results on the trained classes, but also outperforms a standard network for learning a new object class.
Index Terms— manifold, eigen representation, few shot learning, sparse learning, infra-red datasets},
keywords = {ICIP},
pubstate = {published},
tppubtype = {conference}
}
Cuellar, Adam; Mahalanobis, Abhijit
Detection of Small Moving Ground Vehicles in Cluttered Terrain Using Infrared Video Imagery Conference
IEEE International Conference on Image Processing, 2021.
Abstract | Tags: ICIP | Links:
@conference{CuellarICIP2021,
title = {Detection of Small Moving Ground Vehicles in Cluttered Terrain Using Infrared Video Imagery},
author = {Adam Cuellar and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/atc_icip.pdf},
year = {2021},
date = {2021-09-19},
publisher = {IEEE International Conference on Image Processing},
abstract = {The detection of small moving targets in cluttered infrared imagery remains a difficult and challenging task. Conventional image subtraction techniques with frame-to-frame registration yield very high false alarm rates. Furthermore, state of the art deep convolutional neural networks (DCNNs) such as YOLO and Mask R-CNN also do not work well for this application. We show however, that it is possible to train a CNN to detect moving targets in a stack of stabilized images
by maximizing a target to clutter ratio (TCR) metric. This metric has been previously used for detecting relatively large stationary targets in single images, but not for the purposes of finding small moving targets using multiple frames. Referred to as moving target indicator network (MTINet), the proposed network does not rely on image subtraction, but instead uses depth-wise convolution to learn inter-frame temporal dependencies. We compare the performance of the MTINet to state of the art DCNNs and a statistical anomaly detection algorithm, and propose a combined approach that offers the benefits of both data-driven learning and statistical analysis.
Index Terms— Detection, Localization, Infrared, CNN },
keywords = {ICIP},
pubstate = {published},
tppubtype = {conference}
}
Shiraz, Sarah; Regmi, Krishna; Vyas, Shruti; Rawat, Yogesh Singh; Shah, Mubarak
Novel View Video Prediction using Dual Representation Conference
IEEE International Conference on Image Processing, 2021.
Tags: Cross-View, ICIP, View Synthesis | Links:
@conference{Shiraz2021,
title = {Novel View Video Prediction using Dual Representation},
author = {Sarah Shiraz and Krishna Regmi and Shruti Vyas and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Projects_Novel-View-Video-Prediction-using-dual-Representation.pdf
https://www.crcv.ucf.edu/research/projects/novel-view-video-prediction-using-dual-representation/},
year = {2021},
date = {2021-09-19},
urldate = {2021-09-19},
booktitle = {IEEE International Conference on Image Processing},
keywords = {Cross-View, ICIP, View Synthesis},
pubstate = {published},
tppubtype = {conference}
}
Ott, Aaron; Mazaheri, Amir; da Vitoria Lobo, Niels; Shah, Mubarak
Deep Photo Cropper and Enhancer Journal Article
In: CoRR, vol. abs/2008.00634, 2021.
Tags: Dataset, Image Enhancement, REU | Links:
@article{Ott2021,
title = {Deep Photo Cropper and Enhancer},
author = {Aaron Ott and Amir Mazaheri and Niels da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/08/Publications_DEEP-PHOTO-CROPPER-AND-ENHANCER.pdf},
year = {2021},
date = {2021-08-07},
urldate = {2021-08-07},
booktitle = {IEEE International Conference on Image Processing},
journal = {CoRR},
volume = {abs/2008.00634},
keywords = {Dataset, Image Enhancement, REU},
pubstate = {published},
tppubtype = {article}
}
Regmi, Krishna
Exploring Relationships Between Ground and Aerial Views by Synthesis and Matching PhD Thesis
University of Central Florida, 2021.
Tags: Cross-View, GAN, Geo-Localization, Ph.D. Dissertation, Video Geo-localization | Links:
@phdthesis{Regmi2021,
title = {Exploring Relationships Between Ground and Aerial Views by Synthesis and Matching},
author = {Krishna Regmi},
url = {https://stars.library.ucf.edu/etd2020/747/},
year = {2021},
date = {2021-08-02},
urldate = {2021-08-02},
school = {University of Central Florida},
keywords = {Cross-View, GAN, Geo-Localization, Ph.D. Dissertation, Video Geo-localization},
pubstate = {published},
tppubtype = {phdthesis}
}
Edraki, Marzieh
Implication of Manifold Assumption in Deep Learning Models for Computer Vision Applications PhD Thesis
University of Central Florida, 2021.
Tags: Capsules, Ph.D. Dissertation | Links:
@phdthesis{Edraki2021,
title = {Implication of Manifold Assumption in Deep Learning Models for Computer Vision Applications},
author = {Marzieh Edraki},
url = {https://stars.library.ucf.edu/etd2020/675/},
year = {2021},
date = {2021-08-01},
urldate = {2021-08-01},
school = {University of Central Florida},
keywords = {Capsules, Ph.D. Dissertation},
pubstate = {published},
tppubtype = {phdthesis}
}
Zhu, Sijie; Yang, Taojiannan; Chen, Chen
Visual Explanation for Deep Metric Learning Journal Article
In: IEEE Transactions on Image Processing, 2021.
@article{nokey,
title = {Visual Explanation for Deep Metric Learning},
author = {Sijie Zhu and Taojiannan Yang and Chen Chen},
url = {https://arxiv.org/pdf/1909.12977.pdf},
year = {2021},
date = {2021-07-30},
urldate = {2021-07-30},
journal = {IEEE Transactions on Image Processing},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Wang, Wenxuan; Chen, Chen; Ding, Meng; Li, Jiangyun; Yu, Hong; Zha, Sen
TransBTS: Multimodal Brain Tumor Segmentation Using Transformer Conference
International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI), 2021.
Tags: MICCAI | Links:
@conference{nokey,
title = {TransBTS: Multimodal Brain Tumor Segmentation Using Transformer},
author = {Wenxuan Wang and Chen Chen and Meng Ding and Jiangyun Li and Hong Yu and Sen Zha},
url = {https://arxiv.org/pdf/2103.04430.pdf},
year = {2021},
date = {2021-06-26},
booktitle = {International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)},
keywords = {MICCAI},
pubstate = {published},
tppubtype = {conference}
}
Arif, Maliha; Mahalanobis, Abhijit
Infrared Target Recognition Using Realistic Training Images Generated by Modifying Latent Features of an Encoder-Decoder Network Journal Article
In: IEEE Transactions on Aerospace and Electronic Systems, 2021.
Abstract | Tags: TAES | Links:
@article{ArifTAES2021,
title = {Infrared Target Recognition Using Realistic Training Images Generated by Modifying Latent Features of an Encoder-Decoder Network},
author = {Maliha Arif and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/IEEE_TAES_Revised_double_Column.pdf},
doi = {10.1109/TAES.2021.3090921},
year = {2021},
date = {2021-06-22},
urldate = {2021-06-22},
journal = {IEEE Transactions on Aerospace and Electronic Systems},
abstract = {Generating realistic images has been a challenging problem in computer vision, with many researchers focusing on novel methods and datasets to produce benchmark results. Our motivation for the same arises from the dearth of real training images for recognizing targets in infrared imagery. We propose an encoder-decoder architecture for generating realistic medium wave infrared images of targets at various azimuth angles, in day or night conditions, and at different ranges. Specifically, we use a CNN-based siamese autoencoder network that modifies the latent space embedding of a given input view to produce a novel output view. First, we train this network with a limited set of real images of the targets, and show that it can generate new and previously unseen views of the same. We show that the network operates in the non-linear feature subspace and learns the underlying manifold to develop a semantic understanding of the targets. We use the structural similarity index measure (SSIM) to quantify how the generated and real images of targets compare. Finally, we show classifiers trained with the generated images are able to recognize targets in real test images.
Index Terms—ATR Classification, view prediction, deep convolutional autoencoders, infrared imagery},
keywords = {TAES},
pubstate = {published},
tppubtype = {article}
}
Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
PLM: Partial Label Masking for Imbalanced Multi-label Classification Workshop
IEEE Conference on Computer Vision and Pattern Recognition, Learning from Limited or Imperfect Data (L2ID) Workshop, 2021.
Tags: CVPRW, Imbalanced Dataset, Multi-Label | Links:
@workshop{Duarte2021,
title = {PLM: Partial Label Masking for Imbalanced Multi-label Classification},
author = {Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PLM.pdf},
year = {2021},
date = {2021-06-20},
urldate = {2021-06-20},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition, Learning from Limited or Imperfect Data (L2ID) Workshop},
keywords = {CVPRW, Imbalanced Dataset, Multi-Label},
pubstate = {published},
tppubtype = {workshop}
}
Kumar, Aakash; Kini, Jyoti; Shah, Mubarak; Mian, Ajmal
PC-DAN: Point Cloud based Deep Affinity Network for 3D Multi-Object Tracking Workshop
IEEE Conference on Computer Vision and Pattern Recognition, 2nd Workshop on Visual Perception for Navigation in Human Environments - The JackRabbot Social Grouping and Activity Dataset and Benchmark, 2021.
Tags: Contest, CVPRW, LIDAR, Tracking | Links:
@workshop{Kumar2021,
title = {PC-DAN: Point Cloud based Deep Affinity Network for 3D Multi-Object Tracking},
author = {Aakash Kumar and Jyoti Kini and Mubarak Shah and Ajmal Mian},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PC-DAN.pdf},
year = {2021},
date = {2021-06-20},
urldate = {2021-06-20},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition, 2nd Workshop on Visual Perception for Navigation in Human Environments - The JackRabbot Social Grouping and Activity Dataset and Benchmark},
keywords = {Contest, CVPRW, LIDAR, Tracking},
pubstate = {published},
tppubtype = {workshop}
}
Gagne, Crystal; Kini, Jyoti; Smith, Daniel; Shah, Mubarak
Florida Wildlife Camera Trap Dataset Workshop
IEEE Conference on Computer Vision and Pattern Recognition, CV4Animals: Computer Vision for Animal Behavior Tracking and Modeling Workshop, 2021.
Tags: CVPRW, Dataset, Wildlife Preservation | Links:
@workshop{Gagne2021,
title = {Florida Wildlife Camera Trap Dataset},
author = {Crystal Gagne and Jyoti Kini and Daniel Smith and Mubarak Shah },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Projects_Florida-Wildlife-Camera-Trap-Dataset.pdf
https://www.crcv.ucf.edu/research/projects/florida-wildlife-camera-trap-dataset/},
year = {2021},
date = {2021-06-20},
urldate = {2021-06-20},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition, CV4Animals: Computer Vision for Animal Behavior Tracking and Modeling Workshop},
keywords = {CVPRW, Dataset, Wildlife Preservation},
pubstate = {published},
tppubtype = {workshop}
}
Georgescu, Mariana Iuliana; Bărbălău, Antonio; Ionescu, Radu Tudor; Khan, Fahad Shahbaz; Popescu, Marius; Shah, Mubarak
Anomaly Detection in Video via Self-Supervised and Multi-Task Learning Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2021.
Abstract | Tags: Anomaly Detection, Multi-Task Learning, Self-Supervised Learning | Links:
@conference{georgescu2020anomaly,
title = {Anomaly Detection in Video via Self-Supervised and Multi-Task Learning},
author = {Mariana Iuliana Georgescu and Antonio Bărbălău and Radu Tudor Ionescu and Fahad Shahbaz Khan and Marius Popescu and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Publications_Anomaly-Detection-in-Video-via-Self-Supervised-and-Multi-Task-Learning.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Publications_Anomaly-Detection-in-Video-via-Self-Supervised-and-Multi-Task-Learning_Supp.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Anomaly-Detection-in-Video-via-Self-Supervised-and-Multi-Task-Learning.pptx
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Anomaly-Detection-in-Video-via-Self-Supervised-and-Multi-Task-Learning.mp4},
year = {2021},
date = {2021-06-19},
urldate = {2021-06-19},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
abstract = {Anomaly detection in video is a challenging computer vision problem. Due to the lack of anomalous events at training time, anomaly detection requires the design of learning methods without full supervision. In this paper, we approach anomalous event detection in video through self-supervised and multi-task learning at the object level. We first utilize a pre-trained detector to detect objects. Then, we train a 3D convolutional neural network to produce discriminative anomaly-specific information by jointly learning multiple proxy tasks: three self-supervised and one based on knowledge distillation. The self-supervised tasks are: (i) discrimination of forward/backward moving objects (arrow of time), (ii) discrimination of objects in consecutive/intermittent frames (motion irregularity) and (iii) reconstruction of object-specific appearance information. The knowledge distillation task takes into account both classification and detection information, generating large prediction discrepancies between teacher and student models when anomalies occur. To the best of our knowledge, we are the first to approach anomalous event detection in video as a multi-task learning problem, integrating multiple self-supervised and knowledge distillation proxy tasks in a single architecture. Our lightweight architecture outperforms the state-of-the-art methods on three benchmarks: Avenue, ShanghaiTech and UCSD Ped2. Additionally, we perform an ablation study demonstrating the importance of integrating self-supervised learning and normality-specific distillation in a multi-task learning setting.},
keywords = {Anomaly Detection, Multi-Task Learning, Self-Supervised Learning},
pubstate = {published},
tppubtype = {conference}
}
Ashraf, Muhammad Waseem; Sultani, Waqas; Shah, Mubarak
Dogfight: Detecting Drones from Drones Videos Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2021.
Tags: CVPR, Drone Video Analysis, Object Detection, UAV Video Analysis | Links:
@conference{Sultani2021,
title = {Dogfight: Detecting Drones from Drones Videos },
author = {Muhammad Waseem Ashraf and Waqas Sultani and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Dogfight.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Dogfight_Supp.mp4
https://github.com/mwaseema/Drone-Detection
http://im.itu.edu.pk/dogfight-detecting-drones-from-drones-videos/
https://docs.google.com/presentation/d/1huBSbYzyNUCs-gJHdSU2CZ0XfRlBWGOQ/edit#slide=id.p1},
year = {2021},
date = {2021-06-19},
urldate = {2021-06-19},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
keywords = {CVPR, Drone Video Analysis, Object Detection, UAV Video Analysis},
pubstate = {published},
tppubtype = {conference}
}
Tirupattur, Praveen; Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
Modeling Multi-Label Action Dependencies for Temporal Action Localization Conference
IEEE Conference on Computer Vision and Pattern Recognition (Oral), 2021.
Tags: CVPR, Transformers | Links:
@conference{Tirupattur2021,
title = {Modeling Multi-Label Action Dependencies for Temporal Action Localization},
author = {Praveen Tirupattur and Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Modeling-Multi-Label-Action-Dependencies-for-Temporal-Action-Localization.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Modeling-Multi-Label-Action-Dependencies-for-Temporal-Action-Localization_Supp.zip
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Modeling-Multi-Label-Action-Dependencies-for-Temporal-Action-Localization-CVPR-2021-ORAL.mp4},
year = {2021},
date = {2021-06-19},
urldate = {2021-06-19},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (Oral)},
keywords = {CVPR, Transformers},
pubstate = {published},
tppubtype = {conference}
}
Zaeemzadeh, Alireza; Bisagno, Niccolò; Sambugaro, Zeno; Conci, Nicola; Rahnavard, Nazanin; Shah, Mubarak
Out-of-Distribution Detection Using Union of 1-Dimensional Subspaces Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2021.
Tags: CVPR, Open World, Out of Distribution (OOD) | Links:
@conference{Zaeemzadeh2021,
title = {Out-of-Distribution Detection Using Union of 1-Dimensional Subspaces},
author = {Alireza Zaeemzadeh and Niccolò Bisagno and Zeno Sambugaro and Nicola Conci and Nazanin Rahnavard and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Out-of-Distribution-Detection-Using-Union-of-1-Dimensional-Subspaces.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Out-of-Distribution-Detection-Using-Union-of-1-Dimensional-Subspaces_Supp.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CVPR21_presentation_video.mov},
year = {2021},
date = {2021-06-19},
urldate = {2021-06-19},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
keywords = {CVPR, Open World, Out of Distribution (OOD)},
pubstate = {published},
tppubtype = {conference}
}
Khan, Aisha Urooj; Kuehne, Hilde; Duarte, Kevin; Gan, Chuang; Lobo, Niels Da Vitoria; Shah, Mubarak
Found a Reason for me? Weakly-supervised Grounded Visual Question Answering using Capsules Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2021.
Tags: Capsule Networks, CVPR, Grounding, VQA | Links:
@conference{Khan2021b,
title = {Found a Reason for me? Weakly-supervised Grounded Visual Question Answering using Capsules},
author = {Aisha Urooj Khan and Hilde Kuehne and Kevin Duarte and Chuang Gan and Niels Da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Found-a-Reason-for-me.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Found-a-Reason-for-me_Supp.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/cvpr21_poster_v2.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/cvpr_2021_5min.mp4},
year = {2021},
date = {2021-06-19},
urldate = {2021-06-19},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
keywords = {Capsule Networks, CVPR, Grounding, VQA},
pubstate = {published},
tppubtype = {conference}
}
Rizve, Mamshad Nayeem; Khan, Salman; Khan, Fahad Shahbaz; Shah, Mubarak
Exploring Complementary Strengths of Invariant and Equivariant Representations for Few-Shot Learning Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2021.
Tags: CVPR, Equivariance, Few-Shot Learning, Invariance | Links:
@conference{Rizve2021b,
title = {Exploring Complementary Strengths of Invariant and Equivariant Representations for Few-Shot Learning},
author = {Mamshad Nayeem Rizve and Salman Khan and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Exploring-Complementary-Strengths-of-Invariant-and-Equivariant.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Exploring-Complementary-Strengths-of-Invariant-and-Equivariant_Supp.pdf
https://github.com/nayeemrizve/invariance-equivariance
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/FSL_CVPR2021_Video_Final.mp4},
year = {2021},
date = {2021-06-19},
urldate = {2021-06-19},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
keywords = {CVPR, Equivariance, Few-Shot Learning, Invariance},
pubstate = {published},
tppubtype = {conference}
}
Rizve, Mamshad Nayeem; Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
In Defense of Pseudo-Labeling: An Uncertainty-Aware Pseudo-label Selection Framework for Semi-Supervised Learning Conference
Ninth International Conference on Learning Representations (ICLR), 2021.
Tags: ICLR, Network Calibration, Pseudo-Labeling, Semi-supervised learning | Links:
@conference{Rizve2021,
title = {In Defense of Pseudo-Labeling: An Uncertainty-Aware Pseudo-label Selection Framework for Semi-Supervised Learning},
author = {Mamshad Nayeem Rizve and Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/In-Defense-Of-Pseudo-Labeling.pdf
https://github.com/nayeemrizve/ups
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/UPS_ICLR2021_Slides.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/UPS_ICLR2021_Slides.pptx
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/UPS_Poster_ICLR2021.png},
year = {2021},
date = {2021-05-04},
urldate = {2021-05-04},
booktitle = {Ninth International Conference on Learning Representations (ICLR)},
keywords = {ICLR, Network Calibration, Pseudo-Labeling, Semi-supervised learning},
pubstate = {published},
tppubtype = {conference}
}
Sultani, Waqas; Shah, Mubarak
Human Action Recognition in Drone Videos using a Few Aerial Training Examples Journal Article
In: Computer Vision and Image Understanding, vol. 206, no. 103186, 2021.
Tags: CVIU, Drone Video Analysis, Human Action Recognition, Multi-Task Learning, UAV Video Analysis | Links:
@article{Sultani2020,
title = {Human Action Recognition in Drone Videos using a Few Aerial Training Examples},
author = {Waqas Sultani and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1910.10027.pdf
https://www.crcv.ucf.edu/research/projects/human-action-recognition-in-drone-videos-using-a-few-aerial-training-examples/},
year = {2021},
date = {2021-05-01},
urldate = {2021-05-01},
journal = {Computer Vision and Image Understanding},
volume = {206},
number = {103186},
keywords = {CVIU, Drone Video Analysis, Human Action Recognition, Multi-Task Learning, UAV Video Analysis},
pubstate = {published},
tppubtype = {article}
}
Zaeemzadeh, Alireza
Robust and Scalable Data Representation and Analysis Leveraging Isometric Transformations and Sparsity PhD Thesis
University of Central Florida, 2021.
Tags: Ph.D. Dissertation | Links:
@phdthesis{Zaeemzadeh2021b,
title = {Robust and Scalable Data Representation and Analysis Leveraging Isometric Transformations and Sparsity},
author = {Alireza Zaeemzadeh},
url = {https://www.cecs.ucf.edu/graddefense-old/pdf/13258},
year = {2021},
date = {2021-05-01},
urldate = {2021-05-01},
school = {University of Central Florida},
keywords = {Ph.D. Dissertation},
pubstate = {published},
tppubtype = {phdthesis}
}
Georgescu, Mariana-Iuliana; Ionescu, Radu Tudor; Khan, Fahad Shahbaz; Popescu, Marius; Shah, Mubarak
A Background-Agnostic Framework with Adversarial Training for Abnormal Event Detection in Video Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, 2021.
Tags: Abnormal Event Detection, Anomaly Detection | Links:
@article{Georgescu2021,
title = {A Background-Agnostic Framework with Adversarial Training for Abnormal Event Detection in Video},
author = {Mariana-Iuliana Georgescu and Radu Tudor Ionescu and Fahad Shahbaz Khan and Marius Popescu and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/AED_PAMI-revised-arxiv.pdf},
year = {2021},
date = {2021-04-16},
urldate = {2021-04-16},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Abnormal Event Detection, Anomaly Detection},
pubstate = {published},
tppubtype = {article}
}
Khan, Salman; Naseer, Muzammal; Hayat, Munawar; Zamir, Syed Waqas; Khan, Fahad Shahbaz; Shah, Mubarak
Transformers in Vision: A Survey Technical Report
no. arXiv:2101.01169, 2021.
Tags: Survey, Transformers | Links:
@techreport{Khan2021,
title = {Transformers in Vision: A Survey},
author = {Salman Khan and Muzammal Naseer and Munawar Hayat and Syed Waqas Zamir and Fahad Shahbaz Khan and Mubarak Shah },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Publications_TransformersSurvey.pdf},
year = {2021},
date = {2021-02-02},
urldate = {2021-02-02},
number = {arXiv:2101.01169},
keywords = {Survey, Transformers},
pubstate = {published},
tppubtype = {techreport}
}
Rana, Aayush; Rawat, Yogesh Singh
We don’t Need Thousand Proposals: Single Shot Actor-Action Detection in Videos Conference
IEEE 2021 Winter Conference on Applications of Computer Vision (WACV), 2021.
Tags: WACV | Links:
@conference{Rana2021,
title = {We don’t Need Thousand Proposals: Single Shot Actor-Action Detection in Videos},
author = {Aayush Rana and Yogesh Singh Rawat},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/12/Projects_Single-shot-actor-action-detection-in-videos.pdf
https://www.crcv.ucf.edu/research/projects/we-dont-need-thousand-proposals-single-shot-actor-action-detection-in-videos/
https://youtu.be/GHKpr5VTbV8},
year = {2021},
date = {2021-01-05},
urldate = {2021-01-05},
booktitle = {IEEE 2021 Winter Conference on Applications of Computer Vision (WACV)},
keywords = {WACV},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Ce; Wu, Wenhan; Yang, Taojiannan; Zhu, Sijie; Chen, Chen; Liu, Ruixu; Shen, Ju; Kehtarnavaz, Nasser; Shah, Mubarak
Deep Learning-Based Human Pose Estimation: A Survey Technical Report
no. arXiv:2012.13392, 2021.
Tags: Human Pose Estimation | Links:
@techreport{Zheng2021,
title = {Deep Learning-Based Human Pose Estimation: A Survey},
author = {Ce Zheng and Wenhan Wu and Taojiannan Yang and Sijie Zhu and Chen Chen and Ruixu Liu and Ju Shen and Nasser Kehtarnavaz and Mubarak Shah},
url = {https://arxiv.org/pdf/2012.13392.pdf
https://github.com/zczcwh/DL-HPE},
year = {2021},
date = {2021-01-02},
number = {arXiv:2012.13392},
keywords = {Human Pose Estimation},
pubstate = {published},
tppubtype = {techreport}
}
2020
Khan, Aisha Urooj; Mazaheri, Amir; da Vitoria Lobo, Niels; Shah, Mubarak
MMFT-BERT: Multimodal Fusion Transformer with BERT Encodings for Visual Question Answering Conference
Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings, EMNLP, 2020.
Tags: EMNLP, Visual Question Answering | Links:
@conference{Khan2020b,
title = {MMFT-BERT: Multimodal Fusion Transformer with BERT Encodings for Visual Question Answering},
author = {Aisha Urooj Khan and Amir Mazaheri and Niels da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Publications_MMFT-BERT.pdf},
year = {2020},
date = {2020-11-16},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings, EMNLP},
keywords = {EMNLP, Visual Question Answering},
pubstate = {published},
tppubtype = {conference}
}
Georgescu, Mariana-Iuliana; Barbalau, Antonio; Ionescu, Radu Tudor; Khan, Fahad Shahbaz; Popescu, Marius; Shah, Mubarak
Anomaly Detection in Video via Self-Supervised and Multi-Task Learning Technical Report
no. arXiv:2011.07491, 2020.
Tags: Anomaly Detection, CVPR-2021, Multi-Task Learning, Self-Supervised Learning | Links:
@techreport{Georgescu2020b,
title = {Anomaly Detection in Video via Self-Supervised and Multi-Task Learning},
author = {Mariana-Iuliana Georgescu and Antonio Barbalau and Radu Tudor Ionescu and Fahad Shahbaz Khan and Marius Popescu and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Publicatons_Anomaly-Detection-in-Video-via-Self-Supervised-and-Multi-Task-Learning.pdf},
year = {2020},
date = {2020-11-15},
urldate = {2020-11-15},
number = {arXiv:2011.07491},
howpublished = {CVPR-2021},
keywords = {Anomaly Detection, CVPR-2021, Multi-Task Learning, Self-Supervised Learning},
pubstate = {published},
tppubtype = {techreport}
}
McIntosh, Bruce; Venkataramanan, Shashanka; Mahalanobis, Abhijit
Target Detection in Cluttered Environments Using Infra-Red Images Conference
IEEE International Conference on Image Processing, 2020.
Abstract | Tags: ICIP | Links:
@conference{McIntosh2020,
title = {Target Detection in Cluttered Environments Using Infra-Red Images},
author = {Bruce McIntosh and Shashanka Venkataramanan and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/icip2020_photoready.pdf},
year = {2020},
date = {2020-10-28},
booktitle = {IEEE International Conference on Image Processing},
abstract = {The detection of targets in infra-red imagery is a challenging problem which involves locating small targets in heavily
cluttered environments while maintaining a low false alarm rate. We propose a network that optimizes a “target to clutter
ratio”(TCR) metric defined as the ratio of the output energies produced by the network in response to targets and clutter.
We show that for target detection, it is advantageous to analytically derive the first layer of a CNN to maximize the
TCR metric, and then train the rest of the network to optimize the same cost function. We evaluate the performance of the
resulting network using a public domain MWIR data set released by the US Army’s Night Vision Laboratories, and compare
it to the state-of-the-art detectors such as Faster RCNN and Yolo-v3. Referred to as the TCRNet, the proposed network
demonstrates state of the art results with greater than 30% improvement in probability of detection while reducing
the false alarm rate by more than a factor of 2 when compared to these leading methods. Ablation studies also show that the
proposed approach and metric are superior to learning the entire network from scratch, or using conventional regression
metrics such as the mean square error (MSE). },
keywords = {ICIP},
pubstate = {published},
tppubtype = {conference}
}
Georgescu, Mariana-Iuliana; Ionescu, Radu Tudor; Khan, Fahad Shahbaz; Popescu, Marius; Shah, Mubarak
A Scene-Agnostic Framework with Adversarial Training for Abnormal Event Detection in Video Technical Report
no. arXiv:2008.12328, 2020.
Tags: Anomaly Detection | Links:
@techreport{Georgescu2020,
title = {A Scene-Agnostic Framework with Adversarial Training for Abnormal Event Detection in Video},
author = {Mariana-Iuliana Georgescu and Radu Tudor Ionescu and Fahad Shahbaz Khan and Marius Popescu and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/08/Publications_A-Scene-Agnostic-Framework-with-Adversarial-Training-for-Abnormal-Event-Detection-in-Video.pdf},
year = {2020},
date = {2020-08-27},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 14, No. 8},
number = {arXiv:2008.12328},
keywords = {Anomaly Detection},
pubstate = {published},
tppubtype = {techreport}
}
Sun, ShiJie; Akhtar, Naveed; Song, XiangYu; Song, HuanSheng; Mian, Ajmal; Shah, Mubarak
Simultaneous Detection and Tracking with Motion Modelling for Multiple Object Tracking Conference
16th European Conference on Computer Vision, 2020.
Abstract | Tags: ECCV, Tracking | Links:
@conference{Sun2020,
title = {Simultaneous Detection and Tracking with Motion Modelling for Multiple Object Tracking},
author = {ShiJie Sun and Naveed Akhtar and XiangYu Song and HuanSheng Song and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Simultaneous-Detection-and-Tracking-with-Motion-Modelling-for-Multiple-Object-Tracking.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Simultaneous-Detection-and-Tracking-with-Motion-Modelling-for-Multiple-Object-Tracking_Supp.pdf
https://shijies.github.io/DMMN_Page/},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {Deep learning based Multiple Object Tracking (MOT) currently relies on off-the-shelf detectors for tracking-by-detection. This results in deep models that are detector biased and evaluations that are detector influenced. To resolve this issue, we introduce Deep Motion Modeling Network (DMM-Net) that can estimate multiple objects’ motion parameters to perform joint detection and association in an end-to-end manner. DMM-Net models object features over multiple frames and simultaneously infers object classes, visibility and their motion parameters. These outputs are readily used to update the tracklets for efficient MOT. DMM-Net achieves PR-MOTA score of 12.80 @ 120+ fps for the popular UA-DETRAC challenge - which is better performance and orders of magnitude faster. We also contribute a synthetic large-scale public dataset Omni-MOT for vehicle tracking that provides precise ground-truth annotations to eliminate the detector influence in MOT evaluation. This 14M+ frames dataset is extendable with our public script (Code at Dataset, Dataset Recorder, Omni-MOT Source). We demonstrate the suitability of Omni-MOT for deep learning with DMM-Net, and also make the source code of our network public.},
keywords = {ECCV, Tracking},
pubstate = {published},
tppubtype = {conference}
}
Xie, Jin; Cholakkal, Hisham; Anwer, Rao Muhammad; Khan, Fahad Shahbaz; Pang, Yanwei; Shao, Ling; Shah, Mubarak
Count- and Similarity-aware R-CNN for Pedestrian Detection Conference
16th European Conference on Computer Vision, 2020.
Abstract | Tags: Detection, ECCV | Links:
@conference{Xie2020,
title = {Count- and Similarity-aware R-CNN for Pedestrian Detection},
author = {Jin Xie and Hisham Cholakkal and Rao Muhammad Anwer and Fahad Shahbaz Khan and Yanwei Pang and Ling Shao and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Count-and-Similarity-aware-R-CNN-for-Pedestrian-Detection.pdf},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {Recent pedestrian detection methods generally rely on additional supervision, such as visible bounding-box annotations, to handle heavy occlusions. We propose an approach that leverages pedestrian count and proposal similarity information within a two-stage pedestrian detection framework. Both pedestrian count and proposal similarity are derived from standard full-body annotations commonly used to train pedestrian detectors. We introduce a count-weighted detection loss function that assigns higher weights to the detection errors occurring at highly overlapping pedestrians. The proposed loss function is utilized at both stages of the two-stage detector. We further introduce a count-and-similarity branch within the two-stage detection framework, which predicts pedestrian count as well as proposal similarity to identify distinct proposals. Our approach requires neither part information nor visible bounding-box annotations. Experiments are performed on the CityPersons and CrowdHuman datasets. Our method sets a new state-of-the-art on both datasets. Further, it achieves an absolute gain of 2.4% over the current state-of-the-art, in terms of log-average miss rate, on the heavily occluded (HO) set of CityPersons test set, without using additional visible bounding-box supervision. Finally, we demonstrate the applicability of our approach for the problem of human instance segmentation. Code and models are available at: https://github.com/Leotju/CaSe.},
keywords = {Detection, ECCV},
pubstate = {published},
tppubtype = {conference}
}
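The count-weighted detection loss mentioned in the abstract above assigns larger weights to errors at heavily overlapping pedestrians. The snippet below is a minimal sketch of that idea under stated assumptions (a per-proposal loss vector and an overlap count per proposal); it is not the authors' released implementation, and alpha is a made-up weighting constant.
import numpy as np

def count_weighted_loss(per_proposal_loss, overlap_counts, alpha=1.0):
    # per_proposal_loss: array [N], detection loss (classification + regression) per proposal.
    # overlap_counts: array [N], number of ground-truth pedestrians each proposal overlaps.
    # Proposals lying on several overlapping pedestrians contribute with a larger weight.
    weights = 1.0 + alpha * np.maximum(np.asarray(overlap_counts) - 1, 0)
    return float(np.mean(weights * np.asarray(per_proposal_loss)))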
Schatz, Kara Marie; Quintanilla, Erik; Vyas, Shruti; Rawat, Yogesh Singh
A Recurrent Transformer Network for Novel View Action Synthesis Conference
16th European Conference on Computer Vision, 2020.
Abstract | Tags: Cross-View, ECCV, REU, View Action Synthesis, View Synthesis | Links:
@conference{Schatz2020,
title = {A Recurrent Transformer Network for Novel View Action Synthesis},
author = {Kara Marie Schatz and Erik Quintanilla and Shruti Vyas and Yogesh Singh Rawat},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Projects_A-Recurrent-Transformer-Network-for-Novel-View-Action-Synthesis.pdf
https://www.crcv.ucf.edu/research/projects/a-recurrent-transformer-network-for-novel-view-action-synthesis/},
year = {2020},
date = {2020-08-23},
urldate = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {In this work, we address the problem of synthesizing human actions from novel views. Given an input video of an actor performing some action, we aim to synthesize a video with the same action performed from a novel view with the help of an appearance prior. We propose an end-to-end deep network to solve this problem. The proposed network utilizes the change in viewpoint to transform the action from the input view to the novel view in feature space. The transformed action is integrated with the target appearance using the proposed recurrent transformer network, which provides a transformed appearance for each time-step in the action sequence. The recurrent transformer network utilize action key-points which are determined in an unsupervised approach using the encoded action features. We also propose a hierarchical structure for the recurrent transformation which further improves the performance. We demonstrate the effectiveness of the proposed method through extensive experiments conducted on a large-scale multi-view action recognition NTU-RGB+D dataset. In addition, we show that the proposed method can transform the action to a novel viewpoint with an entirely different scene or actor. The code is publicly available at https://github.com/schatzkara/cross-view-video.},
keywords = {Cross-View, ECCV, REU, View Action Synthesis, View Synthesis},
pubstate = {published},
tppubtype = {conference}
}
Vyas, Shruti; Rawat, Yogesh Singh; Shah, Mubarak
Multi-view Action Recognition using Cross-view Video Prediction Conference
16th European Conference on Computer Vision, 2020.
Abstract | Tags: Cross-View Video Prediction, ECCV | Links:
@conference{Vyas2020,
title = {Multi-view Action Recognition using Cross-view Video Prediction},
author = {Shruti Vyas and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Projects_Multi-view-Action-Recognition-using-Cross-view-Video-Prediction.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Projects_Multi-view-Action-Recognition-using-Cross-view-Video-Prediction_Supp.pdf
https://www.crcv.ucf.edu/research/projects/multi-view-action-recognition-using-cross-view-video-prediction/},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {In this work, we address the problem of action recognition in a multi-view environment. Most of the existing approaches utilize pose information for multi-view action recognition. We focus on RGB modality instead and propose an unsupervised representation learning framework, which encodes the scene dynamics in videos captured from multiple viewpoints via predicting actions from unseen views. The framework takes multiple short video clips from different viewpoints and time as input and learns an holistic internal representation which is used to predict a video clip from an unseen viewpoint and time. The ability of the proposed network to render unseen video frames enables it to learn a meaningful and robust representation of the scene dynamics. We evaluate the effectiveness of the learned representation for multiview video action recognition in a supervised approach. We observe a significant improvement in the performance with RGB modality on NTU-RGB+D dataset, which is the largest dataset for multi-view action recognition. The proposed framework also achieves state-of-the-art results with depth modality, which validates the generalization capability of the approach to other data modalities. The code is publicly available at https://github.com/svyas23/cross-view-action.},
keywords = {Cross-View Video Prediction, ECCV},
pubstate = {published},
tppubtype = {conference}
}
Venkataramanan, Shashanka; Peng, Kuan-Chuan; Singh, Rajat Vikram; Mahalanobis, Abhijit
Attention Guided Anomaly Localization in Images Conference
16th European Conference on Computer Vision, 2020.
Abstract | Tags: ECCV | Links:
@conference{Venkataramanan2020,
title = {Attention Guided Anomaly Localization in Images},
author = {Shashanka Venkataramanan and Kuan-Chuan Peng and Rajat Vikram Singh and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/2813.pdf
https://youtu.be/b-EQr-fGPWo},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {Anomaly localization is an important problem in computer vision which involves localizing anomalous regions within images with applications in industrial inspection, surveillance, and medical imaging. This task is challenging due to the small sample size and pixel coverage of the anomaly in real-world scenarios. Most prior works need to use anomalous training images to compute a class-specific threshold to localize anomalies. Without the need of anomalous training images, we propose Convolutional Adversarial Variational autoencoder with Guided Attention (CAVGA), which localizes the anomaly with a convolutional latent variable to preserve the spatial information. In the unsupervised setting, we propose an attention expansion loss where we encourage CAVGA to focus on all normal regions in the image. Furthermore, in the weakly supervised setting we propose a complementary guided attention loss, where we encourage the attention map to focus on all normal regions while minimizing the attention map corresponding to anomalous regions in the image. CAVGA outperforms the state-of-the-art (SOTA) anomaly localization methods on MVTec Anomaly Detection (MVTAD), modified ShanghaiTech Campus (mSTC) and Large-scale Attention based Glaucoma (LAG) datasets in the unsupervised setting and when using only 2% anomalous images in the weakly-supervised setting. CAVGA also outperforms SOTA anomaly detection methods on the MNIST, CIFAR-10, Fashion-MNIST, MVTAD, mSTC and LAG datasets.},
keywords = {ECCV},
pubstate = {published},
tppubtype = {conference}
}
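The attention expansion loss described in the abstract above encourages the attention map to cover all normal regions of a normal training image. Below is a minimal sketch of that idea, assuming the attention map is already normalized to [0, 1]; it illustrates the stated objective rather than reproducing the authors' CAVGA implementation.
import numpy as np

def attention_expansion_loss(attention_map):
    # attention_map: array with values in [0, 1] (e.g. shape (H, W)) for a normal image.
    # Covering all normal regions amounts to pushing the mean attention toward 1.
    return float(1.0 - np.mean(attention_map))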
LaLonde, Rodney
Algorithms and Applications of Novel Capsule Networks PhD Thesis
University of Central Florida, 2020.
Tags: Ph.D. Dissertation | Links:
@phdthesis{LaLonde2020b,
title = {Algorithms and Applications of Novel Capsule Networks},
author = {Rodney LaLonde},
url = {https://stars.library.ucf.edu/etd2020/612/},
year = {2020},
date = {2020-08-02},
urldate = {2020-08-02},
school = {University of Central Florida},
keywords = {Ph.D. Dissertation},
pubstate = {published},
tppubtype = {phdthesis}
}
RaviPrakash, Harish
Novel Computational Approaches for Multidimensional Brain Image Analysis PhD Thesis
University of Central Florida, 2020.
Tags: Ph.D. Dissertation | Links:
@phdthesis{nokey,
title = {Novel Computational Approaches for Multidimensional Brain Image Analysis},
author = {Harish RaviPrakash},
url = {https://stars.library.ucf.edu/etd2020/618/},
year = {2020},
date = {2020-08-01},
urldate = {2020-08-01},
school = {University of Central Florida},
keywords = {Ph.D. Dissertation},
pubstate = {published},
tppubtype = {phdthesis}
}
Zhang, Xiaoyu; Mian, Ajmal; Gupta, Rohit; Rahnavard, Nazanin; Shah, Mubarak
Cassandra: Detecting Trojaned Networks from Adversarial Perturbations Technical Report
no. arXiv:2007.14433, 2020.
Tags: Adversarial Attacks | Links:
@techreport{Zhang2020,
title = {Cassandra: Detecting Trojaned Networks from Adversarial Perturbations},
author = {Xiaoyu Zhang and Ajmal Mian and Rohit Gupta and Nazanin Rahnavard and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/08/Publications_Cassandra-Detecting-Trojaned-Networks-from-Adversarial-Perturbations.pdf},
year = {2020},
date = {2020-07-28},
urldate = {2020-07-28},
number = {arXiv:2007.14433},
keywords = {Adversarial Attacks},
pubstate = {published},
tppubtype = {techreport}
}
Edraki, Marzieh; Karim, Nazmul; Rahnavard, Nazanin; Mian, Ajmal; Shah, Mubarak
Odyssey: Creation, Analysis and Detection of Trojan Models Technical Report
no. arXiv:2007.08142, 2020.
Tags: Adversarial Attacks | Links:
@techreport{Edraki2020,
title = {Odyssey: Creation, Analysis and Detection of Trojan Models},
author = {Marzieh Edraki and Nazmul Karim and Nazanin Rahnavard and Ajmal Mian and Mubarak Shah
},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/08/Publications_Odyssey-Creation-Analysis-and-Detection-of-Trojan-Models.pdf
https://www.crcv.ucf.edu/research/projects/odyssey-creation-analysis-and-detection-of-trojan-models/},
year = {2020},
date = {2020-07-16},
number = {arXiv:2007.08142},
keywords = {Adversarial Attacks},
pubstate = {published},
tppubtype = {techreport}
}
Demir, Ugur; Rawat, Yogesh Singh; Shah, Mubarak
TinyVIRAT: Low-resolution Video Action Recognition Technical Report
no. arXiv:2007.07355, 2020.
Tags: Video Action Recognition | Links:
@techreport{Demir2020,
title = {TinyVIRAT: Low-resolution Video Action Recognition},
author = {Ugur Demir and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_TinyVIRAT.pdf
https://www.crcv.ucf.edu/research/projects/tinyvirat-low-resolution-video-action-recognition/},
year = {2020},
date = {2020-07-14},
number = {arXiv:2007.07355},
keywords = {Video Action Recognition},
pubstate = {published},
tppubtype = {techreport}
}
Joneidi, Mohsen; Vahidian, Saeed; Esmaeili, Ashkan; Wang, Weijia; Rahnavard, Nazanin; Lin, Bill; Shah, Mubarak
Select to Better Learn: Fast and Accurate Deep Learning using Data Selection from Nonlinear Manifolds Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2020.
Abstract | Tags: CVPR | Links:
@conference{Jonediei2020,
title = {Select to Better Learn: Fast and Accurate Deep Learning using Data Selection from Nonlinear Manifolds},
author = {Mohsen Joneidi and Saeed Vahidian and Ashkan Esmaeili and Weijia Wang and Nazanin Rahnavard and Bill Lin and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/04/Select-to-Better-Learn.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2020/04/Select-to-Better-Learn_Supplementary.pdf},
year = {2020},
date = {2020-06-14},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
abstract = {Finding a small subset of data whose linear combination spans other data points, also called column subset selection problem (CSSP), is an important open problem in computer science with many applications in computer vision and deep learning such as the ones shown in Fig. 1. There are some studies that solve CSSP in a polynomial time complexity w.r.t. the size of the original dataset. A simple and efficient selection algorithm with a linear complexity order, referred to as spectrum pursuit (SP), is proposed that pursuits spectral components of the dataset using available sample points. The proposed non-greedy algorithm aims to iteratively find K data samples whose span is close to that of the first K spectral components of entire data. SP has no parameter to be fine tuned and this desirable property makes it problem-independent. The simplicity of SP enables us to extend the underlying linear model to more complex models such as nonlinear manifolds and graph-based models. The nonlinear extension of SP is introduced as kernel-SP (KSP). The superiority of the proposed algorithms is demonstrated in a wide range of applications.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
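The abstract above describes spectrum pursuit (SP) as iteratively picking K samples whose span approximates that of the first K spectral components of the data. The loop below is only a rough stand-in for that style of selection (plain SVD plus nearest-sample matching and deflation); it is not the authors' SP algorithm, and select_representatives is an invented name.
import numpy as np

def select_representatives(X, K):
    # X: data matrix of shape (n_samples, n_features).
    selected = []
    residual = X.astype(float).copy()
    for _ in range(K):
        # Dominant spectral component of the residual data.
        _, _, vt = np.linalg.svd(residual, full_matrices=False)
        direction = vt[0]
        # Pick the not-yet-selected sample most aligned with that component.
        scores = np.abs(X @ direction) / (np.linalg.norm(X, axis=1) + 1e-12)
        scores[selected] = -np.inf
        idx = int(np.argmax(scores))
        selected.append(idx)
        # Deflate: remove the span of the chosen sample from the residual.
        u = X[idx] / (np.linalg.norm(X[idx]) + 1e-12)
        residual = residual - np.outer(residual @ u, u)
    return selected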
McIntosh, Bruce; Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
Visual-textual Capsule Routing for Text-based Video Segmentation Conference
IEEE Conference on Computer Vision and Pattern Recognition (Oral), 2020.
Abstract | Tags: Capsule Networks, CVPR, Video Object Segmentation | Links:
@conference{Duarte2020,
title = {Visual-textual Capsule Routing for Text-based Video Segmentation},
author = {Bruce McIntosh and Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/06/Projects_Visual-textual-Capsule-Routing-for-Text-based-Video-Segmentation.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2020/06/Projects_Visual-textual-Capsule-Routing-for-Text-based-Video-Segmentation_Supplementary.zip
https://www.crcv.ucf.edu/research/projects/visual-textual-capsule-routing-for-text-based-video-segmentation/},
year = {2020},
date = {2020-06-14},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (Oral)},
edition = {(Oral)},
abstract = {Joint understanding of vision and natural language is a challenging problem with a wide range of applications in artificial intelligence. In this work, we focus on integration of video and text for the task of actor and action video segmentation from a sentence. We propose a capsule-based approach which performs pixel-level localization based on a natural language query describing the actor of interest. We encode both the video and textual input in the form of capsules, which provide a more effective representation in comparison with standard convolution based features. Our novel visual-textual routing mechanism allows for the fusion of video and text capsules to successfully localize the actor and action. The existing works on actor-action localization are mainly focused on localization in a single frame instead of the full video. Different from existing works, we propose to perform the localization on all frames of the video. To validate the potential of the proposed network for actor and action video localization, we extend an existing actor-action dataset (A2D) with annotations for all the frames. The experimental evaluation demonstrates the effectiveness of our capsule network for text selective actor and action localization in videos. The proposed method also improves upon the performance of the existing state-of-the art works on single frame-based localization. },
keywords = {Capsule Networks, CVPR, Video Object Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Rajasegaran, Jathushan; Khan, Salman; Hayat, Munawar; Khan, Fahad Shahbaz; Shah, Mubarak
iTAML: An Incremental Task-Agnostic Meta-learning Approach Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2020.
Abstract | Tags: CVPR, Meta-Learning | Links:
@conference{Khan2020,
title = {iTAML: An Incremental Task-Agnostic Meta-learning Approach},
author = {Jathushan Rajasegaran and Salman Khan and Munawar Hayat and Fahad Shahbaz Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/04/iTAML.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2020/04/iTAML-Supplementary.pdf},
year = {2020},
date = {2020-06-14},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
abstract = {Humans can continuously learn new knowledge as their experience grows. In contrast, previous learning in deep neural networks can quickly fade out when they are trained on a new task. In this paper, we hypothesize this problem can be avoided by learning a set of generalized parameters, that are neither specific to old nor new tasks. In this pursuit, we introduce a novel meta-learning approach that seeks to maintain an equilibrium between all the encountered tasks. This is ensured by a new meta-update rule which avoids catastrophic forgetting. In comparison to previous meta-learning techniques, our approach is task-agnostic. When presented with a continuum of data, our model automatically identifies the task and quickly adapts to it with just a single update. We perform extensive experiments on five datasets in a class-incremental setting, leading to significant improvements over the state of the art methods (e.g., a 21.3% boost on CIFAR100 with 10 incremental tasks). Specifically, on large-scale datasets that generally prove difficult cases for incremental learning, our approach delivers absolute gains as high as 19.1% and 7.4% on ImageNet and MS-Celeb datasets, respectively. Our codes are available at: https://github.com/brjathu/iTAML. },
keywords = {CVPR, Meta-Learning},
pubstate = {published},
tppubtype = {conference}
}
Palazzo, Simone; Spampinato, Concetto; Kavasidis, Isaak; Giordano, Daniela; Schmidt, Joseph; Shah, Mubarak
Decoding Brain Representations by Multimodal Learning of Neural Activity and Visual Features Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, pp. 1 - 1, 2020.
Tags: EEG/Brain, Medical | Links:
@article{Palazzo2020,
title = {Decoding Brain Representations by Multimodal Learning of Neural Activity and Visual Features},
author = {Simone Palazzo and Concetto Spampinato and Isaak Kavasidis and Daniela Giordano and Joseph Schmidt and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/06/Publications_Decoding-Brain-Representations-by-Multimodal-Learning-of-Neural-Activity-and-Visual-Features.pdf},
year = {2020},
date = {2020-05-20},
urldate = {2020-05-20},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
pages = {1 - 1},
keywords = {EEG/Brain, Medical},
pubstate = {published},
tppubtype = {article}
}
Sharghi, Aidean; da Vitoria Lobo, Niels; Shah, Mubarak
Text Synopsis Generation for Egocentric Videos Conference
International Conference on Pattern Recognition, 2020.
Tags: Egocentric, ICPR, Text Synopsis, Video Summarization | Links:
@conference{Sharghi2020,
title = {Text Synopsis Generation for Egocentric Videos},
author = {Aidean Sharghi and Niels da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Publications_Text-Synopsis-Generation-for-Egocentric-Videos.pdf
https://youtu.be/Z2Rpy7MC7QI},
year = {2020},
date = {2020-05-08},
booktitle = {International Conference on Pattern Recognition},
keywords = {Egocentric, ICPR, Text Synopsis, Video Summarization},
pubstate = {published},
tppubtype = {conference}
}
Rizve, Mamshad Nayeem; Demir, Ugur; Tirupattur, Praveen; Rana, Aayush; Duarte, Kevin; Dave, Ishan; Rawat, Yogesh Singh; Shah, Mubarak
Gabriella: An Online System for Real-Time Activity Detection in Untrimmed Security Videos Conference
25th International Conference on Pattern Recognition, Italy, 10-15 January 2021 (ICPR 2020), 2020.
Tags: Activity Detection, IARPA DIVA, ICPR | Links:
@conference{Rizve2020,
title = {Gabriella: An Online System for Real-Time Activity Detection in Untrimmed Security Videos},
author = {Mamshad Nayeem Rizve and Ugur Demir and Praveen Tirupattur and Aayush Rana and Kevin Duarte and Ishan Dave and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Gabriella.pdf
https://www.crcv.ucf.edu/research/projects/gabriella-an-online-system-for-real-time-activity-detection-in-untrimmed-security-videos/},
year = {2020},
date = {2020-04-23},
urldate = {2020-04-23},
booktitle = {25th International Conference on Pattern Recognition, Italy, 10-15 January 2021 (ICPR 2020)},
keywords = {Activity Detection, IARPA DIVA, ICPR},
pubstate = {published},
tppubtype = {conference}
}
Zaeemzadeh, Alireza; Rahnavard, Nazanin; Shah, Mubarak
Norm-Preservation: Why Residual Networks Can Become Extremely Deep? Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, 2020.
Tags: Deep Learning, RESNET | Links:
@article{Zaeemzadeh2020,
title = {Norm-Preservation: Why Residual Networks Can Become Extremely Deep?},
author = {Alireza Zaeemzadeh and Nazanin Rahnavard and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Norm-Preservation.pdf},
year = {2020},
date = {2020-04-19},
urldate = {2020-04-19},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Deep Learning, RESNET},
pubstate = {published},
tppubtype = {article}
}
Gupta, Rohit; Shah, Mubarak
RescueNet: Joint Building Segmentation and Damage Assessment from Satellite Imagery Conference
International Conference on Pattern Recognition, 2020.
Tags: Geo-Localization, ICPR, Segmentation | Links:
@conference{Gupta2020,
title = {RescueNet: Joint Building Segmentation and Damage Assessment from Satellite Imagery},
author = {Rohit Gupta and Mubarak Shah},
url = {https://arxiv.org/pdf/2004.07312.pdf},
year = {2020},
date = {2020-04-15},
booktitle = {International Conference on Pattern Recognition},
keywords = {Geo-Localization, ICPR, Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Kalemaki, M. S.; Karantanas, A. H.; Exarchos, D.; Detorakis, E. T.; Zoras, O.; Marias, K.; Millo, Corina; Bagci, Ulas; Pallikaris, I.; Stratis, A.; Karatzanis, I.; Perisinakis, K.; Koutentakis, P.; Kontadakis, G. A.; Spandidos, D. A.; Tsatsakis, A.; Papadakis, Georgios Z.
PET/CT and PET/MRI in Ophthalmic Oncology Journal Article
In: International Journal of Oncology, 2020.
Tags: Medical | Links:
@article{MS2020,
title = {PET/CT and PET/MRI in Ophthalmic Oncology},
author = {Kalemaki MS and Karantanas AH and Exarchos D and Detorakis ET and Zoras O and Marias K and Corina Millo and Ulas Bagci and Pallikaris I and Stratis A and Karatzanis I and Perisinakis K and Koutentakis P and Kontadakis GA and Spandidos DA and Tsatsakis A and Georgios Z. Papadakis},
url = {https://doi.org/10.3892/ijo.2020.4955},
year = {2020},
date = {2020-03-09},
journal = {International Journal of Oncology},
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
LaLonde, Rodney; Kandel, P.; Spampinato, Concetto; Wallace, M. B.; Bagci, Ulas
Diagnosing Colorectal Polyps in the Wild with Capsule Networks Journal Article
In: IEEE ISBI , 2020.
Tags: Medical | Links:
@article{LaLonde2020,
title = {Diagnosing Colorectal Polyps in the Wild with Capsule Networks},
author = {Rodney LaLonde and Kandel, P. and Concetto Spampinato and M.B. Wallace and Ulas Bagci },
url = {https://github.com/lalonderodney/D-Caps},
year = {2020},
date = {2020-03-02},
journal = {IEEE ISBI },
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
Chen, Chen; Surette, Ray; Shah, Mubarak
Automated monitoring for security camera networks: promise from computer vision labs Journal Article
In: Security Journal, 2020.
Tags: Deep Learning | Links:
@article{Chen2020,
title = {Automated monitoring for security camera networks: promise from computer vision labs},
author = {Chen Chen and Ray Surette and Mubarak Shah
},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/08/Publications_Automated-monitoring-for-security-camera-networks.pdf},
year = {2020},
date = {2020-02-17},
journal = {Security Journal},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Edraki, Marzieh; Rahnavard, Nazanin; Shah, Mubarak
Subspace Capsule Network Conference
34th AAAI Conference on Artificial Intelligence (AAAI 2020), New York, USA, 2020.
Tags: Capsule Networks | Links:
@conference{Edraki2019,
title = {Subspace Capsule Network},
author = {Marzieh Edraki and Nazanin Rahnavard and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/03/Projects_SubspaceCapsuleNetwork.pdf
https://www.crcv.ucf.edu/research/projects/subspace-capsule-network/},
year = {2020},
date = {2020-02-07},
booktitle = {34th AAAI Conference on Artificial Intelligence (AAAI 2020), New York, USA},
keywords = {Capsule Networks},
pubstate = {published},
tppubtype = {conference}
}
Masoodi, S.; Razi, A.; Wright, C.; Gatlin, J.; Bagci, Ulas
Instance-level Microtubule Tracking Journal Article
In: IEEE Transactions on Medical Imaging, 2020.
Tags: Medical | Links:
@article{Masoodi2019,
title = {Instance-level Microtubule Tracking},
author = {Masoodi, S. and Razi, A. and Wright, C. and Gatlin, J. and Ulas Bagci},
url = {https://doi.org/10.1109/TMI.2019.2963865},
year = {2020},
date = {2020-01-03},
journal = {IEEE Transactions on Medical Imaging},
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
Quintanilla, Erik; Rawat, Yogesh Singh; Sakryukin, Andrey; Shah, Mubarak; Kankanhalli, Mohan
Adversarial Learning for Personalized Tag Recommendation Journal Article
In: IEEE Transactions on Multimedia (TMM), 2020.
Tags: REU | Links:
@article{quintanilla2020adversarial,
title = {Adversarial Learning for Personalized Tag Recommendation},
author = {Erik Quintanilla and Yogesh Singh Rawat and Andrey Sakryukin and Mubarak Shah and Mohan Kankanhalli},
url = {https://arxiv.org/pdf/2004.00698.pdf
Code: https://github.com/vyzuer/ALTReco
},
year = {2020},
date = {2020-01-01},
journal = {IEEE Transactions on Multimedia (TMM)},
keywords = {REU},
pubstate = {published},
tppubtype = {article}
}
2019
Tirosh, A.; RaviPrakash, H.; Papadakis, Georgios Z.; Tatsi, C.; Belyavskaya, E.; Chahralampos, L.; Lodish, MB.; Bagci, Ulas; Stratakis, Constantine A.
Computerized Analysis of Brain MR parameters dynamics in young patients with Cushing-Syndrome – a case control study Journal Article
In: The Journal of Clinical Endocrinology and Metabolism, 2019.
Tags: Medical | Links:
@article{Tirosh2019,
title = {Computerized Analysis of Brain MR parameters dynamics in young patients with Cushing-Syndrome – a case control study},
author = {Tirosh, A. and RaviPrakash, H. and Georgios Z. Papadakis and Tatsi, C. and Belyavskaya, E. and Chahralampos, L. and Lodish, MB. and Ulas Bagci and Constantine A. Stratakis},
url = {https://doi.org/10.1210/clinem/dgz303},
year = {2019},
date = {2019-12-30},
journal = {The Journal of Clinical Endocrinology and Metabolism},
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
Hoogenboom, S.; Bagci, Ulas; Wallace, M. B.
AI in Gastroenterology. Current State of Play and Potential. How will it affect our practice and when? Journal Article
In: Techniques in Gastrointestinal Endoscopy, 150634, 2019.
Tags: Medical | Links:
@article{Hoogenboom2019,
title = {AI in Gastroenterology. Current State of Play and Potential. How will it affect our practice and when? },
author = {Hoogenboom, S. and Ulas Bagci and M.B. Wallace},
url = {https://doi.org/10.1016/j.tgie.2019.150634},
year = {2019},
date = {2019-12-29},
journal = {Techniques in Gastrointestinal Endoscopy, 150634},
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
Stember, JN.; Celik, Haydar; Krupinski, E.; Chang, P.; Mutasa, S.; Wood, Bradford; Lignelli, A.; Moonis, G.; Jambawalikar, S.; Bagci, Ulas
Eye-Tracking for Deep Learning Segmentation Using Convolutional Neural Networks: a proof-of-principle application to meningiomas Journal Article
In: Journal of Digital Imaging, 2019.
Tags: Medical | Links:
@article{Stember2019,
title = {Eye-Tracking for Deep Learning Segmentation Using Convolutional Neural Networks: a proof-of-principle application to meningiomas},
author = {Stember, JN. and Haydar Celik and Krupinski, E. and Chang, P. and Mutasa, S. and Bradford Wood and Lignelli, A. and Moonis, G. and S. Jambawalikar and Ulas Bagci
},
url = {https://doi.org/10.1007/s10278-019-00220-4},
year = {2019},
date = {2019-12-28},
journal = {Journal of Digital Imaging},
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
Karaaslan, E.; Bagci, Ulas; Catbas, F. N.
Artificial Intelligence Assisted Infrastructure Assessment Using Mixed Reality Systems Journal Article
In: Journal of Transportation Research, 2019.
Tags: Medical | Links:
@article{Karaaslan2019,
title = {Artificial Intelligence Assisted Infrastructure Assessment Using Mixed Reality Systems},
author = {Karaaslan, E. and Ulas Bagci and Catbas, F.N.
},
url = {https://doi.org/10.1177%2F0361198119839988},
year = {2019},
date = {2019-12-27},
journal = {Journal of Transportation Research},
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
Torosdagli, N.; Liberton, Denise; Verma, Payal; Sincan, Murat; Lee, Janice; Bagci, Ulas
Deep Geodesic Learning for Segmentation and Anatomical Landmarking Journal Article
In: IEEE Transactions on Medical Imaging, 2019.
Tags: Medical | Links:
@article{Torosdagli2019,
title = {Deep Geodesic Learning for Segmentation and Anatomical Landmarking},
author = {N. Torosdagli and Denise Liberton and Payal Verma and Murat Sincan and Janice Lee and Ulas Bagci
},
url = {https://doi.org/10.1109/TMI.2018.2875814},
year = {2019},
date = {2019-12-25},
journal = {IEEE Transactions on Medical Imaging},
keywords = {Medical},
pubstate = {published},
tppubtype = {article}
}
LaLonde, Rodney; Tanner, Irene; Nikiforaki, K.; Papadakis, Georgios Z.; Kandel, P.; Bolan, CW; Wallace, M. B.; Bagci, Ulas
INN: Inflated Neural Networks for IPMN Diagnosis Conference
MICCAI, 2019.
Tags: Medical | Links:
@conference{LaLonde2019,
title = {INN: Inflated Neural Networks for IPMN Diagnosis},
author = {Rodney LaLonde and Irene Tanner and Nikiforaki, K. and Georgios Z. Papadakis and Kandel, P. and Bolan, CW and M.B. Wallace and Ulas Bagci},
url = {https://doi.org/10.1007/978-3-030-32254-0_12},
year = {2019},
date = {2019-12-20},
booktitle = {MICCAI},
keywords = {Medical},
pubstate = {published},
tppubtype = {conference}
}
Khosravan, Naji; Mortazi, Aliasghar; Wallace, M. B.; Bagci, Ulas
PAN: Projective Adversarial Network for Medical Image Segmentation Conference
MICCAI, 2019.
Tags: Medical | Links:
@conference{Khosravan2019,
title = {PAN: Projective Adversarial Network for Medical Image Segmentation},
author = {Naji Khosravan and Aliasghar Mortazi and M.B. Wallace and Ulas Bagci
},
url = {https://doi.org/10.1007/978-3-030-32226-7_8},
year = {2019},
date = {2019-12-19},
booktitle = {MICCAI},
keywords = {Medical},
pubstate = {published},
tppubtype = {conference}
}
Mortazi, Aliasghar; Khosravan, Naji; Torigian, DA; Kurugol, S.; Bagci, Ulas
Weakly Supervised Segmentation by A Deep Geodesic Prior Conference
MICCAI 2019-MLMI, 2019.
Tags: Medical | Links:
@conference{Mortazi2019,
title = {Weakly Supervised Segmentation by A Deep Geodesic Prior},
author = {Aliasghar Mortazi and Naji Khosravan and Torigian, DA and Kurugol, S. and Ulas Bagci
},
url = {https://doi.org/10.1007/978-3-030-32692-0_28},
year = {2019},
date = {2019-12-17},
booktitle = {MICCAI 2019-MLMI},
keywords = {Medical},
pubstate = {published},
tppubtype = {conference}
}
Liu, Y.; Khosravan, Naji; Liu, Y.; Stember, J.; Bagci, Ulas; Jambawalikar, S.
Cross-modality Knowledge Transfer for Prostate Segmentation from CT Scans Conference
MICCAI 2019-DART, 2019.
Tags: Medical | Links:
@conference{Liu2019,
title = {Cross-modality Knowledge Transfer for Prostate Segmentation from CT Scans},
author = {Liu, Y. and Naji Khosravan and Liu, Y. and Stember, J. and Ulas Bagci and S. Jambawalikar},
url = {https://doi.org/10.1007/978-3-030-33391-1_8},
year = {2019},
date = {2019-12-15},
booktitle = {MICCAI 2019-DART},
keywords = {Medical},
pubstate = {published},
tppubtype = {conference}
}
Anwar, S.; Tooba, A.; Rafique, K.; RaviPrakash, H.; Mohy-ud-din, H.; Bagci, Ulas
A Survey on Recent Advancements for AI-Enabled Radiomics in Neuro-Oncology Conference
MICCAI 2019-RNO-AI, 2019.
Tags: Medical | Links:
@conference{Anwar2019,
title = {A Survey on Recent Advancements for AI-Enabled Radiomics in Neuro-Oncology},
author = {Anwar, S. and Tooba, A. and Rafique, K. and RaviPrakash, H. and Mohy-ud-din, H. and Ulas Bagci},
url = {https://doi.org/10.1007/978-3-030-40124-5_3},
year = {2019},
date = {2019-12-10},
booktitle = {MICCAI 2019-RNO-AI},
keywords = {Medical},
pubstate = {published},
tppubtype = {conference}
}
Mazaheri, Amir
Video Content Understanding Using Text PhD Thesis
University of Central Florida, 2019.
@phdthesis{Mazaheri2019b,
title = {Video Content Understanding Using Text},
author = {Amir Mazaheri},
url = {https://stars.library.ucf.edu/etd2020/99/},
year = {2019},
date = {2019-12-03},
urldate = {2019-12-02},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Khosravan, Naji
Collaborative Artificial Intelligence Algorithms For Medical Imaging Applications PhD Thesis
University of Central Florida, 2019.
@phdthesis{nokey,
title = {Collaborative Artificial Intelligence Algorithms For Medical Imaging Applications},
author = {Naji Khosravan},
url = {https://stars.library.ucf.edu/etd/6877/},
year = {2019},
date = {2019-12-02},
urldate = {2019-12-01},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Sharghi, Aidean
Visual-Textual Video Synopsis Generation PhD Thesis
University of Central Florida, 2019.
@phdthesis{nokey,
title = {Visual-Textual Video Synopsis Generation},
author = {Aidean Sharghi},
url = {https://stars.library.ucf.edu/etd/6716/},
year = {2019},
date = {2019-12-01},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Mortazi, Aliasghar
Optimization Algorithms for Deep Learning Based Medical Image Segmentations PhD Thesis
University of Central Florida, 2019.
@phdthesis{Mortazi2019b,
title = {Optimization Algorithms for Deep Learning Based Medical Image Segmentations},
author = {Aliasghar Mortazi},
url = {https://stars.library.ucf.edu/etd/6715/},
year = {2019},
date = {2019-12-01},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Khodadadeh, Siavash; Bölöni, Ladislau; Shah, Mubarak
Unsupervised Meta-Learning for Few-Shot Image Classification Conference
33rd Conference on Neural Information Processing Systems (NeurIPS 2019), Vancouver, Canada, 2019.
Tags: Meta-Learning | Links:
@conference{Khodadadeh2019,
title = {Unsupervised Meta-Learning for Few-Shot Image Classification},
author = {Siavash Khodadadeh and Ladislau Bölöni and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/01/Publications_unsupervised-meta-learning-for-few-shot-image-classification.pdf
https://www.crcv.ucf.edu/research/projects/unsupervised-meta-learning-for-few-shot-image-and-video-classification/},
year = {2019},
date = {2019-11-30},
booktitle = {33rd Conference on Neural Information Processing Systems (NeurIPS 2019), Vancouver, Canada},
journal = {33rd Conference on Neural Information Processing Systems (NeurIPS 2019), Vancouver, Canada},
keywords = {Meta-Learning},
pubstate = {published},
tppubtype = {conference}
}
Kalayeh, Mahdi M.; Shah, Mubarak
On Symbiosis of Attribute Prediction and Semantic Segmentation Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, Pages 1-1, DOI: 10.1109/TPAMI.2019.2956039, 2019.
Tags: Semantic Segmentation | Links:
@article{Kalayeh2019,
title = {On Symbiosis of Attribute Prediction and Semantic Segmentation},
author = {Mahdi M. Kalayeh and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/01/Publications_On-Symbiosis-of-Attribute-Prediction-and-Semantic-Segmentation.pdf},
year = {2019},
date = {2019-11-26},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence, Pages 1-1, DOI: 10.1109/TPAMI.2019.2956039},
keywords = {Semantic Segmentation},
pubstate = {published},
tppubtype = {article}
}
Sultani, Waqas; Shah, Mubarak
Human Action Recognition in Drone Videos using a Few Aerial Training Examples Conference
Cornell University Library, arXiv:1910.10027v1. [cs.CV], 2019.
Tags: Action Recognition | Links:
@conference{Sultani2019,
title = {Human Action Recognition in Drone Videos using a Few Aerial Training Examples},
author = {Waqas Sultani and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/10/1910.10027v1.pdf},
year = {2019},
date = {2019-11-04},
publisher = {Cornell University Library, arXiv:1910.10027v1. [cs.CV]},
keywords = {Action Recognition},
pubstate = {published},
tppubtype = {conference}
}
Arif, Maliha; Mahalanobis, Abhijit
View Prediction using manifold learning in non-linear feature subspace Proceedings
SPIE Symposium on Multispectral Image Processing and Pattern Recognition, 2019.
Tags: Deep Learning | Links:
@proceedings{Maliha2019,
title = {View Prediction using manifold learning in non-linear feature subspace},
author = {Maliha Arif and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Final_Ours_SPIE_submission_version2.pdf},
doi = {10.1117/12.2539521},
year = {2019},
date = {2019-11-01},
publisher = {SPIE Symposium on Multispectral Image Processing and Pattern Recognition},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {proceedings}
}
Aafaq, Nayyer; Mian, Ajmal; Liu, Wei; Gilani, Syed Zulqarnain; Shah, Mubarak
Video Description: A Survey of Methods, Datasets, and Evaluation Metrics Journal Article
In: ACM Comput. Surv. 52, 6, Article 115 (October 2019), 37 pages. DOI: https://doi.org/10.1145/3355390, 2019.
Tags: Video Description | Links:
@article{Aafaq2019,
title = {Video Description: A Survey of Methods, Datasets, and Evaluation Metrics},
author = {Nayyer Aafaq and Ajmal Mian and Wei Liu and Syed Zulqarnain Gilani and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/a115-aafaq.pdf
},
year = {2019},
date = {2019-10-31},
journal = {ACM Comput. Surv. 52, 6, Article 115 (October 2019), 37 pages. DOI: https://doi.org/10.1145/3355390},
publisher = {ACM Comput. Surv. 52, 6, Article 115 (October 2019), 37 pages. DOI: https://doi.org/10.1145/3355390},
keywords = {Video Description},
pubstate = {published},
tppubtype = {article}
}
Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
CapsuleVOS: Semi-Supervised Video Object Segmentation Using Capsule Routing Conference
International Conference on Computer Vision (ICCV 2019), Seoul, South Korea, Oct 27-Nov 2, 2019.
Tags: Video Object Segmentation | Links:
@conference{Duarte2019,
title = {CapsuleVOS: Semi-Supervised Video Object Segmentation Using Capsule Routing},
author = {Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Projects_CapsuleVOS-Semi-Supervised-Video-Object-Segmentation-Using-Capsule-Routing.pdf
https://www.crcv.ucf.edu/research/projects/capsulevos-semi-supervised-video-object-segmentation-using-capsule-routing/},
year = {2019},
date = {2019-10-30},
publisher = { International Conference on Computer Vision (ICCV 2019), Seoul, South Korea, Oct 27-Nov 2},
keywords = {Video Object Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Alemu, Leulseged Tesfaye; Pelillo, Marcello; Shah, Mubarak
Deep Constrained Dominant Sets for Person Re-Identification Conference
International Conference on Computer Vision (ICCV 2019), Seoul, South Korea, Oct 27-Nov 2, 2019.
Tags: Re-Identification | Links:
@conference{Alemu2019,
title = {Deep Constrained Dominant Sets for Person Re-Identification},
author = {Leulseged Tesfaye Alemu and Marcello Pelillo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/08/Publications_Deep-Constrained-Dominant-Sets-for-Person-Re-Identification.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2019/08/Publications_Deep-Constrained-Dominant-Sets-for-Person-Re-Identification_Supplementary.pdf
https://www.crcv.ucf.edu/research/projects/deep-constrained-dominant-sets-for-person-re-identification/},
year = {2019},
date = {2019-10-28},
publisher = {International Conference on Computer Vision (ICCV 2019), Seoul, South Korea, Oct 27-Nov 2},
keywords = {Re-Identification},
pubstate = {published},
tppubtype = {conference}
}
Regmi, Krishna; Shah, Mubarak
Bridging the Domain Gap for Ground-to-Aerial Image Matching Conference
International Conference on Computer Vision (ICCV 2019), Seoul, South Korea, Oct 27-Nov 2, 2019.
Tags: Cross-View Image Retrieval, Geo-Localization | Links:
@conference{Regmi2019,
title = { Bridging the Domain Gap for Ground-to-Aerial Image Matching},
author = {Krishna Regmi and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/02/Publications_Bridging-the-Domain-Gap-for-Ground-to-Aerial-Image-Matching_Final.pdf
https://www.crcv.ucf.edu/research/projects/bridging-the-domain-gap-for-ground-to-aerial-image-matching/
},
year = {2019},
date = {2019-10-27},
publisher = {International Conference on Computer Vision (ICCV 2019), Seoul, South Korea, Oct 27-Nov 2},
keywords = {Cross-View Image Retrieval, Geo-Localization},
pubstate = {published},
tppubtype = {conference}
}
Spampinato, Concetto; Palazzo, Simone; D’Oro, P.; Giordano, Daniela; Shah, Mubarak
Adversarial Framework for Unsupervised Learning of Motion Dynamics in Videos Journal Article
In: International Journal of Computer Vision, 1-20, 2019.
Tags: Unsupervised Learning | Links:
@article{Spampinato2019,
title = {Adversarial Framework for Unsupervised Learning of Motion Dynamics in Videos},
author = {Concetto Spampinato and Simone Palazzo and P. D’Oro and Daniela Giordano and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Publications_Adversarial-Framework-for-Unsupervised-Learning-of-Motion-Dynamics-in-Videos.pdf},
year = {2019},
date = {2019-10-08},
journal = {International Journal of Computer Vision, 1-20},
keywords = {Unsupervised Learning},
pubstate = {published},
tppubtype = {article}
}
Hou, Rui; Chen, Chen; Sukthankar, Rahul; Shah, Mubarak
An Efficient 3D CNN for Action/Object Segmentation in Video Conference
British Machine Vision Conference (BMVC 2019), UK, Sep 9-10, 2019.
Tags: BMVC, Video Object Segmentation | Links:
@conference{Hou2019,
title = { An Efficient 3D CNN for Action/Object Segmentation in Video},
author = {Rui Hou and Chen Chen and Rahul Sukthankar and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/07/Publications_An-Efficient-3D-CNN-for-Action-Object-Segmentation-in-Video.pdf
https://www.crcv.ucf.edu/research/projects/an-efficient-3d-cnn-for-action-object-segmentation-in-video/},
year = {2019},
date = {2019-09-10},
publisher = {British Machine Vision Conference (BMVC 2019), UK, Sep 9-10},
keywords = {BMVC, Video Object Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Tesfaye, Yonatan Tariku; Zemene, Eyasu; Prati, Andrea; Pelillo, Marcello; Shah, Mubarak
Multi-target tracking in multiple non-overlapping cameras using constrained dominant sets Journal Article
In: International Journal for Computer Vision (IJCV), September 2019, Volume 127, Issue 9, pp1303-1320., 2019.
Tags: Clustering, Dominant Sets, Reranking, Tracking, Video Re-ID | Links:
@article{Tesfaye2019,
title = { Multi-target tracking in multiple non-overlapping cameras using constrained dominant sets},
author = {Yonatan Tariku Tesfaye and Eyasu Zemene and Andrea Prati and Marcello Pelillo and Mubarak Shah },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/06/Projects_MultiTargetTrackingConstrainedDominantSets.pdf
https://www.crcv.ucf.edu/home/projects/multi-target-tracking-in-non-overlapping-cameras-using-fast-constrained-dominant-sets/},
year = {2019},
date = {2019-09-09},
urldate = {2019-09-09},
journal = {International Journal for Computer Vision (IJCV), September 2019, Volume 127, Issue 9, pp1303-1320.},
keywords = {Clustering, Dominant Sets, Reranking, Tracking, Video Re-ID},
pubstate = {published},
tppubtype = {article}
}
LaPlace, Cecilia; Khan, Aisha Urooj; Borji, Ali
Segmenting Sky Pixels in Images: Analysis and Comparison Conference
IEEE Winter Conference on Applications of Computer Vision, 2019.
Tags: REU
@conference{LaPlace2019,
title = {Segmenting Sky Pixels in Images: Analysis and Comparison},
author = {Cecilia LaPlace and Aisha Urooj Khan and Ali Borji },
year = {2019},
date = {2019-08-02},
booktitle = {IEEE Winter Conference on Applications of Computer Vision},
keywords = {REU},
pubstate = {published},
tppubtype = {conference}
}
Hou, Rui
Action Recognition, Temporal Localization and Detection in Trimmed and Untrimmed Video PhD Thesis
University of Central Florida, 2019.
@phdthesis{Hou2019b,
title = {Action Recognition, Temporal Localization and Detection in Trimmed and Untrimmed Video},
author = {Rui Hou},
url = {https://stars.library.ucf.edu/etd/6507/},
year = {2019},
date = {2019-08-01},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Sun, ShiJie; Akhtar, Naveed; Song, HuanSheng; Mian, Ajmal; Shah, Mubarak
Deep Affinity Network for Multiple Object Tracking Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, 2019.
Tags: Tracking | Links:
@article{Sun2019,
title = {Deep Affinity Network for Multiple Object Tracking},
author = {ShiJie Sun and Naveed Akhtar and HuanSheng Song and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Publications_Deep-Affinity-Network-for-Multiple-Object-Tracking.pdf},
year = {2019},
date = {2019-07-19},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Tracking},
pubstate = {published},
tppubtype = {article}
}
Abolghasemi, Pooya; Mazaheri, Amir; Shah, Mubarak; Bölöni, Ladislau
Pay Attention! – Robustifying a Deep Visuomotor Policy Through Task-Focused Visual Attention Conference
Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), 2019.
Tags: Robotics | Links:
@conference{abolghasemi2019payattention,
title = {Pay Attention! – Robustifying a Deep Visuomotor Policy Through Task-Focused Visual Attention},
author = {Pooya Abolghasemi and Amir Mazaheri and Mubarak Shah and Ladislau Bölöni},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Publications_Pay_attention.pdf
https://www.crcv.ucf.edu/home/projects/pay-attention/},
year = {2019},
date = {2019-06-08},
publisher = {Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)},
keywords = {Robotics},
pubstate = {published},
tppubtype = {conference}
}
Zaeemzadeh, Alireza; Joneidi, Mohsen; Rahnavard, Nazanin; Shah, Mubarak
Iterative Projection and Matching: Finding Structure-preserving Representatives and Its Application to Computer Vision Conference
Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), 2019.
Tags: Iterative Projection and Matching | Links:
@conference{zaeemzadeh2019ipm,
title = {Iterative Projection and Matching: Finding Structure-preserving Representatives and Its Application to Computer Vision},
author = {Alireza Zaeemzadeh and Mohsen Joneidi and Nazanin Rahnavard and Mubarak Shah},
url = {https://arxiv.org/pdf/1811.12326.pdf
https://www.crcv.ucf.edu/home/projects/iterative-projection-and-matching/
https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Projects_IterativeProjectionMatching_Bibtex.txt},
year = {2019},
date = {2019-06-07},
publisher = {Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)},
keywords = {Iterative Projection and Matching},
pubstate = {published},
tppubtype = {conference}
}
Khosravan, Naji; Celik, Haydar; Turkbey, Baris; EC, Jones; Wood, Bradford; Bagci, Ulas
A collaborative computer aided diagnosis (C-CAD) system with eye-tracking, sparse attentional model, and deep learning Journal Article
In: Medical image analysis. 2019 Jan 1;51:101-15., 2019.
Tags: Attention, Eye-Tracking, Medical | Links:
@article{N2019,
title = { A collaborative computer aided diagnosis (C-CAD) system with eye-tracking, sparse attentional model, and deep learning},
author = {Naji Khosravan and Haydar Celik and Baris Turkbey and Jones EC and Bradford Wood and Ulas Bagci},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Publications_C-CAD.pdf
},
year = {2019},
date = {2019-06-06},
journal = {Medical image analysis. 2019 Jan 1;51:101-15.},
keywords = {Attention, Eye-Tracking, Medical},
pubstate = {published},
tppubtype = {article}
}
Mahalanobis, Abhijit
An overview of some techniques for the detection and recognition of objects in 3D data Proceedings
OSA Imaging and Applied Optics Congress, 2019.
Tags: Deep Learning
@proceedings{Mahalanobis2019,
title = {An overview of some techniques for the detection and recognition of objects in 3D data},
author = {Abhijit Mahalanobis},
year = {2019},
date = {2019-06-01},
publisher = {OSA Imaging and Applied Optics Congress},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {proceedings}
}
Mahalanobis, Abhijit; McIntosh, Bruce
A comparison of target detection algorithms using DSIAC ATR algorithm development data set Proceedings
Automatic Target Recognition XXIX, vol. 10988, 2019.
Tags: Deep Learning
@proceedings{Mahalanobis2019b,
title = {A comparison of target detection algorithms using DSIAC ATR algorithm development data set},
author = {Abhijit Mahalanobis and Bruce McIntosh},
year = {2019},
date = {2019-05-14},
volume = {10988},
publisher = {Automatic Target Recognition XXIX},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {proceedings}
}
Tanner, Irene; Mahalanobis, Abhijit
Fundamentals of target classification using deep learning Proceedings
Automatic Target Recognition XXIX, vol. 10988, 2019.
Tags: Deep Learning, REU
@proceedings{Tanner2019,
title = {Fundamentals of target classification using deep learning},
author = {Irene Tanner and Abhijit Mahalanobis},
year = {2019},
date = {2019-05-14},
urldate = {2019-05-14},
volume = {10988},
publisher = {Automatic Target Recognition XXIX},
keywords = {Deep Learning, REU},
pubstate = {published},
tppubtype = {proceedings}
}
Mahmoudkalayeh, Mahdi
Describing Images by Semantic Modeling Using Attributes and Tags PhD Thesis
University of Central Florida, 2019.
@phdthesis{Mahmoudkalayeh2019,
title = {Describing Images by Semantic Modeling Using Attributes and Tags},
author = {Mahdi Mahmoudkalayeh},
url = {https://stars.library.ucf.edu/etd/6296/},
year = {2019},
date = {2019-05-01},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Irmakci, I.; Hussein, Sarfaraz; Savran, A.; Kalyani, R. R.; Reiter, D.; Chia, C. W.; Fishbein, K. W.; Spencer, R. G.; Ferrucci, L.; Bagci, Ulas
A Novel Extension to Fuzzy Connectivity for Body Composition Analysis: Applications in Thigh, Brain, and Whole Body Tissue Segmentation Journal Article
In: IEEE Transactions on Biomedical Engineering. 2019 Apr;66(4):1069-81., 2019.
Tags: Medical, Segmentation | Links:
@article{I2019d,
title = { A Novel Extension to Fuzzy Connectivity for Body Composition Analysis: Applications in Thigh, Brain, and Whole Body Tissue Segmentation},
author = {Irmakci I and Sarfaraz Hussein and Savran A and Kalyani RR and Reiter D and Chia CW and Fishbein KW and Spencer RG and Ferrucci L and Ulas Bagci},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Publications_NovelExtensionToFuzzyConnectivity.pdf
},
year = {2019},
date = {2019-04-30},
journal = { IEEE Transactions on Biomedical Engineering. 2019 Apr;66(4):1069-81.},
keywords = {Medical, Segmentation},
pubstate = {published},
tppubtype = {article}
}
Bagci, Ulas; Hussein, Sarfaraz
System and method for image-based quantification of white and brown adipose tissue at the whole-body, organ and body-region levels Journal Article
In: United States patent application US 10/157,462. 2018 Dec 18, 2019.
Tags: Medical, Segmentation | Links:
@article{U2018,
title = { System and method for image-based quantification of white and brown adipose tissue at the whole-body, organ and body-region levels},
author = {Ulas Bagci and Sarfaraz Hussein},
editor = {University of Central Florida Research Foundation Inc (UCFRF) and assignee},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Publications_System-and-method-for-image-based-quantification-of-white-and-brown-adipose-tissue-at-the-whole-body-organ-and-body-region-levels.pdf},
year = {2019},
date = {2019-04-29},
journal = {United States patent application US 10/157,462. 2018 Dec 18},
keywords = {Medical, Segmentation},
pubstate = {published},
tppubtype = {article}
}
Hussein, Sarfaraz; Kandel, P.; Bolan, C. W.; Wallace, M. B.; Bagci, Ulas
Lung and pancreatic tumor characterization in the deep learning era: novel supervised and unsupervised learning approaches Journal Article
In: IEEE Transactions on Medical Imaging, 2019.
Tags: Classification, Medical | Links:
@article{S2019,
title = {Lung and pancreatic tumor characterization in the deep learning era: novel supervised and unsupervised learning approaches},
author = {Sarfaraz Hussein and P. Kandel and C. W. Bolan and M. B. Wallace and Ulas Bagci},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Publications_Lung-and-Pancreatic-Tumor-Characterization-in-the-Deep-Learning-.pdf},
year = {2019},
date = {2019-04-28},
journal = {IEEE Transactions on Medical Imaging},
keywords = {Classification, Medical},
pubstate = {published},
tppubtype = {article}
}
Bogunovic, H.; Venhuizen, F.; Klimscha, S.; Apostolopoulos, S.; Bab-Hadiashar, A.; Bagci, Ulas; Beg, M. F.; Bekalo, L.; Chen, Q.; Ciller, C.; Gopinath, K.
RETOUCH-The Retinal OCT Fluid Detection and Segmentation Benchmark and Challenge Journal Article
In: IEEE Transactions on Medical Imaging, 2019.
Tags: Detection, Segmentation | Links:
@article{H2019b,
title = {RETOUCH-The Retinal OCT Fluid Detection and Segmentation Benchmark and Challenge},
author = {H. Bogunovic and F. Venhuizen and S. Klimscha and S. Apostolopoulos and A. Bab-Hadiashar and Ulas Bagci and M. F. Beg and L. Bekalo and Q. Chen and C. Ciller and K. Gopinath},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Publications_RETOUCH.pdf},
year = {2019},
date = {2019-04-27},
journal = {IEEE Transactions on Medical Imaging},
keywords = {Detection, Segmentation},
pubstate = {published},
tppubtype = {article}
}
Vaca-Castano, Gonzalo; da Vitoria Lobo, Niels; Shah, Mubarak
Holistic Object Detection and Image Understanding Journal Article
In: Computer Vision and Image Understanding, vol. 181, pp. 1-13, 2019.
@article{nokey,
title = {Holistic Object Detection and Image Understanding},
author = {Gonzalo Vaca-Castano and Niels da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Publications_Holistic-object-detection-and-image-understanding.pdf
https://www.crcv.ucf.edu/research/projects/holistic-object-detection-and-image-understanding/},
doi = {https://doi.org/10.1016/j.cviu.2019.02.006},
year = {2019},
date = {2019-04-01},
urldate = {2019-04-01},
journal = {Computer Vision and Image Understanding},
volume = {181},
pages = {1-13},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Vinsard, D. G.; Mori, Y.; Misawa, M.; Kudo, S. E.; Rastogi, A.; Bagci, Ulas; Rex, D. K.; Wallace, M. B.
Quality Assurance of Computer-Aided Detection and Diagnosis in Colonoscopy Journal Article
In: Gastrointestinal Endoscopy, 2019.
Tags: Detection, Medical | Links:
@article{DG2019,
title = {Quality Assurance of Computer-Aided Detection and Diagnosis in Colonoscopy},
author = {D. G. Vinsard and Y. Mori and M. Misawa and S. E. Kudo and A. Rastogi and Ulas Bagci and D. K. Rex and M. B. Wallace},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Publications_Quality-Assurance-of-Computer-Aided-Detection-and-Diagnosis-in-Colonoscopy.pdf},
year = {2019},
date = {2019-03-26},
journal = {Gastrointestinal Endoscopy},
keywords = {Detection, Medical},
pubstate = {published},
tppubtype = {article}
}
Rawat, Yogesh Singh; Shah, Mubarak; Kankanhalli, Mohan
Photography and Exploration of Tourist Locations Based on Optimal Foraging Theory Journal Article
In: IEEE Transactions on Circuits and Systems for Video Technology, 2019.
@article{rawat2019photography,
title = {Photography and Exploration of Tourist Locations Based on Optimal Foraging Theory},
author = {Yogesh Singh Rawat and Mubarak Shah and Mohan Kankanhalli},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2019/09/NsfProjects_BigData_Photography-and-Exploration-of-Tourist-Locations-Based-on-Optimal-Foraging-Theory.pdf
https://github.com/vyzuer/foraging_theory},
year = {2019},
date = {2019-01-01},
journal = {IEEE Transactions on Circuits and Systems for Video Technology},
publisher = {IEEE},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Lobo, Niels Da Vitoria; Shah, Mubarak
UCF's 30-Year REU Site in Computer Vision Journal Article
In: Communications of the ACM, vol. 62, no. 1, pp. 31-34, January 2019.
@article{Lobo2019,
title = {UCF's 30-Year REU Site in Computer Vision},
author = {Niels Da Vitoria Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Publications_UCF30YearREUSiteinComputerVision.pdf},
year = {2019},
date = {2019-01-01},
journal = {Communications of the ACM, vol. 62, no. 1, pp. 31-34, January 2019},
keywords = {NSF, REU},
pubstate = {published},
tppubtype = {article}
}
Rana, Aayush; Tirupattur, Praveen; Rizve, Mamshad Nayeem; Duarte, Kevin; Demir, Ugur; Rawat, Yogesh Singh; Shah, Mubarak
An Online System for Real-Time Activity Detection in Untrimmed Surveillance Videos Journal Article
In: 2019.
Tags:
@article{ranaonline,
title = {An Online System for Real-Time Activity Detection in Untrimmed Surveillance Videos},
author = {Aayush Rana and Praveen Tirupattur and Mamshad Nayeem Rizve and Kevin Duarte and Ugur Demir and Yogesh Singh Rawat and Mubarak Shah},
year = {2019},
date = {2019-00-00},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2018
Tian, Yicong
Human Action Detection, Tracking and Segmentation in Videos PhD Thesis
University of Central Florida, 2018.
@phdthesis{Tian2018b,
title = {Human Action Detection, Tracking and Segmentation in Videos},
author = {Yicong Tian},
url = {https://stars.library.ucf.edu/etd/6159/},
year = {2018},
date = {2018-12-31},
urldate = {2018-12-31},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Ardeshir, Shervin
Relating First-person and Third-person Vision PhD Thesis
University of Central Florida, 2018.
@phdthesis{Ardeshir0000,
title = {Relating First-person and Third-person Vision},
author = {Shervin Ardeshir},
url = {https://stars.library.ucf.edu/cgi/viewcontent.cgi?article=6960&context=etd},
year = {2018},
date = {2018-08-02},
urldate = {2018-08-01},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Hussein, Sarfaraz
Learning Algorithms for Fat Quantification and Tumor Characterization PhD Thesis
University of Central Florida, 2018.
@phdthesis{Hussein0000,
title = {Learning Algorithms for Fat Quantification and Tumor Characterization},
author = {Sarfaraz Hussein},
url = {https://www.crcv.ucf.edu/papers/theses/Hussein},
year = {2018},
date = {2018-08-01},
urldate = {2018-08-01},
school = {University of Central Florida},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Chuquicusma, Maria Mosquera; Hussein, Sarfaraz; Bagci, Ulas
How to Fool Radiologists with Generative Adversarial Networks? A Visual Turing Test for Lung Cancer Diagnosis Conference
IEEE 15th International Symposium on Biomedical Imaging (ISBI 2018), 2018.
Tags: REU
@conference{Chuquicusma2018,
title = {How to Fool Radiologists with Generative Adversarial Networks? A Visual Turing Test for Lung Cancer Diagnosis},
author = {Maria Mosquera Chuquicusma and Sarfaraz Hussein and Ulas Bagci},
year = {2018},
date = {2018-08-01},
booktitle = {IEEE 15th International Symposium on Biomedical Imaging (ISBI 2018)},
keywords = {REU},
pubstate = {published},
tppubtype = {conference}
}
Mazaheri, Amir; Gong, Boqing; Shah, Mubarak
Learning a Multi-Concept Video Retrieval Model with Multiple Latent Variables Journal Article
In: ACM Trans. Multimedia Comput. Commun. Appl. 14, 2, Article 46 (April 2018), 21 pages, 2018.
Tags: Video Retrieval | Links:
@article{Mazaheri2019,
title = { Learning a Multi-Concept Video Retrieval Model with Multiple Latent Variables},
author = {Amir Mazaheri and Boqing Gong and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/acm-tomm-14.2.46.pdf},
year = {2018},
date = {2018-04-14},
journal = {ACM Trans. Multimedia Comput. Commun. Appl. 14, 2, Article 46 (April 2018), 21 pages},
keywords = {Video Retrieval},
pubstate = {published},
tppubtype = {article}
}
Tirupattur, Praveen; Rawat, Yogesh Singh; Spampinato, Concetto; Shah, Mubarak
ThoughtViz: Visualizing Human Thoughts Using Generative Adversarial Network Conference
ACM Multimedia 2018, Seoul, Korea, October 22-26, 2018.
Tags: EEG/Brain, Gans | Links:
@conference{Tirupattur2019,
title = { ThoughtViz: Visualizing Human Thoughts Using Generative Adversarial Network},
author = {Praveen Tirupattur and Yogesh Singh Rawat and Concetto Spampinato and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/acmmm18/thoughtviz.pdf},
year = {2018},
date = {2018-04-13},
urldate = {2018-04-13},
journal = {ACM Multimedia 2018, Seoul, Korea, October 22-26, 2018},
publisher = {ACM Multimedia 2018, Seoul, Korea, October 22-26, 2018.},
keywords = {EEG/Brain, Gans},
pubstate = {published},
tppubtype = {conference}
}
Mazaheri, Amir; Shah, Mubarak
Visual Text Correction Conference
Proceedings of the European Conference on Computer Vision (ECCV 2018), Munich, Germany, September 8-14, 2018.
Tags: Video Description | Links:
@conference{Mazaheri2018,
title = {Visual Text Correction},
author = {Amir Mazaheri and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/eccv2018/vtc.pdf
https://amirmazaheri1990.github.io/VTC/},
year = {2018},
date = {2018-04-12},
journal = {Proceedings of the European Conference on Computer Vision (ECCV 2018), Munich, Germany, September 8-14, 2018},
publisher = {Proceedings of the European Conference on Computer Vision (ECCV 2018), Munich, Germany, September 8-14, 2018},
keywords = {Video Description},
pubstate = {published},
tppubtype = {conference}
}
Idrees, Haroon; Tayyab, Muhmmad; Athrey, Kishan; Zhang, Dong; Al-Maadeed, Somaya; Rajpoot, Nasir; Shah, Mubarak
Composition Loss for Counting, Density Map Estimation and Localization in Dense Crowds Conference
European Conference on Computer Vision (ECCV 2018), Munich, Germany, September 8-14, 2018.
Tags: Crowd Counting | Links:
@conference{Idrees2018,
title = { Composition Loss for Counting, Density Map Estimation and Localization in Dense Crowds},
author = {Haroon Idrees and Muhmmad Tayyab and Kishan Athrey and Dong Zhang and Somaya Al-Maadeed and Nasir Rajpoot and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/eccv2018/2324.pdf},
year = {2018},
date = {2018-04-11},
publisher = {European Conference on Computer Vision (ECCV 2018), Munich, Germany, September 8-14, 2018},
keywords = {Crowd Counting},
pubstate = {published},
tppubtype = {conference}
}
Tian, Yicong; Dehghan, Afshin; Shah, Mubarak
"On Detection, Data Association and Segmentation for Multi-target Tracking" in IEEE Transactions on Pattern Analysis and Machine Intelligence Journal Article
In: 2018.
Tags: Tracking | Links:
@article{Tian2018,
title = {"On Detection, Data Association and Segmentation for Multi-target Tracking" in IEEE Transactions on Pattern Analysis and Machine Intelligence},
author = {Yicong Tian and Afshin Dehghan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/08392447.pdf
https://ieeexplore.ieee.org/abstract/document/8392447/},
year = {2018},
date = {2018-04-10},
keywords = {Tracking},
pubstate = {published},
tppubtype = {article}
}
Jamal, M.; Li, H.; Gong, Boqing
Deep Face Detector Adaptation Without Negative Transfer or Catastrophic Forgetting Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018., 2018.
Tags: Detection | Links:
@conference{Jamal2019,
title = {Deep Face Detector Adaptation Without Negative Transfer or Catastrophic Forgetting},
author = {M. Jamal and H. Li and Boqing Gong},
url = {https://www.crcv.ucf.edu/papers/cvpr2018/deep-face.pdf},
year = {2018},
date = {2018-04-09},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018.},
keywords = {Detection},
pubstate = {published},
tppubtype = {conference}
}
LaLonde, Rodney; Zhang, Dong; Shah, Mubarak
ClusterNet: Detecting Small Objects in Large Scenes by Exploiting Spatio-Temporal Information Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018, 2018.
Tags: Detection, Drone Video Analysis, UAV Video Analysis | Links:
@conference{LaLonde2018,
title = {ClusterNet: Detecting Small Objects in Large Scenes by Exploiting Spatio-Temporal Information},
author = {Rodney LaLonde and Dong Zhang and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/cvpr2018/3460Final.pdf
https://www.crcv.ucf.edu/papers/cvpr2018/3460-suppFinal.pdf},
year = {2018},
date = {2018-04-08},
urldate = {2018-04-08},
publisher = { IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018},
keywords = {Detection, Drone Video Analysis, UAV Video Analysis},
pubstate = {published},
tppubtype = {conference}
}
Wang, Tiantian; Zhang, Lihe; Wang, Shuo; Lu, Huchuan; Yang, Gang; Ruan, Xiang; Borji, Ali
Detect Globally, Refine Locally: A Novel Approach to Saliency Detection Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018., 2018.
Tags: Saliency Detection | Links:
@conference{Wang2018,
title = { Detect Globally, Refine Locally: A Novel Approach to Saliency Detection},
author = {Tiantian Wang and Lihe Zhang and Shuo Wang and Huchuan Lu and Gang Yang and Xiang Ruan and Ali Borji},
url = {https://www.crcv.ucf.edu/papers/cvpr2018/camera_ready.pdf},
year = {2018},
date = {2018-04-07},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018.},
keywords = {Saliency Detection},
pubstate = {published},
tppubtype = {conference}
}
Regmi, Krishna; Borji, Ali
Cross-View Image Synthesis Using Conditional GANs Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018, 2018.
Tags: Cross-View Image Synthesis, Geo-Localization | Links:
@conference{Regmi2018,
title = { Cross-View Image Synthesis Using Conditional GANs},
author = {Krishna Regmi and Ali Borji},
url = {https://www.crcv.ucf.edu/papers/cvpr2018/cross-view.pdf
https://www.crcv.ucf.edu/research/projects/cross-view-image-synthesis/},
year = {2018},
date = {2018-04-06},
urldate = {2018-04-06},
publisher = { IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018},
keywords = {Cross-View Image Synthesis, Geo-Localization},
pubstate = {published},
tppubtype = {conference}
}
Khan, Aisha Urooj; Borji, Ali
Analysis of Hand Segmentation in the Wild Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018, 2018.
Tags: Segmentation | Links:
@conference{Urooj2018,
title = { Analysis of Hand Segmentation in the Wild},
author = {Aisha Urooj Khan and Ali Borji},
url = {https://www.crcv.ucf.edu/papers/cvpr2018/hand-segmentation.pdf},
year = {2018},
date = {2018-04-05},
publisher = { IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018},
keywords = {Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Wang, Wenguan; Shen, Jianbing; Guo, Fang; Cheng, Ming-Ming; Borji, Ali
Revisiting Video Saliency: A Large-Scale Benchmark and a New Model Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018., 2018.
Tags: Saliency Detection | Links:
@conference{Wang2018b,
title = { Revisiting Video Saliency: A Large-Scale Benchmark and a New Model},
author = {Wenguan Wang and Jianbing Shen and Fang Guo and Ming-Ming Cheng and Ali Borji},
url = {https://www.crcv.ucf.edu/papers/cvpr2018/revisiting-saliency.pdf},
year = {2018},
date = {2018-04-04},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018.},
keywords = {Saliency Detection},
pubstate = {published},
tppubtype = {conference}
}
Zeng, Yu; Lu, Huchuan; Zhang, Lihe; Feng, Mengyang; Borji, Ali
Learning to Promote Saliency Detectors Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018., 2018.
Tags: Saliency Detection | Links:
@conference{Zeng2018,
title = {Learning to Promote Saliency Detectors},
author = {Yu Zeng and Huchuan Lu and Lihe Zhang and Mengyang Feng and Ali Borji},
url = {https://www.crcv.ucf.edu/papers/cvpr2018/1757.pdf},
year = {2018},
date = {2018-04-03},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018.},
keywords = {Saliency Detection},
pubstate = {published},
tppubtype = {conference}
}
Sultani, Waqas; Chen, Chen; Shah, Mubarak
Real-world Anomaly Detection in Surveillance Videos Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018., 2018.
Tags: Anomaly Detection | Links:
@conference{Sultani2018,
title = {Real-world Anomaly Detection in Surveillance Videos},
author = {Waqas Sultani and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/anomaly_detection.pdf},
year = {2018},
date = {2018-04-03},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018.},
keywords = {Anomaly Detection},
pubstate = {published},
tppubtype = {conference}
}
Zaeemzadeh, Alireza; Rahnavard, Nazanin; Shah, Mubarak
Norm-Preservation: Why Residual Networks Can Become Extremely Deep? Conference
Cornell University Library, 2018.
Tags: Deep Learning | Links:
@conference{Zaeemzadeh2018,
title = {Norm-Preservation: Why Residual Networks Can Become Extremely Deep?},
author = {Alireza Zaeemzadeh and Nazanin Rahnavard and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/arxiv_files/1805.07477.pdf},
year = {2018},
date = {2018-04-02},
publisher = {Cornell University Library},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Duarte, Kevin; Rawat, Yogesh Singh; Shah, Mubarak
VideoCapsuleNet: A Simplified Network for Action Detection Conference
Advances in Neural Information Processing Systems, pp. 7610-7619, 2018.
Tags: Human Action and Activity Recognition | Links:
@conference{duarte2018videocapsulenet,
title = {VideoCapsuleNet: A Simplified Network for Action Detection},
author = {Kevin Duarte and Yogesh Singh Rawat and Mubarak Shah},
url = {https://papers.nips.cc/paper/7988-videocapsulenet-a-simplified-network-for-action-detection.pdf
https://www.crcv.ucf.edu/home/projects/videocapsulenet/
https://www.crcv.ucf.edu/wp-content/uploads/2019/05/Projects_VideoCapsuleNet_Bibtex.txt},
year = {2018},
date = {2018-04-01},
publisher = {Advances in Neural Information Processing Systems, pp. 7610-7619},
keywords = {Human Action and Activity Recognition},
pubstate = {published},
tppubtype = {conference}
}
Kalayeh, Mahdi M.; Basaran, Emrah; Gokmen, Muhittin; Kamasak, Mustafa E.; Shah, Mubarak
Human Semantic Parsing for Person Re-identification Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018, 2018.
Tags: Re-Identification | Links:
@conference{Kalayeh2018,
title = { Human Semantic Parsing for Person Re-identification},
author = {Mahdi M. Kalayeh and Emrah Basaran and Muhittin Gokmen and Mustafa E. Kamasak and Mubarak Shah},
url = {http://openaccess.thecvf.com/content_cvpr_2018/papers/Kalayeh_Human_Semantic_Parsing_CVPR_2018_paper.pdf
https://www.crcv.ucf.edu/papers/cvpr2018/semantic_parsing.tex},
year = {2018},
date = {2018-03-31},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2018), Salt Lake City, UT, June 18-22, 2018},
keywords = {Re-Identification},
pubstate = {published},
tppubtype = {conference}
}
Zemene, Eyasu; Tesfaye, Yonatan Tariku; Idrees, Haroon; Prati, Andrea; Pelillo, Marcello; Shah, Mubarak
Large-scale Image Geo-Localization Using Dominant Sets Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 41, pp. 148 - 161, 2018.
Tags: Clustering, Dominant Sets, Geo-Localization, Re-Ranking | Links:
@article{Mequanint2020,
title = {Large-scale Image Geo-Localization Using Dominant Sets},
author = {Eyasu Zemene and Yonatan Tariku Tesfaye and Haroon Idrees and Andrea Prati and Marcello Pelillo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Publications_Large-Scale-Image-Geo-Localization-Using-Dominant-Sets.pdf
https://www.crcv.ucf.edu/research/projects/large-scale-image-geo-localization-using-dominant-sets/},
year = {2018},
date = {2018-01-01},
urldate = {2018-01-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {41},
pages = {148 - 161},
keywords = {Clustering, Dominant Sets, Geo-Localization, Re-Ranking},
pubstate = {published},
tppubtype = {article}
}
Rawat, Yogesh Singh; Rana, Aayush; Tirupattur, Praveen; Shah, Mubarak
Action and Object Detection for TRECVID Journal Article
In: 2018.
Tags:
@article{rawat2018action,
title = {Action and Object Detection for TRECVID},
author = {Yogesh Singh Rawat and Aayush Rana and Praveen Tirupattur and Mubarak Shah},
year = {2018},
date = {2018-01-01},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2017
Green, Aileen; Bagci, Ulas; Hussein, Sarfaraz; Kelly, Patrick V.; Muzaffar, Razi; Neuschwander-Tetri, Brent A.; Osman, Medhat
Brown adipose tissue detected by PET/CT imaging is associated with less central obesity Journal Article
In: Nuclear Medicine Communications, vol. 38, no. 7, pp. 629-635, 2017.
Tags: Deep Learning | Links:
@article{Greena2017b,
title = {Brown adipose tissue detected by PET/CT imaging is associated with less central obesity},
author = {Aileen Green and Ulas Bagci and Sarfaraz Hussein and Patrick V. Kelly and Razi Muzaffar and Brent A. Neuschwander-Tetri and Medhat Osman},
url = {https://www.crcv.ucf.edu/papers/nmc17.pdf},
year = {2017},
date = {2017-12-31},
volume = {38},
pages = {629-635},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Idrees, Haroon; Zamir, Amir Roshan; Jiang, Yu-Gang; Gorban, Alex; Laptev, Ivan; Sukthankar, Rahul; Shah, Mubarak
The THUMOS Challenge on Action Recognition for Videos "in the Wild" Journal Article
In: Computer Vision and Image Understanding, 2017.
Tags: Deep Learning | Links:
@article{Idrees2017,
title = {The THUMOS Challenge on Action Recognition for Videos "in the Wild"},
author = {Haroon Idrees and Amir Roshan Zamir and Yu-Gang Jiang and Alex Gorban and Ivan Laptev and Rahul Sukthankar and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/thumosCVIU.pdf},
year = {2017},
date = {2017-12-30},
journal = {Computer Vision and Image Understanding},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Dehghan, Afshin; Shah, Mubarak
Binary Quadratic Programing for Online Tracking of Hundreds of People in Extremely Crowded Scenes Journal Article
In: Transactions on Pattern Analysis and Machine Intelligence, 2017.
Tags: Crowd Analysis, Deep Learning | Links:
@article{Dehghan2017,
title = {Binary Quadratic Programing for Online Tracking of Hundreds of People in Extremely Crowded Scenes},
author = {Afshin Dehghan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/05/Publications_Binary-Quadratic-Programing-for-Online-Tracking-of-Hundreds-of-People-in-Extremely-Crowded-Scenes.pdf},
year = {2017},
date = {2017-12-28},
urldate = {2017-12-28},
journal = {Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Crowd Analysis, Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Rahmani, Hossein; Mian, Ajmal; Shah, Mubarak
Learning a Deep Model for Human Action Recognition from Novel Viewpoints Journal Article
In: Transactions on Pattern Analysis and Machine Intelligence, 2017.
Tags: Deep Learning | Links:
@article{Rahmani2017,
title = {Learning a Deep Model for Human Action Recognition from Novel Viewpoints },
author = {Hossein Rahmani and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/07893732.pdf},
year = {2017},
date = {2017-12-26},
journal = {Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Komatsu, S.; Markman, A.; Mahalanobis, Abhijit; Chen, Kenny; Javidi, Bahram
Three-dimensional integral imaging and object detection using long-wave infrared imaging Journal Article
In: Applied Optics , 2017.
Tags: Deep Learning
@article{Komatsu2017,
title = {Three-dimensional integral imaging and object detection using long-wave infrared imaging},
author = {S. Komatsu and A. Markman and Abhijit Mahalanobis and Kenny Chen and Bahram Javidi },
year = {2017},
date = {2017-12-12},
journal = {Applied Optics },
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Mahalanobis, Abhijit; Shilling, R.; Muise, Robert; Neifeld, Mark
High resolution imaging using a translating coded aperture Journal Article
In: Optical Engineering , vol. 56, no. 8, 2017.
Tags: Deep Learning
@article{Mahalanobis2017,
title = {High resolution imaging using a translating coded aperture},
author = {Abhijit Mahalanobis and R. Shilling and Robert Muise and Mark Neifeld},
year = {2017},
date = {2017-08-22},
journal = {Optical Engineering },
volume = {56},
number = {8},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Prokaj, Jan; da Vitoria Lobo, Niels
Scale Space Based Grammar for Hand Detection Conference
Springer Lecture Notes in Computer Science, 2017.
Tags: REU
@conference{Prokaj2017,
title = {Scale Space Based Grammar for Hand Detection},
author = {Jan Prokaj and Niels da Vitoria Lobo },
year = {2017},
date = {2017-08-08},
booktitle = {Springer Lecture Notes in Computer Science},
keywords = {REU},
pubstate = {published},
tppubtype = {conference}
}
Finocchiaro, Jessica; Khan, AU; Borji, Ali
Egocentric Height Estimation Conference
WACV, 2017.
Tags: REU
@conference{Finocchiaro2017b,
title = { Egocentric Height Estimation},
author = {Jessica Finocchiaro and AU Khan and Ali Borji},
year = {2017},
date = {2017-08-02},
booktitle = {WACV},
keywords = {REU},
pubstate = {published},
tppubtype = {conference}
}
Laurel, Jacob; Sharghi, Aidean; et al.
Query-focused video summarization: Dataset, Evaluation, and a Memory Network Based Approach Conference
CVPR, 2017.
Tags: REU
@conference{Laurel2017,
title = {Query-focused video summarization: Dataset, Evaluation, and a Memory Network Based Approach},
author = {Jacob Laurel and Aidean Sharghi and others},
year = {2017},
date = {2017-08-01},
booktitle = {CVPR},
keywords = {REU},
pubstate = {published},
tppubtype = {conference}
}
Sharghi, Aidean; Laurel, J.; Gong, Boqing
Query-Focused Video Summarization: Dataset, Evaluation, and A Memory Network Based Approach Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2017.
Tags: REU
@conference{Sharghi2017b,
title = {Query-Focused Video Summarization: Dataset, Evaluation, and A Memory Network Based Approach},
author = {Aidean Sharghi and J. Laurel and Boqing Gong },
year = {2017},
date = {2017-07-22},
booktitle = { IEEE Conference on Computer Vision and Pattern Recognition},
keywords = {REU},
pubstate = {published},
tppubtype = {conference}
}
Spampinato, Concetto; Palazzo, Simone; Kavasidis, Isaak; Giordano, Daniela; Shah, Mubarak; Souly, Nasim
Deep Learning Human Mind for Automated Visual Classification Conference
IEEE Conference on Computer Vision and Pattern Recognition, 2017.
Tags: Classification, CVPR, EEG/Brain | Links:
@conference{Spampinato2017,
title = {Deep Learning Human Mind for Automated Visual Classification},
author = {Concetto Spampinato and Simone Palazzo and Isaak Kavasidis and Daniela Giordano and Mubarak Shah and Nasim Souly},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/08/Publications_Deep-Learning-Human-Mind-for-Automated-Visual-Classification.pdf},
year = {2017},
date = {2017-07-21},
urldate = {2017-07-21},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition},
keywords = {Classification, CVPR, EEG/Brain},
pubstate = {published},
tppubtype = {conference}
}
Komatsu, S.; Markman, A.; Mahalanobis, Abhijit; Chen, Kenny; Javidi, Bahram
Passive long-wave infrared three-dimensional integral imaging for face detection and depth estimation: an overview Proceedings
Three-Dimensional Imaging, Visualization, and Display , 2017.
Tags: Deep Learning | Links:
@proceedings{Komatsu2020,
title = {Passive long-wave infrared three-dimensional integral imaging for face detection and depth estimation: an overview},
author = {S. Komatsu and A. Markman and Abhijit Mahalanobis and Kenny Chen and Bahram Javidi},
url = {https://www.spiedigitallibrary.org/conference-proceedings-of-spie/10219/1021918/Passive-long-wave-infrared-three-dimensional-integral-imaging-for-face/10.1117/12.2276286.short},
year = {2017},
date = {2017-05-10},
publisher = {Three-Dimensional Imaging, Visualization, and Display },
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {proceedings}
}
Hussein, Sarfaraz; Gillies, Robert; Cao, Kunlin; Song, Qi; Bagci, Ulas
TumorNET: Lung Nodule Characterization using Multi-View Convolutional Neural Network with Gaussian Process Conference
IEEE ISBI , 2017.
Tags: Deep Learning | Links:
@conference{Hussein2017b,
title = {TumorNET: Lung Nodule Characterization using Multi-View Convolutional Neural Network with Gaussian Process},
author = {Sarfaraz Hussein and Robert Gillies and Kunlin Cao and Qi Song and Ulas Bagci},
url = {https://www.crcv.ucf.edu/papers/1703.00645.pdf},
year = {2017},
date = {2017-04-10},
publisher = {IEEE ISBI },
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Torosdagli, Nelisah; Liberton, Denise; Verma, Payal; Sincan, Murat; Lee, Janice; Pattanaik, Sumantha; Bagci, Ulas
Robust and Fully Automated Segmentation of Mandible from CT Scans Journal Article
In: IEEE ISBI 2017, 2017.
Tags: Deep Learning | Links:
@article{Torosdagli2017,
title = {Robust and Fully Automated Segmentation of Mandible from CT Scans},
author = {Nelisah Torosdagli and Denise Liberton and Payal Verma and Murat Sincan and Janice Lee and Sumantha Pattanaik and Ulas Bagci},
url = {https://www.crcv.ucf.edu/papers/1702.07059.pdf},
year = {2017},
date = {2017-04-09},
journal = { IEEE ISBI 2017},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {article}
}
Hou, Rui; Chen, Chen; Shah, Mubarak
An End-to-end 3D Convolutional Neural Network for Action Detection and Segmentation in Videos Conference
Cornell University Library, 2017.
Tags: Detection, Segmentation | Links:
@conference{Hou2016,
title = {An End-to-end 3D Convolutional Neural Network for Action Detection and Segmentation in Videos},
author = {Rui Hou and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/ST-CNN-arXiv.pdf},
year = {2017},
date = {2017-03-31},
publisher = {Cornell University Library},
keywords = {Detection, Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Tesfaye, Yonatan Tariku; Zemene, Eyasu; Prati, Andrea; Pelillo, Marcello; Shah, Mubarak
Multi-Target Tracking in Multiple Non-Overlapping Cameras using Constrained Dominant Sets Journal Article
In: Cornell University Library, 2017.
Tags: Tracking | Links:
@article{Tesfaye2017,
title = { Multi-Target Tracking in Multiple Non-Overlapping Cameras using Constrained Dominant Sets},
author = {Yonatan Tariku Tesfaye and Eyasu Zemene and Andrea Prati and Marcello Pelillo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/1706.06196.pdf},
year = {2017},
date = {2017-03-30},
journal = {Cornell University Library},
keywords = {Tracking},
pubstate = {published},
tppubtype = {article}
}
Green, Aileen; Bagci, Ulas; Hussein, Sarfaraz; Kelly, Patrick V.; Muzaffar, Razi; Neuschwander-Tetri, Brent A.; Osman, Medhat
Brown adipose tissue detected by PET/CT imaging is associated with less central obesity Journal Article
In: Nuclear Medicine Communications, vol. 38, no. 7, pp. 629-635, July 2017.
Tags: Medical, Segmentation | Links:
@article{Greena2017,
title = {Brown adipose tissue detected by PET/CT imaging is associated with less central obesity},
author = {Aileen Green and Ulas Bagci and Sarfaraz Hussein and Patrick V. Kelly and Razi Muzaffar and Brent A. Neuschwander-Tetri and Medhat Osman},
url = {https://www.crcv.ucf.edu/papers/nmc17.pdf},
year = {2017},
date = {2017-03-29},
journal = {Nuclear Medicine Communications, vol. 38, no. 7, pp. 629-635, July 2017},
keywords = {Medical, Segmentation},
pubstate = {published},
tppubtype = {article}
}
Hussein, Sarfaraz; Cao, Kunlin; Song, Qi; Bagci, Ulas
Risk Stratification of Lung Nodules Using 3D CNN-Based Multi-task Learning Conference
Cornell University Library, 2017.
Tags: Medical | Links:
@conference{Hussein2017,
title = { Risk Stratification of Lung Nodules Using 3D CNN-Based Multi-task Learning},
author = {Sarfaraz Hussein and Kunlin Cao and Qi Song and Ulas Bagci},
url = {https://www.crcv.ucf.edu/papers/1704.08797v1.pd.pdf},
year = {2017},
date = {2017-03-28},
address = {Cornell University Library},
keywords = {Medical},
pubstate = {published},
tppubtype = {conference}
}
Mazaheri, Amir; Zhang, Dong; Shah, Mubarak
Video Fill In the Blank using LR/RL LSTMs with Spatial-Temporal Attentions Conference
Cornell University Library, 2017.
Tags: Video Description | Links:
@conference{Mazaheri2017,
title = {Video Fill In the Blank using LR/RL LSTMs with Spatial-Temporal Attentions},
author = {Amir Mazaheri and Dong Zhang and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/1704.04689v1.pd.pdf},
year = {2017},
date = {2017-03-26},
publisher = {Cornell University Library},
keywords = {Video Description},
pubstate = {published},
tppubtype = {conference}
}
Palazzo, Simone; Spampinato, Concetto; Kavasidis, Isaak; Giordano, Daniela; Shah, Mubarak
Generative Adversarial Networks Conditioned by Brain Signals Conference
IEEE International Conference on Computer Vision (ICCV), 2017.
Tags: Deep Learning, EEG/Brain | Links:
@conference{Palazzo2017,
title = {Generative Adversarial Networks Conditioned by Brain Signals},
author = {Simone Palazzo and Concetto Spampinato and Isaak Kavasidis and Daniela Giordano and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/iccv17/egpaper_for_review.pdf},
year = {2017},
date = {2017-03-25},
urldate = {2017-03-25},
publisher = {IEEE International Conference on Computer Vision (ICCV)},
keywords = {Deep Learning, EEG/Brain},
pubstate = {published},
tppubtype = {conference}
}
LaLonde, Rodney; Zhang, Dong; Shah, Mubarak
Fully Convolutional Deep Neural Networks for Persistent Multi-Frame Multi-Object Detection in Wide Area Aerial Videos Conference
Cornell University Library, 2017.
Tags: Detection | Links:
@conference{LaLonde2017,
title = { Fully Convolutional Deep Neural Networks for Persistent Multi-Frame Multi-Object Detection in Wide Area Aerial Videos},
author = {Rodney LaLonde and Dong Zhang and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/1704.02694v1.pdf},
year = {2017},
date = {2017-03-23},
publisher = {Cornell University Library},
keywords = {Detection},
pubstate = {published},
tppubtype = {conference}
}
Soomro, Khurram; Shah, Mubarak
Unsupervised Action Discovery and Localization in Videos Conference
Proceedings of the IEEE International Conference on Computer Vision (ICCV), 2017.
Tags: Deep Learning | Links:
@conference{Soomro2017,
title = {Unsupervised Action Discovery and Localization in Videos},
author = {Khurram Soomro and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/iccv17/Soomro_ICCV17.pdf},
year = {2017},
date = {2017-03-23},
publisher = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Hou, Rui; Chen, Chen; Shah, Mubarak
Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos Conference
Cornell University Library, 2017.
Tags: Detection | Links:
@conference{Hou2017,
title = {Tube Convolutional Neural Network (T-CNN) for Action Detection in Videos},
author = {Rui Hou and Chen Chen and Mubarak Shah},
url = {https://arxiv.org/pdf/1703.10664.pdf},
year = {2017},
date = {2017-03-22},
publisher = {Cornell University Library},
keywords = {Detection},
pubstate = {published},
tppubtype = {conference}
}
Souly, Nasim; Spampinato, Concetto; Shah, Mubarak
Semi Supervised Semantic Segmentation Using Generative Adversarial Network Conference
IEEE International Conference on Computer Vision (ICCV), 2017.
Tags: Deep Learning | Links:
@conference{Souly2017b,
title = { Semi Supervised Semantic Segmentation Using Generative Adversarial Network},
author = {Nasim Souly and Concetto Spampinato and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/iccv17/GAN_Semantic_cameraReady.pdf},
year = {2017},
date = {2017-03-22},
publisher = { IEEE International Conference on Computer Vision (ICCV)},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Fan, Deng-Ping; Cheng, Ming-Ming; Liu, Yun; Li, Tao; Borji, Ali
Structure-measure: A New Way to Evaluate Foreground Maps Conference
IEEE International Conference on Computer Vision (ICCV), 2017.
Tags: Deep Learning | Links:
@conference{Fan2017,
title = { Structure-measure: A New Way to Evaluate Foreground Maps},
author = {Deng-Ping Fan and Ming-Ming Cheng and Yun Liu and Tao Li and Ali Borji},
url = {https://www.crcv.ucf.edu/papers/iccv17/1164.pdf},
year = {2017},
date = {2017-03-21},
publisher = {IEEE International Conference on Computer Vision (ICCV)},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Souly, Nasim; Spampinato, Concetto; Shah, Mubarak
Semi and Weakly Supervised Semantic Segmentation Using Generative Adversarial Network Conference
Cornell University Library, 2017.
Tags: Segmentation | Links:
@conference{Souly2017,
title = {Semi and Weakly Supervised Semantic Segmentation Using Generative Adversarial Network},
author = {Nasim Souly and Concetto Spampinato and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/1703.09695.pdf},
year = {2017},
date = {2017-03-20},
publisher = {Cornell University Library},
keywords = {Segmentation},
pubstate = {published},
tppubtype = {conference}
}
Tavakoli, Hamed R.; Shetty, Rakshith; Borji, Ali; Laaksonen, Jorma
Paying Attention to Descriptions Generated by Image Captioning Models Conference
IEEE International Conference on Computer Vision (ICCV), Venice, Italy, 2017.
Tags: Deep Learning | Links:
@conference{Tavakoli2017,
title = {Paying Attention to Descriptions Generated by Image Captioning Models},
author = {Hamed R. Tavakoli and Rakshith Shetty and Ali Borji and Jorma Laaksonen},
url = {https://www.crcv.ucf.edu/papers/iccv17/1704.07434.pdf},
year = {2017},
date = {2017-03-19},
publisher = { IEEE International Conference on Computer Vision (ICCV), Venice, Italy},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Wang, Tiantian; Borji, Ali; Zhang, Lihe; Zhang, Pingping; Lu, Huchuan
A Stagewise Refinement Model for Detecting Salient Objects in Images Conference
IEEE International Conference on Computer Vision (ICCV), Venice, Italy, 2017.
Tags: Deep Learning | Links:
@conference{Wang2017,
title = {A Stagewise Refinement Model for Detecting Salient Objects in Images},
author = {Tiantian Wang and Ali Borji and Lihe Zhang and Pingping Zhang and Huchuan Lu},
url = {https://www.crcv.ucf.edu/papers/iccv17/1709.pdf},
year = {2017},
date = {2017-03-17},
publisher = { IEEE International Conference on Computer Vision (ICCV), Venice, Italy},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Zhang, Yang; David, P.; Gong, Boqing
Curriculum Domain Adaptation for Semantic Segmentation of Urban Scenes Conference
IEEE International Conference on Computer Vision (ICCV), Venice, Italy, 2017.
Tags: Deep Learning | Links:
@conference{Zhang2017,
title = {Curriculum Domain Adaptation for Semantic Segmentation of Urban Scenes},
author = {Yang Zhang and P. David and Boqing Gong},
url = {https://www.crcv.ucf.edu/papers/iccv17/CurriculumDA.pdf
https://www.crcv.ucf.edu/papers/iccv17/VQS-Supp.pdf
https://github.com/Cold-Winter/vqs},
year = {2017},
date = {2017-03-16},
publisher = {IEEE International Conference on Computer Vision (ICCV), Venice, Italy},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Gan, C.; Li, Y.; Li, H.; Sun, C.; Gong, Boqing
VQS: Linking Segmentations to Questions and Answers for Supervised Attention in VQA and Question-Focused Semantic Segmentation Conference
IEEE International Conference on Computer Vision (ICCV), Venice, Italy, 2017.
Tags: Deep Learning | Links:
@conference{Gan2017,
title = {VQS: Linking Segmentations to Questions and Answers for Supervised Attention in VQA and Question-Focused Semantic Segmentation},
author = {C. Gan and Y. Li and H. Li and C. Sun and Boqing Gong},
url = {https://www.crcv.ucf.edu/papers/iccv17/VQS.pdf
https://www.crcv.ucf.edu/papers/iccv17/1707.09465.pd.pdf},
year = {2017},
date = {2017-03-13},
publisher = { IEEE International Conference on Computer Vision (ICCV), Venice, Italy},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Prakash, Harish Ravi; Korostenskaja, Milena; Castillo, Eduardo; Lee, Ki; Baumgartner, James; Bagci, Ulas
Automatic Response Assessment in Regions of Language Cortex in Epilepsy Patients Using ECoG-based Functional Mapping and Machine Learning Conference
IEEE SMC, 2017.
Tags: Deep Learning | Links:
@conference{Prakash2017,
title = {Automatic Response Assessment in Regions of Language Cortex in Epilepsy Patients Using ECoG-based Functional Mapping and Machine Learning},
author = {Harish Ravi Prakash and Milena Korostenskaja and Eduardo Castillo and Ki Lee and James Baumgartner and Ulas Bagci},
url = {https://www.crcv.ucf.edu/papers/1706.01380.pdf},
year = {2017},
date = {2017-03-13},
publisher = { IEEE SMC},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Mortazi, Aliasghar; Karim, Rashed; Rhode, Kawal; Burt, Jeremy; Bagci, Ulas
CardiacNET: Segmentation of Left Atrium and Proximal Pulmonary Veins from MRI Using Multi-View CNN Conference
MICCAI 2017, 2017.
Tags: Deep Learning | Links:
@conference{Mortazi2017,
title = {CardiacNET: Segmentation of Left Atrium and Proximal Pulmonary Veins from MRI Using Multi-View CNN},
author = {Aliasghar Mortazi and Rashed Karim and Kawal Rhode and Jeremy Burt and Ulas Bagci},
url = {https://www.crcv.ucf.edu/papers/1705.06333.pdf},
year = {2017},
date = {2017-03-12},
publisher = {MICCAI 2017},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Finocchiaro, Jessica; Khan, Aisha Urooj; Borji, Ali
Egocentric Height Estimation Conference
WACV, 2017.
Tags: Deep Learning | Links:
@conference{Finocchiaro2017,
title = {Egocentric Height Estimation},
author = {Jessica Finocchiaro and Aisha Urooj Khan and Ali Borji},
url = {https://www.crcv.ucf.edu/papers/1610.02714.pdf},
year = {2017},
date = {2017-03-08},
publisher = { WACV },
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Kavasidis, Isaak; Palazzo, Simone; Spampinato, Concetto; Giordano, Daniela; Shah, Mubarak
Brain2Image: Converting Brain Signals into Images Conference
ACM Multimedia 25, Mountain View, CA, 2017.
Tags: Deep Learning, EEG/Brain | Links:
@conference{kavasidiskavasidis2017,
title = {Brain2Image: Converting Brain Signals into Images},
author = {Isaak Kavasidis and Simone Palazzo and Concetto Spampinato and Daniela Giordano and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/camera_ready_acmmm_BNI08.pdf},
year = {2017},
date = {2017-03-07},
urldate = {2017-03-07},
publisher = {ACM Multimedia 25, Mountain View, CA},
keywords = {Deep Learning, EEG/Brain},
pubstate = {published},
tppubtype = {conference}
}
Tian, Yicong; Chen, Chen; Shah, Mubarak
Cross-View Image Matching for Geo-localization in Urban Environments Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017), 2017.
Tags: Deep Learning | Links:
@conference{Tian2017b,
title = { Cross-View Image Matching for Geo-localization in Urban Environments},
author = {Yicong Tian and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/cvpr2017/geolocalization_cvpr17.pdf
https://www.crcv.ucf.edu/research/cross-view-image-matching-for-geo-localization-in-urban-environments/},
year = {2017},
date = {2017-02-28},
urldate = {2017-02-28},
publisher = { IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017)},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Kalayeh, Mahdi M.; Gong, Boqing; Shah, Mubarak
Improving Facial Attribute Prediction using Semantic Segmentation Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017), 2017.
Tags: Deep Learning | Links:
@conference{M.Kalayeh2017,
title = {Improving Facial Attribute Prediction using Semantic Segmentation},
author = {Mahdi M. Kalayeh and Boqing Gong and Mubarak Shah},
url = {https://www.crcv.ucf.edu/papers/cvpr2017/Kalayeh_CVPR2017.pdf},
year = {2017},
date = {2017-02-27},
publisher = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017)},
keywords = {Deep Learning},
pubstate = {published},
tppubtype = {conference}
}
Sharghi, Aidean; Laurel, J.; Gong, Boqing
Query-Focused Video Summarization: Dataset, Evaluation, and A Memory Network Based Approach Conference
IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2017), 2017.
Tags: Deep Learning | Links: