You can access the CRCV Publications Page for enhanced search capabilities.
2020
Sun, ShiJie; Akhtar, Naveed; Song, XiangYu; Song, HuanSheng; Mian, Ajmal; Shah, Mubarak
Simultaneous Detection and Tracking with Motion Modelling for Multiple Object Tracking Conference
16th European Conference on Computer Vision, 2020.
@conference{Sun2020,
title = {Simultaneous Detection and Tracking with Motion Modelling for Multiple Object Tracking},
author = {ShiJie Sun and Naveed Akhtar and XiangYu Song and HuanSheng Song and Ajmal Mian and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Simultaneous-Detection-and-Tracking-with-Motion-Modelling-for-Multiple-Object-Tracking.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Simultaneous-Detection-and-Tracking-with-Motion-Modelling-for-Multiple-Object-Tracking_Supp.pdf
https://shijies.github.io/DMMN_Page/},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {Deep learning based Multiple Object Tracking (MOT) currently relies on off-the-shelf detectors for tracking-by-detection. This results in deep models that are detector biased and evaluations that are detector influenced. To resolve this issue, we introduce Deep Motion Modeling Network (DMM-Net) that can estimate multiple objects’ motion parameters to perform joint detection and association in an end-to-end manner. DMM-Net models object features over multiple frames and simultaneously infers object classes, visibility, and motion parameters. These outputs are readily used to update the tracklets for efficient MOT. DMM-Net achieves a PR-MOTA score of 12.80 at over 120 fps on the popular UA-DETRAC challenge, outperforming existing methods while running orders of magnitude faster. We also contribute Omni-MOT, a synthetic large-scale public dataset for vehicle tracking that provides precise ground-truth annotations to eliminate detector influence in MOT evaluation. This 14M+ frame dataset is extendable with our public script (Code at Dataset, Dataset Recorder, Omni-MOT Source). We demonstrate the suitability of Omni-MOT for deep learning with DMM-Net, and also make the source code of our network public.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
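A minimal NumPy sketch of the motion-modelling idea described in the abstract: per-object motion parameters are decoded into boxes over a temporal window, and predicted visibility gates the tracklet update. The quadratic motion model and all names here are illustrative assumptions, not the paper's exact design.

# Minimal sketch (NumPy) of decoding per-object motion parameters into
# boxes for joint detection and tracking, in the spirit of DMM-Net.
# The quadratic model and all names are illustrative assumptions.
import numpy as np

def decode_motion(params, times):
    """Decode per-object quadratic motion parameters into boxes.

    params: (N, 4, 3) array; for each object and each box coordinate
            (cx, cy, w, h), a quadratic (a, b, c) so coord(t) = a*t^2 + b*t + c.
    times:  (T,) array of frame times within the input window.
    Returns boxes of shape (T, N, 4).
    """
    t = times[:, None, None]                                  # (T, 1, 1)
    a, b, c = params[..., 0], params[..., 1], params[..., 2]  # each (N, 4)
    return a * t**2 + b * t + c                               # (T, N, 4)

def update_tracklets(tracklets, boxes, visibility, thresh=0.5):
    """Append a window of predicted boxes to per-object tracklets,
    keeping only time steps where the object is predicted visible."""
    T, N, _ = boxes.shape
    for n in range(N):
        for t in range(T):
            if visibility[t, n] > thresh:
                tracklets.setdefault(n, []).append(boxes[t, n])
    return tracklets

# Toy usage: 2 objects over a 4-frame window.
params = np.random.randn(2, 4, 3) * 0.1
boxes = decode_motion(params, np.arange(4, dtype=float))
tracks = update_tracklets({}, boxes, visibility=np.ones((4, 2)))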
Xie, Jin; Cholakkal, Hisham; Anwer, Rao Muhammad; Khan, Fahad Shahbaz; Pang, Yanwei; Shao, Ling; Shah, Mubarak
Count- and Similarity-aware R-CNN for Pedestrian Detection Conference
16th European Conference on Computer Vision, 2020.
@conference{Xie2020,
title = {Count- and Similarity-aware R-CNN for Pedestrian Detection},
author = {Jin Xie and Hisham Cholakkal and Rao Muhammad Anwer and Fahad Shahbaz Khan and Yanwei Pang and Ling Shao and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Publications_Count-and-Similarity-aware-R-CNN-for-Pedestrian-Detection.pdf},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {Recent pedestrian detection methods generally rely on additional supervision, such as visible bounding-box annotations, to handle heavy occlusions. We propose an approach that leverages pedestrian count and proposal similarity information within a two-stage pedestrian detection framework. Both pedestrian count and proposal similarity are derived from standard full-body annotations commonly used to train pedestrian detectors. We introduce a count-weighted detection loss function that assigns higher weights to the detection errors occurring at highly overlapping pedestrians. The proposed loss function is utilized at both stages of the two-stage detector. We further introduce a count-and-similarity branch within the two-stage detection framework, which predicts pedestrian count as well as proposal similarity to identify distinct proposals. Our approach requires neither part information nor visible bounding-box annotations. Experiments are performed on the CityPersons and CrowdHuman datasets. Our method sets a new state-of-the-art on both datasets. Further, it achieves an absolute gain of 2.4% over the current state-of-the-art, in terms of log-average miss rate, on the heavily occluded (HO) set of the CityPersons test set, without using additional visible bounding-box supervision. Finally, we demonstrate the applicability of our approach for the problem of human instance segmentation. Code and models are available at: https://github.com/Leotju/CaSe.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
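A minimal PyTorch sketch of a count-weighted detection loss in the spirit of the abstract: per-proposal classification errors are up-weighted where a proposal overlaps many ground-truth pedestrians. The weighting form (1 + alpha * count) is an assumption, not the paper's exact formulation.

# Minimal PyTorch sketch of a count-weighted classification loss.
# The (1 + alpha * count) weighting is an illustrative assumption.
import torch
import torch.nn.functional as F

def box_iou(a, b):
    """IoU between boxes a (N, 4) and b (M, 4) in (x1, y1, x2, y2) format."""
    area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
    area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
    lt = torch.max(a[:, None, :2], b[None, :, :2])
    rb = torch.min(a[:, None, 2:], b[None, :, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    return inter / (area_a[:, None] + area_b[None, :] - inter + 1e-9)

def count_weighted_cls_loss(logits, labels, proposals, gt_boxes,
                            iou_thresh=0.5, alpha=1.0):
    """Cross-entropy where each proposal's error is scaled by the number
    of ground-truth boxes it overlaps (a proxy for crowd density)."""
    counts = (box_iou(proposals, gt_boxes) > iou_thresh).sum(dim=1).float()
    weights = 1.0 + alpha * counts
    per_proposal = F.cross_entropy(logits, labels, reduction="none")
    return (weights * per_proposal).mean()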
Schatz, Kara Marie; Quintanilla, Erik; Vyas, Shruti; Rawat, Yogesh Singh
A Recurrent Transformer Network for Novel View Action Synthesis Conference
16th European Conference on Computer Vision, 2020.
@conference{Schatz2020,
title = {A Recurrent Transformer Network for Novel View Action Synthesis},
author = {Kara Marie Schatz and Erik Quintanilla and Shruti Vyas and Yogesh Singh Rawat},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Projects_A-Recurrent-Transformer-Network-for-Novel-View-Action-Synthesis.pdf
https://www.crcv.ucf.edu/research/projects/a-recurrent-transformer-network-for-novel-view-action-synthesis/},
year = {2020},
date = {2020-08-23},
urldate = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {In this work, we address the problem of synthesizing human actions from novel views. Given an input video of an actor performing some action, we aim to synthesize a video with the same action performed from a novel view with the help of an appearance prior. We propose an end-to-end deep network to solve this problem. The proposed network utilizes the change in viewpoint to transform the action from the input view to the novel view in feature space. The transformed action is integrated with the target appearance using the proposed recurrent transformer network, which provides a transformed appearance for each time-step in the action sequence. The recurrent transformer network utilizes action key-points, which are determined in an unsupervised manner using the encoded action features. We also propose a hierarchical structure for the recurrent transformation, which further improves performance. We demonstrate the effectiveness of the proposed method through extensive experiments conducted on the large-scale multi-view action recognition dataset NTU-RGB+D. In addition, we show that the proposed method can transform the action to a novel viewpoint with an entirely different scene or actor. The code is publicly available at https://github.com/schatzkara/cross-view-video.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
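A minimal PyTorch sketch of a recurrent transformation step suggested by the abstract: at each time step, encoded action features for the novel view modulate the target appearance, and a recurrent cell carries the transformed state forward. The module names, dimensions, and the GRU cell are placeholder assumptions, not the paper's architecture.

# Minimal PyTorch sketch of a recurrent appearance-transformation step.
# Names, sizes, and the GRU cell are placeholder assumptions.
import torch
import torch.nn as nn

class RecurrentTransform(nn.Module):
    def __init__(self, app_dim=256, act_dim=128, hid_dim=256):
        super().__init__()
        self.fuse = nn.Linear(app_dim + act_dim, hid_dim)
        self.cell = nn.GRUCell(hid_dim, hid_dim)
        self.to_app = nn.Linear(hid_dim, app_dim)

    def forward(self, appearance, action_seq):
        """appearance: (B, app_dim) prior; action_seq: (T, B, act_dim).
        Returns a transformed appearance per time step, (T, B, app_dim)."""
        h = torch.zeros(action_seq.size(1), self.cell.hidden_size,
                        device=appearance.device)
        outs = []
        for act_t in action_seq:                      # iterate over T steps
            x = torch.relu(self.fuse(torch.cat([appearance, act_t], dim=-1)))
            h = self.cell(x, h)
            outs.append(self.to_app(h))
        return torch.stack(outs)                      # (T, B, app_dim)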
Vyas, Shruti; Rawat, Yogesh Singh; Shah, Mubarak
Multi-view Action Recognition using Cross-view Video Prediction Conference
16th European Conference on Computer Vision, 2020.
@conference{Vyas2020,
title = {Multi-view Action Recognition using Cross-view Video Prediction},
author = {Shruti Vyas and Yogesh Singh Rawat and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Projects_Multi-view-Action-Recognition-using-Cross-view-Video-Prediction.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2020/07/Projects_Multi-view-Action-Recognition-using-Cross-view-Video-Prediction_Supp.pdf
https://www.crcv.ucf.edu/research/projects/multi-view-action-recognition-using-cross-view-video-prediction/},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {In this work, we address the problem of action recognition in a multi-view environment. Most of the existing approaches utilize pose information for multi-view action recognition. We focus on the RGB modality instead and propose an unsupervised representation learning framework, which encodes the scene dynamics in videos captured from multiple viewpoints via predicting actions from unseen views. The framework takes multiple short video clips from different viewpoints and times as input and learns a holistic internal representation, which is used to predict a video clip from an unseen viewpoint and time. The ability of the proposed network to render unseen video frames enables it to learn a meaningful and robust representation of the scene dynamics. We evaluate the effectiveness of the learned representation for multi-view video action recognition in a supervised setting. We observe a significant improvement in performance with the RGB modality on the NTU-RGB+D dataset, the largest dataset for multi-view action recognition. The proposed framework also achieves state-of-the-art results with the depth modality, which validates the generalization capability of the approach to other data modalities. The code is publicly available at https://github.com/svyas23/cross-view-action.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
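A minimal PyTorch sketch of the cross-view prediction objective from the abstract: clips from several observed viewpoints are encoded into one holistic representation, and a decoder conditioned on an unseen (viewpoint, time) query predicts the target clip under a reconstruction loss. The architecture below is a placeholder assumption, not the paper's network.

# Minimal PyTorch sketch of the cross-view prediction objective.
# Placeholder MLP encoder/decoder; not the paper's architecture.
import torch
import torch.nn as nn

class CrossViewPredictor(nn.Module):
    def __init__(self, feat_dim=256, query_dim=8, clip_dim=1024):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(clip_dim, feat_dim), nn.ReLU())
        self.decoder = nn.Sequential(
            nn.Linear(feat_dim + query_dim, feat_dim), nn.ReLU(),
            nn.Linear(feat_dim, clip_dim))

    def forward(self, clips, query):
        """clips: (B, V, clip_dim) flattened clips from V observed views;
        query: (B, query_dim) encoding of the unseen viewpoint and time."""
        rep = self.encoder(clips).mean(dim=1)     # pool views -> holistic rep
        return self.decoder(torch.cat([rep, query], dim=-1))

# Toy usage: train to reconstruct the clip from the unseen view.
model = CrossViewPredictor()
clips = torch.randn(2, 3, 1024)                   # 2 samples, 3 views each
query = torch.randn(2, 8)
target = torch.randn(2, 1024)                     # clip from the unseen view
loss = nn.functional.mse_loss(model(clips, query), target)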
Venkataramanan, Shashanka; Peng, Kuan-Chuan; Singh, Rajat Vikram; Mahalanobis, Abhijit
Attention Guided Anomaly Localization in Images Conference
16th European Conference on Computer Vision, 2020.
@conference{Venkataramanan2020,
title = {Attention Guided Anomaly Localization in Images},
author = {Shashanka Venkataramanan and Kuan-Chuan Peng and Rajat Vikram Singh and Abhijit Mahalanobis},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2020/07/2813.pdf
https://youtu.be/b-EQr-fGPWo},
year = {2020},
date = {2020-08-23},
booktitle = {16th European Conference on Computer Vision},
abstract = {Anomaly localization is an important problem in computer vision which involves localizing anomalous regions within images, with applications in industrial inspection, surveillance, and medical imaging. This task is challenging due to the small sample size and pixel coverage of the anomaly in real-world scenarios. Most prior works need to use anomalous training images to compute a class-specific threshold to localize anomalies. Without the need for anomalous training images, we propose Convolutional Adversarial Variational autoencoder with Guided Attention (CAVGA), which localizes the anomaly with a convolutional latent variable to preserve the spatial information. In the unsupervised setting, we propose an attention expansion loss, where we encourage CAVGA to focus on all normal regions in the image. Furthermore, in the weakly supervised setting, we propose a complementary guided attention loss, where we encourage the attention map to focus on all normal regions while minimizing the attention map corresponding to anomalous regions in the image. CAVGA outperforms the state-of-the-art (SOTA) anomaly localization methods on the MVTec Anomaly Detection (MVTAD), modified ShanghaiTech Campus (mSTC) and Large-scale Attention based Glaucoma (LAG) datasets in the unsupervised setting and when using only 2% anomalous images in the weakly-supervised setting. CAVGA also outperforms SOTA anomaly detection methods on the MNIST, CIFAR-10, Fashion-MNIST, MVTAD, mSTC and LAG datasets.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
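A minimal PyTorch sketch of the two attention losses named in the abstract: an attention-expansion loss that pushes attention on normal images toward covering the whole image, and, in the weakly supervised setting, a complementary term that additionally suppresses attention on anomalous images. The exact normalization of the maps is an assumption.

# Minimal PyTorch sketch of the attention losses described in the
# abstract; the normalization of the maps is an illustrative assumption.
import torch

def attention_expansion_loss(attn_normal):
    """attn_normal: (B, H, W) attention maps in [0, 1] for normal images.
    Encourages the attention to cover all (normal) regions."""
    return (1.0 - attn_normal).mean()

def complementary_guided_loss(attn_normal, attn_anomalous):
    """Weakly supervised variant: expand attention on normal images while
    minimizing attention on the few available anomalous images."""
    return (1.0 - attn_normal).mean() + attn_anomalous.mean()

# Toy usage with sigmoid-normalized maps.
a_norm = torch.sigmoid(torch.randn(4, 32, 32))
a_anom = torch.sigmoid(torch.randn(4, 32, 32))
loss = complementary_guided_loss(a_norm, a_anom)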