You can access the CRCV Publications Page for enhanced search capabilities.
2022
Vyas, Shruti; Chen, Chen; Shah, Mubarak
GAMa: Cross-view Video Geo-localization Conference
European Conference on Computer Vision, 2022.
@conference{Vyas2022,
  title     = {{GAMa}: Cross-view Video Geo-localization},
  author    = {Shruti Vyas and Chen Chen and Mubarak Shah},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1512.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1512-supp.pdf
https://youtu.be/KSHuer_VXJo},
  year      = {2022},
  date      = {2022-10-23},
  urldate   = {2022-10-23},
  booktitle = {European Conference on Computer Vision},
  abstract  = {The existing work in cross-view geo-localization is based on images where a ground panorama is matched to an aerial image. In this work, we focus on ground videos instead of images which provides additional contextual cues which are important for this task. There are no existing datasets for this problem, therefore we propose GAMa dataset, a large-scale dataset with ground videos and corresponding aerial images. We also propose a novel approach to solve this problem. At clip-level, a short video clip is matched with corresponding aerial image and is later used to get video-level geo-localization of a long video. Moreover, we propose a hierarchical approach to further improve the clip-level geo-localization. On this challenging dataset, with unaligned images and limited field of view, our proposed method achieves a Top-1 recall rate of 19.4\% and 45.1\% @1.0mile. Code \& dataset are available at this link.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Wenxuan; Chen, Chen; Wang, Jing; Zha, Sen; Zhang, Yan; Li, Jiangyun
Med-DANet: Dynamic Architecture Network for Efficient Medical Volumetric Segmentation Conference
European Conference on Computer Vision, 2022.
@conference{Wang2022,
  title     = {{Med-DANet}: Dynamic Architecture Network for Efficient Medical Volumetric Segmentation},
  author    = {Wenxuan Wang and Chen Chen and Jing Wang and Sen Zha and Yan Zhang and Jiangyun Li},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2206.06575.pdf},
  year      = {2022},
  date      = {2022-10-23},
  urldate   = {2022-10-23},
  booktitle = {European Conference on Computer Vision},
  abstract  = {For 3D medical image (e.g. CT and MRI) segmentation, the difficulty of segmenting each slice in a clinical case varies greatly. Previous research on volumetric medical image segmentation in a slice-by-slice manner conventionally use the identical 2D deep neural network to segment all the slices of the same case, ignoring the data heterogeneity among image slices. In this paper, we focus on multi-modal 3D MRI brain tumor segmentation and propose a dynamic architecture network named Med-DANet based on adaptive model selection to achieve effective accuracy and efficiency trade-off. For each slice of the input 3D MRI volume, our proposed method learns a slice-specific decision by the Decision Network to dynamically select a suitable model from the predefined Model Bank for the subsequent 2D segmentation task. Extensive experimental results on both BraTS 2019 and 2020 datasets show that our proposed method achieves comparable or better results than previous state-of-the-art methods for 3D MRI brain tumor segmentation with much less model complexity. Compared with the state-of-the-art 3D method TransBTS, the proposed framework improves the model efficiency by up to 3.5$\times$ without sacrificing the accuracy. Our code will be publicly available at https://github.com/Wenxuan-1119/Med-DANet.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Zhu, Sijie; Lin, Zhe; Cohen, Scott; Kuen, Jason; Zhang, Zhifei; Chen, Chen
GALA: Toward Geometry-and-Lighting-Aware Object Search for Compositing Conference
European Conference on Computer Vision, 2022.
@conference{Zhu2022,
  title     = {{GALA}: Toward Geometry-and-Lighting-Aware Object Search for Compositing},
  author    = {Sijie Zhu and Zhe Lin and Scott Cohen and Jason Kuen and Zhifei Zhang and Chen Chen},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2204.00125.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/GALA_supplementary.pdf},
  year      = {2022},
  date      = {2022-10-23},
  urldate   = {2022-10-23},
  booktitle = {European Conference on Computer Vision},
  abstract  = {Compositing-aware object search aims to find the most compatible objects for compositing given a background image and a query bounding box. Previous works focus on learning compatibility between the foreground object and background, but fail to learn other important factors from large-scale data, i.e. geometry and lighting. To move a step further, this paper proposes GALA (Geometry-and-Lighting-Aware), a generic foreground object search method with discriminative modeling on geometry and lighting compatibility for open-world image compositing. Remarkably, it achieves state-of-the-art results on the CAIS dataset and generalizes well on large-scale open-world datasets, i.e. Pixabay and Open Images. In addition, our method can effectively handle non-box scenarios, where users only provide background images without any input bounding box. A web demo (see supplementary materials) is built to showcase applications of the proposed method for compositing-aware search and automatic location/scale prediction for the foreground object.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Open Images. In addition, our method can effectively handle non-box scenarios, where users only provide background images without any input bounding box. A web demo (see supplementary materials) is built to showcase applications of the proposed method for compositing-aware search and automatic location/scale prediction for the foreground object.
Khan, Aisha Urooj; Kuehne, Hilde; Gan, Chuang; Lobo, Niels Da Vitoria; Shah, Mubarak
Weakly Supervised Grounding for VQA in Vision-Language Transformers Conference
European Conference on Computer Vision, 2022.
@conference{Khan2022,
  title     = {Weakly Supervised Grounding for {VQA} in Vision-Language Transformers},
  author    = {Aisha Urooj Khan and Hilde Kuehne and Chuang Gan and Niels Da Vitoria Lobo and Mubarak Shah},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1011.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/1011-supp.pdf
https://github.com/aurooj/WSG-VQA-VLTransformers
https://youtu.be/dekmVb6lq3I},
  year      = {2022},
  date      = {2022-10-23},
  urldate   = {2022-10-23},
  booktitle = {European Conference on Computer Vision},
  abstract  = {Transformers for visual-language representation learning have been getting a lot of interest and shown tremendous performance on visual question answering (VQA) and grounding. However, most systems that show good performance of those tasks still rely on pre-trained object detectors during training, which limits their applicability to the object classes available for those detectors. To mitigate this limitation, this paper focuses on the problem of weakly supervised grounding in the context of visual question answering in transformers. Our approach leverages capsules by transforming each visual token into a capsule representation in the visual encoder; it then uses activations from language self-attention layers as a text-guided selection module to mask those capsules before they are forwarded to the next layer. We evaluate our approach on the challenging GQA as well as VQA-HAT dataset for VQA grounding. Our experiments show that: while removing the information of masked objects from standard transformer architectures leads to a significant drop in performance, the integration of capsules significantly improves the grounding ability of such systems and provides new state-of-the-art results compared to other approaches in the field.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
detectors during training, which limits their applicability to the object classes available for those detectors. To mitigate this limitation, this paper
focuses on the problem of weakly supervised grounding in the context of visual question answering in transformers. Our approach leverages
capsules by transforming each visual token into a capsule representation in the visual encoder; it then uses activations from language self-attention layers as a text-guided selection module to mask those capsules before they are forwarded to the next layer. We evaluate our approach on the challenging GQA as well as VQA-HAT dataset for VQA grounding. Our experiments show that: while removing the information of masked
objects from standard transformer architectures leads to a significant drop in performance, the integration of capsules significantly improves the grounding ability of such systems and provides new state-of-the-art results compared to other approaches in the field.
Rizve, Mamshad Nayeem; Kardan, Navid; Khan, Salman; Khan, Fahad Shahbaz; Shah, Mubarak
OpenLDN: Learning to Discover Novel Classes for Open-World Semi-Supervised Learning Conference
European Conference on Computer Vision, 2022.
@conference{Rizve2022,
  title     = {{OpenLDN}: Learning to Discover Novel Classes for Open-World Semi-Supervised Learning},
  author    = {Mamshad Nayeem Rizve and Navid Kardan and Salman Khan and Fahad Shahbaz Khan and Mubarak Shah},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/6665.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/6665-supp.pdf
https://github.com/nayeemrizve/OpenLDN
https://youtu.be/p2lYqvklcjA},
  year      = {2022},
  date      = {2022-10-23},
  urldate   = {2022-10-23},
  booktitle = {European Conference on Computer Vision},
  abstract  = {Semi-supervised learning (SSL) is one of the dominant approaches to address the annotation bottleneck of supervised learning. Recent SSL methods can effectively leverage a large repository of unlabeled data to improve performance while relying on a small set of labeled data. One common assumption in most SSL methods is that the labeled and unlabeled data are from the same data distribution. However, this is hardly the case in many real-world scenarios, which limits their applicability. In this work, instead, we attempt to solve the challenging open-world SSL problem that does not make such an assumption. In the open-world SSL problem, the objective is to recognize samples of known classes, and simultaneously detect and cluster samples belonging to novel classes present in unlabeled data. This work introduces OpenLDN that utilizes a pairwise similarity loss to discover novel classes. Using a bi-level optimization rule this pairwise similarity loss exploits the information available in the labeled set to implicitly cluster novel class samples, while simultaneously recognizing samples from known classes. After discovering novel classes, OpenLDN transforms the open-world SSL problem into a standard SSL problem to achieve additional performance gains using existing SSL methods. Our extensive experiments demonstrate that OpenLDN outperforms the current state-of-the-art methods on multiple popular classification benchmarks while providing a better accuracy/training time trade-off. Code: https://github.com/nayeemrizve/OpenLDN},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
Rizve, Mamshad Nayeem; Kardan, Navid; Shah, Mubarak
Towards Realistic Semi-Supervised Learning Conference
European Conference on Computer Vision, 2022.
@conference{Rizve2022b,
  title     = {Towards Realistic Semi-Supervised Learning},
  author    = {Mamshad Nayeem Rizve and Navid Kardan and Mubarak Shah},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7402.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7402-supp.pdf
https://github.com/nayeemrizve/TRSSL
https://youtu.be/mE7GeQ35WyY},
  year      = {2022},
  date      = {2022-10-23},
  urldate   = {2022-10-23},
  booktitle = {European Conference on Computer Vision},
  abstract  = {Deep learning is pushing the state-of-the-art in many computer vision applications. However, it relies on large annotated data repositories, and capturing the unconstrained nature of the real-world data is yet to be solved. Semi-supervised learning (SSL) complements the annotated training data with a large corpus of unlabeled data to reduce annotation cost. The standard SSL approach assumes unlabeled data are from the same distribution as annotated data. Recently, a more realistic SSL problem, called open-world SSL, is introduced, where the unannotated data might contain samples from unknown classes. In this paper, we propose a novel pseudo-label based approach to tackle SSL in open-world setting. At the core of our method, we utilize sample uncertainty and incorporate prior knowledge about class distribution to generate reliable class-distribution-aware pseudo-labels for unlabeled data belonging to both known and unknown classes. Our extensive experimentation showcases the effectiveness of our approach on several benchmark datasets, where it substantially outperforms the existing state-of-the-art on seven diverse datasets including CIFAR-100 ($\sim$17\%), ImageNet-100 ($\sim$5\%), and Tiny ImageNet ($\sim$9\%). We also highlight the flexibility of our approach in solving novel class discovery task, demonstrate its stability in dealing with imbalanced data, and complement our approach with a technique to estimate the number of novel classes. Code: https://github.com/nayeemrizve/TRSSL},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {conference}
}
vision applications. However, it relies on large annotated data
repositories, and capturing the unconstrained nature of the real-world
data is yet to be solved. Semi-supervised learning (SSL) complements
the annotated training data with a large corpus of unlabeled data to
reduce annotation cost. The standard SSL approach assumes unlabeled
data are from the same distribution as annotated data. Recently, a more
realistic SSL problem, called open-world SSL, is introduced, where the
unannotated data might contain samples from unknown classes. In this paper, we propose a novel pseudo-label based approach to tackle SSL in
open-world setting. At the core of our method, we utilize sample uncertainty and incorporate prior knowledge about class distribution to generate reliable class-distribution-aware pseudo-labels for unlabeled data belonging to both known and unknown classes. Our extensive experimentation showcases the effectiveness of our approach on several benchmark datasets, where it substantially outperforms the existing state-of-the art on seven diverse datasets including CIFAR-100 (∼17%), ImageNet-100 (∼5%), and Tiny ImageNet (∼9%). We also highlight the flexibility of our approach in solving novel class discovery task, demonstrate its stability in dealing with imbalanced data, and complement our approach with a technique to estimate the number of novel classes. Code: https://github.com/nayeemrizve/TRSSL