The IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR) is the premier annual computer vision event comprising the main conference and several co-located workshops and short courses.
Members of UCF’s Center for Research in Computer Vision (CRCV) and their collaborators had a record 14 papers accepted to the CVPR 2023 (https://cvpr2023.thecvf.com/) conference, which will take place in Vancouver, Canada, from June 18–22, 2023.
The h5-index is the h-index for articles published in the last 5 complete years. According to Google Scholar Metrics, the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) is ranked 4th in the h5-index rankings on https://scholar.google.com/citations?view_op=top_venues&hl=en.
You can access the CRCV Publications Page for enhanced search capabilities.
Zhu, Sijie; Yang, Linjie; Chen, Chen; Shah, Mubarak; Shen, Xiaohui; Wang, Heng
R2Former: Unified retrieval and ranking Transformer for Place Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Zhu2023,
title = {{R2Former}: Unified Retrieval and Ranking {Transformer} for Place Recognition},
author = {Sijie Zhu and Linjie Yang and Chen Chen and Mubarak Shah and Xiaohui Shen and Heng Wang},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CVPR_2023_PlaceRecognitionFinal.pdf
https://arxiv.org/pdf/2304.03410.pdf
https://github.com/Jeff-Zilence/R2Former},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Visual Place Recognition (VPR) estimates the location of query images by matching them with images in a reference database. Conventional methods generally adopt aggregated CNN features for global retrieval and RANSAC-based geometric verification for reranking. However, RANSAC only considers geometric information but ignores other possible information that could be useful for reranking, e.g. local feature correlation, and attention values. In this paper, we propose a unified place recognition framework that handles both retrieval and reranking with a novel transformer model, named R2Former. The proposed reranking module takes feature correlation, attention value, and xy coordinates into account, and learns to determine whether the image pair is from the same location. The whole pipeline is end-to-end trainable and the reranking module alone can
also be adopted on other CNN or transformer backbones as a generic component. Remarkably, R2Former significantly
outperforms state-of-the-art methods on major VPR datasets with much less inference time and memory consumption.
It also achieves the state-of-the-art on the holdout MSLS challenge set and could serve as a simple yet strong solution for real-world large-scale applications. Experiments also show vision transformer tokens are comparable and sometimes better than CNN local features on local matching. The code will be publicly available. },
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
also be adopted on other CNN or transformer backbones as a generic component. Remarkably, R2Former significantly
outperforms state-of-the-art methods on major VPR datasets with much less inference time and memory consumption.
It also achieves the state-of-the-art on the holdout MSLS challenge set and could serve as a simple yet strong solution for real-world large-scale applications. Experiments also show vision transformer tokens are comparable and sometimes better than CNN local features on local matching. The code will be publicly available.
Gupta, Rohit; Roy, Anirban; Kim, Sujeong; Christensen, Claire; Grindal, Todd; Gerard, Sarah Nixon; Cincebeaux, Madeline; Divakaran, Ajay; Shah, Mubarak
Class Prototypes based Contrastive Learning for Classifying Multi-Label and Fine-Grained Educational Videos Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Gupta2023b,
title = {Class Prototypes based Contrastive Learning for Classifying Multi-Label and Fine-Grained Educational Videos},
author = {Rohit Gupta and Anirban Roy and Sujeong Kim and Claire Christensen and Todd Grindal and Sarah Nixon Gerard and Madeline Cincebeaux and Ajay Divakaran and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Rohit_SRI_CVPR2023_Multi_Modal_Multi_Label_Contrastive_Learning_Camera_Ready-4.pdf
https://www.rohitg.xyz/MMContrast/
https://nusci.csl.sri.com/project/APPROVE},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {The recent growth in the consumption of online media by children during early childhood necessitates data-driven tools enabling educators to filter out appropriate educational content for young learners. This paper presents an approach for detecting educational content in online videos. We focus on two widely used educational content classes: literacy and math. For each class, we choose prominent codes (sub-classes) based on the Common Core Standards. For example, literacy codes include `letter names', `letter sounds', and math codes include `counting', `sorting'. We pose this as a fine-grained multilabel classification problem as videos can contain multiple types of educational content and the content classes can get visually similar (e.g., `letter names' vs `letter sounds'). We propose a novel class prototypes based supervised contrastive learning approach that can handle fine-grained samples associated with multiple labels. We learn a class prototype for each class and a loss function is employed to minimize the distances between a class prototype and the samples from the class. Similarly,
distances between a class prototype and the samples from other classes are maximized. As the alignment between visual
and audio cues are crucial for effective comprehension, we consider a multimodal transformer network to capture the interaction between visual and audio cues in videos while learning the embedding for videos. For evaluation, we present a dataset, APPROVE, employing educational videos from YouTube labeled with fine-grained education classes by education researchers. APPROVE consists of 193 hours of expert-annotated videos with 19 classes. The proposed approach outperforms strong baselines on APPROVE and other benchmarks such as Youtube-8M, and COIN. The dataset is available at https://nusci.csl.sri.com/project/APPROVE.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
distances between a class prototype and the samples from other classes are maximized. As the alignment between visual
and audio cues are crucial for effective comprehension, we consider a multimodal transformer network to capture the interaction between visual and audio cues in videos while learning the embedding for videos. For evaluation, we present a dataset, APPROVE, employing educational videos from YouTube labeled with fine-grained education classes by education researchers. APPROVE consists of 193 hours of expert-annotated videos with 19 classes. The proposed approach outperforms strong baselines on APPROVE and other benchmarks such as Youtube-8M, and COIN. The dataset is available at https://nusci.csl.sri.com/project/APPROVE.
Dave, Ishan Rajendrakumar; Rizve, Mamshad Nayeem; Chen, Chen; Shah, Mubarak
TimeBalance: Temporally-Invariant and Temporally-Distinctive Video Representations for Semi-Supervised Action Recognition Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Dave2023,
title = {{TimeBalance}: Temporally-Invariant and Temporally-Distinctive Video Representations for Semi-Supervised Action Recognition},
author = {Ishan Rajendrakumar Dave and Mamshad Nayeem Rizve and Chen Chen and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/TimeBalance_CVPR23_arxiv.pdf
https://daveishan.github.io/timebalance_webpage/
https://github.com/DAVEISHAN/TimeBalance},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Semi-Supervised Learning can be more beneficial for the video domain compared to images because of its higher annotation
cost and dimensionality. Besides, any video understanding task requires reasoning over both spatial and temporal dimensions. In order to learn both the static and motion related features for the semi-supervised action recognition task, existing methods rely on hard input inductive biases like using two-modalities (RGB and Optical-flow) or two-stream of different playback rates.
Instead of utilizing unlabeled videos through diverse input streams, we rely on self-supervised video representations,
particularly, we utilize temporally-invariant and temporally-distinctive representations. We observe that these representations complement each other depending on the nature of the action. Based on this observation, we propose a student-teacher semi-supervised learning framework, TimeBalance, where we distill the knowledge from a temporally-invariant and a temporally-distinctive teacher. Depending on the nature of the unlabeled video, we dynamically combine the knowledge of these two teachers based on a novel temporal similarity-based reweighting scheme. Our method achieves state-of-the-art performance
on three action recognition benchmarks: UCF101, HMDB51, and Kinetics400. Code: https://github.com/DAVEISHAN/TimeBalance.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
cost and dimensionality. Besides, any video understanding task requires reasoning over both spatial and temporal dimensions. In order to learn both the static and motion related features for the semi-supervised action recognition task, existing methods rely on hard input inductive biases like using two-modalities (RGB and Optical-flow) or two-stream of different playback rates.
Instead of utilizing unlabeled videos through diverse input streams, we rely on self-supervised video representations,
particularly, we utilize temporally-invariant and temporally-distinctive representations. We observe that these representations complement each other depending on the nature of the action. Based on this observation, we propose a student-teacher semi-supervised learning framework, TimeBalance, where we distill the knowledge from a temporally-invariant and a temporally-distinctive teacher. Depending on the nature of the unlabeled video, we dynamically combine the knowledge of these two teachers based on a novel temporal similarity-based reweighting scheme. Our method achieves state-of-the-art performance
on three action recognition benchmarks: UCF101, HMDB51, and Kinetics400. Code: https://github.com/DAVEISHAN/TimeBalance.
Rizve, Mamshad Nayeem; Mittal, Gaurav; Yu, Ye; Hall, Matthew; Sajeev, Sandra; Shah, Mubarak; Chen, Mei
PivoTAL: Prior-Driven Supervision for Weakly-Supervised Temporal Action Localization Conference
IEEE Computer Vision and Pattern Recognition, 2023.
BibTeX | Links:
@conference{Rizve2023,
title = {{PivoTAL}: Prior-Driven Supervision for Weakly-Supervised Temporal Action Localization},
author = {Mamshad Nayeem Rizve and Gaurav Mittal and Ye Yu and Matthew Hall and Sandra Sajeev and Mubarak Shah and Mei Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR_2023.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR_2023_Supplemental_Material.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PivoTAL_CVPR2023_Poster.pdf
https://www.youtube.com/watch?v=6kAoQjXfzio},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Urooj, Aisha; Kuehne, Hilde; Wu, Bo; Chheu, Kim; Bousselham, Walid; Gan, Chuang; Lobo, Niels; Shah, Mubarak
Learning Situation Hyper-Graphs for Video Question Answering Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Urooj2023,
title = {Learning Situation Hyper-Graphs for Video Question Answering},
author = {Aisha Urooj and Hilde Kuehne and Bo Wu and Kim Chheu and Walid Bousselham and Chuang Gan and Niels Lobo and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2023072364-4.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/SHG_VQA_CVPR2023_cam_ready_supp.pdf
},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Answering questions about complex situations in videos requires not only capturing the presence of actors, objects, and their relations but also the evolution of these relationships over time. A situation hyper-graph is a representation that describes situations as scene sub-graphs for video frames and hyper-edges for connected sub-graphs and has been proposed to capture all such information in a compact structured form. In this work, we propose an architecture for Video Question Answering (VQA) that enables answering questions related to video content by predicting situation hyper-graphs, coined Situation Hyper-Graph based Video Question Answering (SHG-VQA). To this end, we train a situation hyper-graph decoder to implicitly identify graph representations with actions and object/human-object relationships from the input video clip. and to use cross-attention
between the predicted situation hyper-graphs and the question embedding to predict the correct answer. The proposed
method is trained in an end-to-end manner and optimized by a VQA loss with the cross-entropy function and a Hungarian
matching loss for the situation graph prediction. The effectiveness of the proposed architecture is extensively evaluated
on two challenging benchmarks: AGQA and STAR. Our results show that learning the underlying situation hypergraphs
helps the system to significantly improve its performance for novel challenges of video question-answering tasks.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
between the predicted situation hyper-graphs and the question embedding to predict the correct answer. The proposed
method is trained in an end-to-end manner and optimized by a VQA loss with the cross-entropy function and a Hungarian
matching loss for the situation graph prediction. The effectiveness of the proposed architecture is extensively evaluated
on two challenging benchmarks: AGQA and STAR. Our results show that learning the underlying situation hypergraphs
helps the system to significantly improve its performance for novel challenges of video question-answering tasks.
Bhunia, Ankan Kumar; Khan, Salman; Cholakkal, Hisham; Anwer, Rao Muhammad; Laaksonen, Jorma Tapio; Shah, Mubarak; Khan, Fahad
Person Image Synthesis via Denoising Diffusion Model Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Bhunia2023,
title = {Person Image Synthesis via Denoising Diffusion Model},
author = {Ankan Kumar Bhunia and Salman Khan and Hisham Cholakkal and Rao Muhammad Anwer and Jorma Tapio Laaksonen and Mubarak Shah and Fahad Khan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/person_image_synthesis_via_den-Camera-ready-PDF.pdf
https://lnkd.in/d-8v3r8B
https://lnkd.in/dGPTjvge
https://lnkd.in/dxcGQsUX
https://github.com/ankanbhunia/PIDM},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {The pose-guided person image generation task requires synthesizing photorealistic images of humans in arbitrary poses. The existing approaches use generative adversarial networks that do not necessarily maintain realistic textures or need dense correspondences that struggle to handle complex deformations and severe occlusions. In this work, we show how denoising diffusion models can be applied for high-fidelity person image synthesis with
strong sample diversity and enhanced mode coverage of the learnt data distribution. Our proposed Person Image Diffusion Model (PIDM) disintegrates the complex transfer problem into a series of simpler forward-backward denoising steps. This helps in learning plausible source-to-target transformation trajectories that result in faithful textures and undistorted appearance details. We introduce a `texture diffusion module' based on cross-attention to accurately model the correspondences between appearance and pose information available in source and target images. Further, we propose `disentangled classifier-free guidance' to ensure close resemblance between the conditional inputs and the synthesized output in terms of both pose and appearance information. Our extensive results on two large-scale benchmarks and a user study demonstrate the photorealism of our proposed approach under challenging scenarios. We also show how our generated images can help in downstream tasks. Code is available at https://github.com/ankanbhunia/PIDM.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
strong sample diversity and enhanced mode coverage of the learnt data distribution. Our proposed Person Image Diffusion Model (PIDM) disintegrates the complex transfer problem into a series of simpler forward-backward denoising steps. This helps in learning plausible source-to-target transformation trajectories that result in faithful textures and undistorted appearance details. We introduce a ‘texture diffusion module’ based on cross-attention to accurately model the correspondences between appearance and pose information available in source and target images. Further, we propose disentangled classifier-free guidance’ to ensure close resemblance between the conditional inputs and the synthesized output in terms of both pose and appearance information. Our extensive results on two large-scale benchmarks and a user study demonstrate the photorealism of our proposed approach under challenging scenarios. We also show how our generated images can help in downstream tasks. Code is available at https://github.com/ankanbhunia/PIDM.
Wasim, Syed Talal; Naseer, Muzammal; Khan, Salman; Khan, Fahad; Shah, Mubarak
Vita-CLIP: Video and text adaptive CLIP via Multimodal Prompting Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Wasim2023,
title = {{Vita-CLIP}: Video and Text Adaptive {CLIP} via Multimodal Prompting},
author = {Syed Talal Wasim and Muzammal Naseer and Salman Khan and Fahad Khan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/vita_clip_video_and_text_adapt-Camera-ready-PDF.pdf
},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Adopting contrastive image-text pretrained models like CLIP towards video classification has gained attention due to its cost-effectiveness and competitive performance. However, recent works in this area face a trade-off. Finetuning the pretrained model to achieve strong supervised performance results in low zero-shot generalization. Similarly, freezing the backbone to retain zero-shot capability causes significant drop in supervised accuracy. Because of this,
recent works in literature typically train separate models for supervised and zero-shot action recognition. In this work, we propose a multimodal prompt learning scheme that works to balance the supervised and zero-shot performance under a single unified training. Our prompting approach on the vision side caters for three aspects: 1) Global video-level prompts to model the data distribution; 2) Local frame-level prompts to provide per-frame discriminative
conditioning; and 3) a summary prompt to extract a condensed video representation. Additionally, we define a prompting scheme on the text side to augment the textual context. Through this prompting scheme, we can achieve state-of-the-art zero-shot performance on Kinetics-600, HMDB51 and UCF101 while remaining competitive in the supervised setting. By keeping the pretrained backbone frozen, we optimize a much lower number of parameters and retain the existing general representation which helps achieve the strong zero-shot performance. Our codes and models will be publicly released.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
recent works in literature typically train separate models for supervised and zero-shot action recognition. In this work, we propose a multimodal prompt learning scheme that works to balance the supervised and zero-shot performance under a single unified training. Our prompting approach on the vision side caters for three aspects: 1) Global video-level prompts to model the data distribution; 2) Local frame-level prompts to provide per-frame discriminative
conditioning; and 3) a summary prompt to extract a condensed video representation. Additionally, we define a prompting scheme on the text side to augment the textual context. Through this prompting scheme, we can achieve state-of-the-art zero-shot performance on Kinetics-600, HMDB51 and UCF101 while remaining competitive in the supervised setting. By keeping the pretrained backbone frozen, we optimize a much lower number of parameters and retain the existing general representation which helps achieve the strong zero-shot performance. Our codes and models will be publicly released.
Clark, Brandon Eric; Kerrigan, Alec; Kulkarni, Parth Parag; Cepeda, Vicente Vivanco; Shah, Mubarak
Where We Are and What We're Looking At: Query Based Worldwide Image Geo-localization Using Hierarchies and Scenes Conference
IEEE Computer Vision and Pattern Recognition, 2023.
@conference{Clark2023,
title = {Where We Are and What We're Looking At: Query Based Worldwide Image Geo-localization Using Hierarchies and Scenes},
author = {Brandon Eric Clark and Alec Kerrigan and Parth Parag Kulkarni and Vicente Vivanco Cepeda and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Camera-Ready-Full-Paper.pdf
https://github.com/AHKerrigan/GeoGuessNet
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CVPR23-Poster_THU-PM-246-1.pdf
https://www.youtube.com/watch?v=fp3hZGbwPqk},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
abstract = {Determining the exact latitude and longitude that a photo was taken is a useful and widely applicable task, yet it remains exceptionally difficult despite the accelerated progress of other computer vision tasks. Most previous approaches have opted to learn single representations of query images, which are then classified at different levels of geographic granularity. These approaches fail to exploit the different visual cues that give context to different hierarchies, such as the country, state, and city level. To this end, we introduce an end-to-end transformer-based architecture that exploits the relationship between different geographic levels (which we refer to as hierarchies) and the corresponding visual scene information in an image through hierarchical cross-attention. We achieve this by learning a query for each geographic hierarchy and scene type. Furthermore, we learn a separate representation for different environmental scenes, as different scenes in the same location are often defined by completely different visual features. We achieve state of the art accuracy on 4 standard geo-localization datasets : Im2GPS, Im2GPS3k, YFCC4k, and YFCC26k, as well as qualitatively demonstrate how our method learns different representations for different visual hierarchies and scenes, which has not been demonstrated in the previous methods. Above previous testing datasets mostly consist of iconic landmarks or images taken from social media, which makes the dataset a simple memory task, or makes it biased towards certain places. To address this issue we introduce a much harder testing dataset, Google-World-Streets-15k, comprised of images taken from Google Streetview covering the whole planet and present state of the art results. Our code can be found at https://github.com/AHKerrigan/GeoGuessNet.
},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Rana, Aayush; Rawat, Yogesh
Hybrid Active Learning via Deep Clustering for Video Action Detection Conference
IEEE Computer Vision and Pattern Recognition, 2023.
BibTeX | Links:
@conference{Rana2023,
title = {Hybrid Active Learning via Deep Clustering for Video Action Detection},
author = {Aayush Rana and Yogesh Rawat},
url = {https://www.crcv.ucf.edu/research/projects/hybrid-active-learning-via-deep-clustering-for-video-action-detection/},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Chantry, Madeline; Biyani, Naman; Kamtam, Prudvi; Vyas, Shruti; Palangi, Hamid; Vineet, Vibhav; Rawat, Yogesh
A Large-scale Robustness Analysis of Video Action Recognition Models Conference
IEEE Computer Vision and Pattern Recognition, 2023.
BibTeX | Links:
@conference{Chantry2023,
title = {A Large-scale Robustness Analysis of Video Action Recognition Models},
author = {Madeline Chantry and Naman Biyani and Prudvi Kamtam and Shruti Vyas and Hamid Palangi and Vibhav Vineet and Yogesh Rawat},
url = {https://sites.google.com/view/videorobustnessbenchmark/home
https://www.crcv.ucf.edu/research/projects/ucf101-ds-action-recognition-for-real-world-distribution-shifts/
https://github.com/Maddy12/ActionRecognitionRobustnessEval
https://youtu.be/pv2AJ_t-v90
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/PosterCVPR2023.png},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Zhu, Sijie; Lin, Zhe; Cohen, Scott; Kuen, Jason; Zhang, Zhifei; Chen, Chen
TopNet: Transformer-based Object Placement Network for Image Compositing Conference
IEEE Computer Vision and Pattern Recognition, 2023.
BibTeX | Links:
@conference{Zhu2023b,
title = {{TopNet}: Transformer-based Object Placement Network for Image Compositing},
author = {Sijie Zhu and Zhe Lin and Scott Cohen and Jason Kuen and Zhifei Zhang and Chen Chen},
url = {https://arxiv.org/pdf/2304.03372.pdf},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Ce; Mendieta, Matias; Yang, Taojiannan; Qi, Guo-Jun; Chen, Chen
FeatER: An Efficient Network for Human Reconstruction via Feature Map-Based TransformER Conference
IEEE Computer Vision and Pattern Recognition, 2023.
BibTeX | Links:
@conference{Zheng2023,
title = {{FeatER}: An Efficient Network for Human Reconstruction via Feature Map-Based {TransformER}},
author = {Ce Zheng and Matias Mendieta and Taojiannan Yang and Guo-Jun Qi and Chen Chen},
url = {https://arxiv.org/pdf/2205.15448.pdf
https://zczcwh.github.io/feater_page/
https://github.com/zczcwh/FeatER},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Ce; Liu, Xianpeng; Qi, Guo-Jun; Chen, Chen
POTTER: Pooling Attention Transformer for Efficient Human Mesh Recovery Conference
IEEE Computer Vision and Pattern Recognition, 2023.
BibTeX | Links:
@conference{Zheng2023b,
title = {{POTTER}: Pooling Attention Transformer for Efficient Human Mesh Recovery},
author = {Ce Zheng and Xianpeng Liu and Guo-Jun Qi and Chen Chen},
url = {https://arxiv.org/pdf/2303.13357.pdf
https://zczcwh.github.io/potter_page/
https://github.com/zczcwh/POTTER},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Zhao, Qitao; Zheng, Ce; Liu, Mengyuan; Wang, Pichao; Chen, Chen
PoseFormerV2: Exploring Frequency Domain for Efficient and Robust 3D Human Pose Estimation Conference
IEEE Computer Vision and Pattern Recognition, 2023.
BibTeX | Links:
@conference{Zhao2023,
title = {{PoseFormerV2}: Exploring Frequency Domain for Efficient and Robust {3D} Human Pose Estimation},
author = {Qitao Zhao and Ce Zheng and Mengyuan Liu and Pichao Wang and Chen Chen},
url = {https://arxiv.org/pdf/2303.17472.pdf
https://qitaozhao.github.io/PoseFormerV2
https://github.com/QitaoZhao/PoseFormerV2},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE Computer Vision and Pattern Recognition},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Cui, Xuanming; Aparcedo, Alejandro; Jang, Young Kyun; Lim, Ser-Nam
On the Robustness of Large Multimodal Models Against Image Adversarial Attacks Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference, 2023.
@conference{Cui2023,
title = {On the Robustness of Large Multimodal Models Against Image Adversarial Attacks},
author = {Xuanming Cui and Alejandro Aparcedo and Young Kyun Jang and Ser-Nam Lim},
url = {https://arxiv.org/pdf/2312.03777
https://arxiv.org/abs/2312.03777},
doi = {10.48550/arXiv.2312.03777},
year = {2023},
date = {2023-06-18},
urldate = {2023-06-18},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference},
abstract = {Recent advances in instruction tuning have led to the development of State-of-the-Art Large Multimodal Models (LMMs). Given the novelty of these models, the impact of visual adversarial attacks on LMMs has not been thoroughly examined. We conduct a comprehensive study of the robustness of various LMMs against different adversarial attacks, evaluated across tasks including image classification, image captioning, and Visual Question Answer (VQA). We find that in general LMMs are not robust to visual adversarial inputs. However, our findings suggest that context provided to the model via prompts, such as questions in a QA pair helps to mitigate the effects of visual adversarial inputs. Notably, the LMMs evaluated demonstrated remarkable resilience to such attacks on the ScienceQA task with only an 8.10% drop in performance compared to their visual counterparts which dropped 99.73%. We also propose a new approach to real-world image classification which we term query decomposition. By incorporating existence queries into our input prompt we observe diminished attack effectiveness and improvements in image classification accuracy. This research highlights a previously under-explored facet of LMM robustness and sets the stage for future work aimed at strengthening the resilience of multimodal systems in adversarial environments.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}