ICCV is the premier international computer vision event comprising the main conference and several co-located workshops and tutorials.
Members of UCF’s Center for Research in Computer Vision (CRCV) and their collaborators had a record 18 papers accepted to the International Conference on Computer Vision (ICCV 2023), which will take place in Paris, France, from October 2-6, 2023.
The h5-index is the h-index for articles published in the last 5 complete years. According to Google Scholar Metrics (https://scholar.google.com/citations?view_op=top_venues&hl=en), the IEEE/CVF International Conference on Computer Vision (ICCV) is ranked 17th among all publication venues by h5-index.
You can access the CRCV Publications Page for enhanced search capabilities.
Li, Ming; Wu, Jie; Wang, Xionghui; Chen, Chen; Qin, Jie; Xiao, Xuefeng; Wang, Rui; Zheng, Min; Pan, Xin
AlignDet: Aligning Pre-training and Fine-tuning in Object Detection Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Li2023,
title = {AlignDet: Aligning Pre-training and Fine-tuning in Object Detection},
author = {Ming Li and Jie Wu and Xionghui Wang and Chen Chen and Jie Qin and Xuefeng Xiao and Rui Wang and Min Zheng and Xin Pan},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2307.11077.pdf
https://arxiv.org/abs/2307.11077
https://github.com/liming-ai/AlignDet
https://openreview.net/forum?id=8PA2nX9v_r2
https://liming-ai.github.io/AlignDet/},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Peng, Qucheng; Zheng, Ce; Chen, Chen
Source-free Domain Adaptive Human Pose Estimation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Peng2023,
title = {Source-free Domain Adaptive Human Pose Estimation},
author = {Qucheng Peng and Ce Zheng and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.03202.pdf
https://arxiv.org/abs/2308.03202
https://github.com/davidpengucf/SFDAHPE},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Human Pose Estimation (HPE) is widely used in various fields, including motion analysis, healthcare, and virtual reality. However, the great expenses of labeled real-world datasets present a significant challenge for HPE. To overcome this, one approach is to train HPE models on synthetic datasets and then perform domain adaptation (DA) on real-world data. Unfortunately, existing DA methods for HPE neglect data privacy and security by using both source and target data in the adaptation process. To this end, we propose a new task, named source-free domain adaptive HPE, which aims to address the challenges of cross-domain learning of HPE without access to source data during the adaptation process. We further propose a novel framework that consists of three models: source model, intermediate model, and target model, which explores the task from both source-protect and target-relevant perspectives. The source-protect module preserves source information more effectively while resisting noise, and the target-relevant module reduces the sparsity of spatial representations by building a novel spatial probability space, and pose-specific contrastive learning and information maximization are proposed on the basis of this space. Comprehensive experiments on several domain adaptive HPE benchmarks show that the proposed method outperforms existing approaches by a considerable margin.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
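The abstract above mentions pose-specific contrastive learning and information maximization over a spatial probability space. As a rough illustration only (not the authors' exact formulation), an information-maximization term typically makes each predicted spatial distribution confident while keeping the batch-averaged distribution diverse; a minimal PyTorch sketch under that assumption:

import torch
import torch.nn.functional as F

def information_maximization_loss(logits, eps=1e-8):
    # logits: (batch, joints, H*W) unnormalized scores over spatial locations.
    # Encourage low per-sample entropy (confident maps) and high marginal
    # entropy over the batch (diverse predictions); both terms are generic,
    # not the paper's specific loss.
    probs = F.softmax(logits, dim=-1)
    per_sample_entropy = -(probs * (probs + eps).log()).sum(-1).mean()
    marginal = probs.mean(dim=0)
    marginal_entropy = -(marginal * (marginal + eps).log()).sum(-1).mean()
    return per_sample_entropy - marginal_entropy

# toy usage on random heatmap logits for 17 joints on a 64x64 grid
logits = torch.randn(4, 17, 64 * 64, requires_grad=True)
information_maximization_loss(logits).backward()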
Sun, Guangyu; Mendieta, Matias; Chen, Chen
FedPerfix: Towards Partial Model Personalization of Vision Transformers in Federated Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Sun2023,
title = {FedPerfix: Towards Partial Model Personalization of Vision Transformers in Federated Learning},
author = {Guangyu Sun and Matias Mendieta and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.09160.pdf
https://arxiv.org/abs/2308.09160
https://github.com/imguangyu/FedPerfix},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {We propose and analyze a general framework of federated learning with partial model personalization. Compared with full model personalization, partial model personalization relies on domain knowledge to select a small portion of the model to personalize, thus imposing a much smaller on-device memory footprint. We propose two federated optimization algorithms for training partially personalized models, where the shared and personal parameters are updated either simultaneously or alternately on each device, but only the shared parameters are communicated and aggregated at the server. We give convergence analyses of both algorithms for minimizing smooth nonconvex functions, providing theoretical support of them for training deep learning models. Our experiments on real-world image and text datasets demonstrate that (a) partial model personalization can obtain most of the benefit of full model personalization with a small fraction of personalized parameters, and, (b) the alternating update algorithm often outperforms the simultaneous update algorithm.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
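The abstract above centers on partial model personalization: only the shared part of the model is communicated and averaged at the server, while the personalized parameters stay on each device. A minimal sketch of that aggregation pattern, assuming an illustrative name-prefix split (the prefixes below are placeholders, not FedPerfix's actual partition of a Vision Transformer):

from collections import OrderedDict
import torch

def split_state(state_dict, personal_prefixes=("head.",)):
    # Split a model state_dict into shared and personal parts by name prefix.
    shared, personal = OrderedDict(), OrderedDict()
    for name, tensor in state_dict.items():
        (personal if name.startswith(personal_prefixes) else shared)[name] = tensor
    return shared, personal

def server_average(shared_states):
    # FedAvg over the shared parameters only; personal parts never leave the client.
    avg = OrderedDict()
    for name in shared_states[0]:
        avg[name] = torch.stack([s[name].float() for s in shared_states]).mean(0)
    return avg

# toy usage with two tiny "clients"
clients = [torch.nn.Sequential(OrderedDict(body=torch.nn.Linear(8, 8), head=torch.nn.Linear(8, 2)))
           for _ in range(2)]
shared_parts = []
for model in clients:
    shared, _personal = split_state(model.state_dict())
    shared_parts.append(shared)
new_shared = server_average(shared_parts)
for model in clients:
    # each client loads the averaged shared weights and keeps its own head
    model.load_state_dict(new_shared, strict=False)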
Luo, Jun; Mendieta, Matias; Chen, Chen
PGFed: Personalize Each Client's Global Objective for Federated Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Luo2023,
title = {PGFed: Personalize Each Client's Global Objective for Federated Learning},
author = {Jun Luo and Matias Mendieta and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2212.01448.pdf
https://github.com/ljaiverson/pgfed},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {The mediocre performance of conventional federated learning (FL) over heterogeneous data has been facilitating personalized FL solutions, where, unlike conventional FL which trains a single global consensus model, different models are allowed for different clients. However, in most existing personalized FL algorithms, the collaborative knowledge across the federation was only implicitly passed to the clients in ways such as model aggregation or regularization. We observed that this implicit knowledge transfer fails to maximize the potential value of each client's empirical risk toward other clients. Based on our observation, in this work, we propose Personalized Global Federated Learning (PGFed), a novel personalized FL framework that enables each client to personalize its own global objective by explicitly and adaptively aggregating the empirical risks of itself and other clients. To avoid massive (O(N2)) communication overhead and potential privacy leakage, each client's risk is estimated through a first-order approximation for other clients' adaptive risk aggregation. On top of PGFed, we develop a momentum upgrade, dubbed PGFedMo, to more efficiently utilize clients' empirical risks. Our extensive experiments under different federated settings with benchmark datasets show consistent improvements of PGFed over the compared state-of-the-art alternatives.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
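The abstract above lets each client personalize its global objective by adaptively aggregating other clients' empirical risks through a first-order approximation. A toy sketch of that linearization idea (the names, weighting, and update are illustrative, not the authors' exact rule):

import torch

def personalized_objective(model_params, own_loss, risk_snapshots, alpha=0.1):
    # own_loss: scalar tensor from the client's own data.
    # risk_snapshots: list of (loss_value, grad_vector, params_vector) tuples
    # received via the server, one per other client, used to linearize their
    # risks around this client's current parameters:
    #   R_k(w) ~ R_k(w_k) + grad R_k(w_k) . (w - w_k)
    flat = torch.cat([p.reshape(-1) for p in model_params])
    aux = 0.0
    for loss_k, grad_k, params_k in risk_snapshots:
        aux = aux + loss_k + grad_k @ (flat - params_k)
    return own_loss + alpha * aux

# toy usage: a 3-parameter model and one snapshot from another client
w = torch.nn.Parameter(torch.zeros(3))
own = (w ** 2).sum()
snapshot = (torch.tensor(0.5), torch.ones(3), torch.zeros(3))
personalized_objective([w], own, [snapshot]).backward()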
Deng, Andong; Yang, Taojiannan; Chen, Chen
A Large-scale Study of Spatiotemporal Representation Learning with a New Benchmark on Action Recognition Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Deng2023,
title = {A Large-scale Study of Spatiotemporal Representation Learning with a New Benchmark on Action Recognition},
author = {Andong Deng and Taojiannan Yang and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.13505.pdf
https://arxiv.org/abs/2303.13505
https://github.com/AndongDeng/BEAR},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {The goal of building a benchmark (suite of datasets) is to provide a unified protocol for fair evaluation and thus facilitate the evolution of a specific area. Nonetheless, we point out that existing protocols of action recognition could yield partial evaluations due to several limitations. To comprehensively probe the effectiveness of spatiotemporal representation learning, we introduce BEAR, a new BEnchmark on video Action Recognition. BEAR is a collection of 18 video datasets grouped into 5 categories (anomaly, gesture, daily, sports, and instructional), which covers a diverse set of real-world applications. With BEAR, we thoroughly evaluate 6 common spatiotemporal models pre-trained by both supervised and self-supervised learning. We also report transfer performance via standard finetuning, few-shot finetuning, and unsupervised domain adaptation. Our observation suggests that current state-of-the-art cannot solidly guarantee high performance on datasets close to real-world applications, and we hope BEAR can serve as a fair and challenging evaluation benchmark to gain insights on building next-generation spatiotemporal learners. Our dataset, code, and models are released at: https://github.com/AndongDeng/BEAR},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Mendieta, Matias; Chen, Chen
Towards Geospatial Foundation Models via Continual Pretraining Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Mendieta2023,
title = {Towards Geospatial Foundation Models via Continual Pretraining},
author = {Matias Mendieta and Chen Chen},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2302.04476.pdf
https://arxiv.org/abs/2302.04476
https://github.com/mmendiet/GFM},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Geospatial technologies are becoming increasingly essential in our world for a wide range of applications, including agriculture, urban planning, and disaster response. To help improve the applicability and performance of deep learning models on these geospatial tasks, various works have begun investigating foundation models for this domain. Researchers have explored two prominent approaches for introducing such models in geospatial applications, but both have drawbacks in terms of limited performance benefit or prohibitive training cost. Therefore, in this work, we propose a novel paradigm for building highly effective geospatial foundation models with minimal resource cost and carbon impact. We first construct a compact yet diverse dataset from multiple sources to promote feature diversity, which we term GeoPile. Then, we investigate the potential of continual pretraining from large-scale ImageNet-22k models and propose a multi-objective continual pretraining paradigm, which leverages the strong representations of ImageNet while simultaneously providing the freedom to learn valuable in-domain features. Our approach outperforms previous state-of-the-art geospatial pretraining methods in an extensive evaluation on seven downstream datasets covering various tasks such as change detection, classification, multi-label classification, semantic segmentation, and super-resolution.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
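The abstract above proposes multi-objective continual pretraining that keeps the strengths of ImageNet-22k representations while learning in-domain geospatial features. A hedged sketch of one plausible way to combine such objectives, assuming a feature-distillation term against a frozen ImageNet-pretrained teacher plus any in-domain self-supervised loss (the paper's exact objectives may differ):

import torch
import torch.nn.functional as F

def continual_pretrain_loss(student_feats, teacher_feats, ssl_loss, distill_weight=1.0):
    # student_feats / teacher_feats: (batch, dim) pooled features; the teacher
    # is a frozen ImageNet-pretrained encoder. ssl_loss: any in-domain
    # self-supervised loss (e.g. a reconstruction loss) computed elsewhere.
    distill = 1.0 - F.cosine_similarity(student_feats, teacher_feats.detach(), dim=-1).mean()
    return ssl_loss + distill_weight * distill

# toy usage with random features and a placeholder self-supervised loss
loss = continual_pretrain_loss(torch.randn(4, 768), torch.randn(4, 768), torch.tensor(0.7))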
Chen, Hao; Qu, Chenyuan; Zhang, Yu; Chen, Chen; Jiao, Jianbo
Multi-view Self-supervised Disentanglement for General Image Denoising Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Chen2023a,
title = {Multi-view Self-supervised Disentanglement for General Image Denoising},
author = {Hao Chen and Chenyuan Qu and Yu Zhang and Chen Chen and Jianbo Jiao},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICCV2023_MeD_Final_Version.pdf
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICCV2023_MeD_Supplymentary_Final_Version.pdf
https://chqwer2.github.io/MeD/
https://github.com/chqwer2/Multi-view-Self-supervised-Disentanglement-Denoising},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {With its significant performance improvements, the deep learning paradigm has become a standard tool for modern image denoisers. While promising performance has been shown on seen noise distributions, existing approaches often suffer from generalisation to unseen noise types or general and real noise. It is understandable as the model is designed to learn paired mapping (e.g. from a noisy image to its clean version). In this paper, we instead propose to learn to disentangle the noisy image, under the intuitive assumption that different corrupted versions of the same clean image share a common latent space. A self-supervised learning framework is proposed to achieve the goal, without looking at the latent clean image. By taking two different corrupted versions of the same image as input, the proposed Multi-view Self-supervised Disentanglement (MeD) approach learns to disentangle the latent clean features from the corruptions and recover the clean image consequently. Extensive experimental analysis on both synthetic and real noise shows the superiority of the proposed method over prior self-supervised approaches, especially on unseen novel noise types. On real noise, the proposed method even outperforms its supervised counterparts by over 3 dB.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
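The abstract above builds on the assumption that two corrupted versions of one image share a clean latent while their corruptions differ. A minimal, illustrative sketch of that cross-view disentanglement (the tiny architecture and loss terms here are placeholders, not the MeD design):

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyDisentangler(nn.Module):
    # Each view is split into a "clean" code and a "noise" code; decoding
    # clean(view1) + noise(view2) should reconstruct view2, and vice versa,
    # while the two clean codes are pulled together.
    def __init__(self, dim=64):
        super().__init__()
        self.enc_clean = nn.Sequential(nn.Conv2d(3, dim, 3, padding=1), nn.ReLU(),
                                       nn.Conv2d(dim, dim, 3, padding=1))
        self.enc_noise = nn.Sequential(nn.Conv2d(3, dim, 3, padding=1), nn.ReLU(),
                                       nn.Conv2d(dim, dim, 3, padding=1))
        self.dec = nn.Sequential(nn.Conv2d(2 * dim, dim, 3, padding=1), nn.ReLU(),
                                 nn.Conv2d(dim, 3, 3, padding=1))

    def forward(self, y1, y2):
        c1, c2 = self.enc_clean(y1), self.enc_clean(y2)
        n1, n2 = self.enc_noise(y1), self.enc_noise(y2)
        rec2 = self.dec(torch.cat([c1, n2], dim=1))  # clean code of view 1 + noise code of view 2
        rec1 = self.dec(torch.cat([c2, n1], dim=1))
        return F.l1_loss(rec1, y1) + F.l1_loss(rec2, y2) + F.l1_loss(c1, c2)

# toy usage: two corruptions of the same (random) image
x = torch.rand(2, 3, 32, 32)
y1, y2 = x + 0.1 * torch.randn_like(x), x + 0.1 * torch.randn_like(x)
loss = TinyDisentangler()(y1, y2)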
Li, Lijun; Tian, Linrui; Zhang, Xindi; Wang, Qi; Zhang, Bang; Bo, Liefeng; Liu, Mengyuan; Chen, Chen
RenderIH: A large-scale synthetic dataset for 3D interacting hand pose estimation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Li2023b,
title = {RenderIH: A large-scale synthetic dataset for 3D interacting hand pose estimation},
author = {Lijun Li and Linrui Tian and Xindi Zhang and Qi Wang and Bang Zhang and Liefeng Bo and Mengyuan Liu and Chen Chen},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Vahidian, Saeed; Kadaveru, Sreevatsank; Baek, Woonjoon; Wang, Weijia; Kungurtsev, Vyacheslav; Chen, Chen; Shah, Mubarak; Lin, Bill
When Do Curricula Work in Federated Learning? Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Vahidian2023b,
title = {When Do Curricula Work in Federated Learning? },
author = {Saeed Vahidian and Sreevatsank Kadaveru and Woonjoon Baek and Weijia Wang and Vyacheslav Kungurtsev and Chen Chen and Mubarak Shah and Bill Lin},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2212.12712.pdf
https://arxiv.org/abs/2212.12712},
doi = {https://doi.org/10.48550/arXiv.2212.12712},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {An oft-cited open problem of federated learning is the existence of data heterogeneity at the clients. One pathway to understanding the drastic accuracy drop in federated learning is by scrutinizing the behavior of the clients' deep models on data with different levels of "difficulty", which has been left unaddressed. In this paper, we investigate a different and rarely studied dimension of FL: ordered learning. Specifically, we aim to investigate how ordered learning principles can contribute to alleviating the heterogeneity effects in FL. We present theoretical analysis and conduct extensive empirical studies on the efficacy of orderings spanning three kinds of learning: curriculum, anti-curriculum, and random curriculum. We find that curriculum learning largely alleviates non-IIDness. Interestingly, the more disparate the data distributions across clients the more they benefit from ordered learning. We provide analysis explaining this phenomenon, specifically indicating how curriculum training appears to make the objective landscape progressively less convex, suggesting fast converging iterations at the beginning of the training procedure. We derive quantitative results of convergence for both convex and nonconvex objectives by modeling the curriculum training on federated devices as local SGD with locally biased stochastic gradients. Also, inspired by ordered learning, we propose a novel client selection technique that benefits from the real-world disparity in the clients. Our proposed approach to client selection has a synergic effect when applied together with ordered learning in FL.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
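The abstract above studies curriculum, anti-curriculum, and random orderings of each client's local data. A minimal sketch of one common scoring rule, keeping the easiest fraction of local samples as judged by the current model's loss (the scoring and pacing choices are illustrative):

import torch
import torch.nn.functional as F

def curriculum_indices(model, inputs, targets, fraction, loss_fn=F.cross_entropy):
    # Score each local sample with the current model's loss and return the
    # indices of the easiest `fraction` of the data (curriculum ordering).
    # Reverse the sort for anti-curriculum; shuffle for a random curriculum.
    with torch.no_grad():
        losses = loss_fn(model(inputs), targets, reduction="none")
    order = torch.argsort(losses)          # easiest (lowest loss) first
    keep = max(1, int(fraction * len(order)))
    return order[:keep]

# toy usage: keep the easiest 50% of a client's batch at this round
model = torch.nn.Linear(10, 3)
x, y = torch.randn(32, 10), torch.randint(0, 3, (32,))
idx = curriculum_indices(model, x, y, fraction=0.5)
local_x, local_y = x[idx], y[idx]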
Samarasinghe, Sarinda; Rizve, Mamshad Nayeem; Kardan, Navid; Shah, Mubarak
CDFSL-V: Cross-Domain Few-Shot Learning for Videos Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Samarasinghe2023,
title = {CDFSL-V: Cross-Domain Few-Shot Learning for Videos},
author = {Sarinda Samarasinghe and Mamshad Nayeem Rizve and Navid Kardan and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/CDFSL_Video_Combined_Final.pdf
https://sarinda251.github.io/CDFSL-V-site/
https://www.youtube.com/watch?v=RdlEzfW013o},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Few-shot video action recognition is an effective approach to recognizing new categories with only a few labeled examples, thereby reducing the challenges associated with collecting and annotating large-scale video datasets. Existing methods in video action recognition rely on large labeled datasets from the same domain. However, this setup is not realistic as novel categories may come from different data domains that may have different spatial and temporal characteristics. This dissimilarity between the source and target domains can pose a significant challenge, rendering traditional few-shot action recognition techniques ineffective. To address this issue, in this work, we propose a novel cross-domain few-shot video action recognition method that leverages self-supervised learning and curriculum learning to balance the information from the source and target domains. To be particular, our method employs a masked autoencoder-based self-supervised training objective to learn from both source and target data in a self-supervised manner. Then a progressive curriculum balances learning the discriminative information from the source dataset with the generic information learned from the target domain. Initially, our curriculum utilizes supervised learning to learn class discriminative features from the source data. As the training progresses, we transition to learning target-domain-specific features. We propose a progressive curriculum to encourage the emergence of rich features in the target domain based on class discriminative supervised features in the source domain. We evaluate our method on several challenging benchmark datasets and demonstrate that our approach outperforms existing cross-domain few-shot learning techniques.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
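The abstract above balances supervised learning on source data against self-supervised learning on target data with a progressive curriculum. A tiny sketch of such a schedule, assuming a simple linear ramp (the paper's actual pacing function may differ):

def curriculum_weight(step: int, total_steps: int) -> float:
    # Linearly shift emphasis from source-supervised learning to target
    # self-supervised learning over training (illustrative schedule).
    return min(1.0, step / max(1, total_steps))

def combined_loss(source_sup_loss, target_ssl_loss, step, total_steps):
    w = curriculum_weight(step, total_steps)
    return (1.0 - w) * source_sup_loss + w * target_ssl_loss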
Sirnam, Swetha; Rizve, Mamshad Nayeem; Kuhne, Hilde; Shah, Mubarak
Preserving Modality Structure Improves Multi-Modal Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Sirnam2023,
title = {Preserving Modality Structure Improves Multi-Modal Learning },
author = {Swetha Sirnam and Mamshad Nayeem Rizve and Hilde Kuhne and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.13077.pdf
https://arxiv.org/abs/2308.13077
https://github.com/Swetha5/Multi_Sinkhorn_Knopp
https://swetha5.github.io/MultiSK/
https://youtu.be/1CrGkUATy50
},
doi = {https://doi.org/10.48550/arXiv.2308.13077},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Self-supervised learning on large-scale multi-modal datasets allows learning semantically meaningful embeddings in a joint multi-modal representation space without relying on human annotations. These joint embeddings enable zero-shot cross-modal tasks like retrieval and classification. However, these methods often struggle to generalize well on out-of-domain data as they ignore the semantic structure present in modality-specific embeddings. In this context, we propose a novel Semantic-Structure-Preserving Consistency approach to improve generalizability by preserving the modality-specific relationships in the joint embedding space. To capture modality-specific semantic relationships between samples, we propose to learn multiple anchors and represent the multifaceted relationship between samples with respect to their relationship with these anchors. To assign multiple anchors to each sample, we propose a novel Multi-Assignment Sinkhorn-Knopp algorithm. Our experimentation demonstrates that our proposed approach learns semantically meaningful anchors in a self-supervised manner. Furthermore, our evaluation on MSR-VTT and YouCook2 datasets demonstrates that our proposed multi-anchor assignment based solution achieves state-of-the-art performance and generalizes to both in- and out-of-domain datasets. Code: https://github.com/Swetha5/Multi_Sinkhorn_Knopp},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
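The abstract above assigns multiple anchors per sample with a Multi-Assignment Sinkhorn-Knopp algorithm. As background only, here is a compact sketch of the standard entropy-regularized Sinkhorn-Knopp normalization such methods build on (the paper's multi-assignment constraints are not reproduced here):

import torch

def sinkhorn(scores: torch.Tensor, eps: float = 0.05, iters: int = 3) -> torch.Tensor:
    # Standard Sinkhorn-Knopp: turn a (samples x anchors) score matrix into a
    # doubly-normalized soft assignment (each row sums to 1, columns stay balanced).
    q = torch.exp(scores / eps)
    q = q / q.sum()
    n, k = q.shape
    for _ in range(iters):
        q = q / q.sum(dim=0, keepdim=True) / k   # normalize anchor (column) marginals
        q = q / q.sum(dim=1, keepdim=True) / n   # normalize sample (row) marginals
    return q * n                                  # each row now sums to 1

# toy usage: soft assignments of 8 samples to 4 anchors
assignments = sinkhorn(torch.randn(8, 4))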
Liu, Daochang; Li, Qiyue; Dinh, Anh-Dung; Jiang, Tingting; Shah, Mubarak; Xu, Chang
Diffusion Action Segmentation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Liu2023b,
title = {Diffusion Action Segmentation},
author = {Daochang Liu and Qiyue Li and Anh-Dung Dinh and Tingting Jiang and Mubarak Shah and Chang Xu},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.17959.pdf
https://arxiv.org/abs/2303.17959
https://finspire13.github.io/DiffAct-Project-Page/
https://github.com/Finspire13/DiffAct
https://youtu.be/o_Jp8shth7U
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/Slides.pptx},
doi = { https://doi.org/10.48550/arXiv.2303.17959},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Temporal action segmentation is crucial for understanding long-form videos. Previous works on this task commonly adopt an iterative refinement paradigm by using multi-stage models. We propose a novel framework via denoising diffusion models, which nonetheless shares the same inherent spirit of such iterative refinement. In this framework, action predictions are iteratively generated from random noise with input video features as conditions. To enhance the modeling of three striking characteristics of human actions, including the position prior, the boundary ambiguity, and the relational dependency, we devise a unified masking strategy for the conditioning inputs in our framework. Extensive experiments on three benchmark datasets, i.e., GTEA, 50Salads, and Breakfast, are performed and the proposed method achieves superior or comparable results to state-of-the-art methods, showing the effectiveness of a generative approach for action segmentation.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
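The abstract above generates action predictions iteratively from random noise conditioned on video features. A toy sketch of a diffusion-style training step on per-frame action labels (the noise schedule and denoiser below are illustrative stand-ins, not the paper's architecture):

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyDenoiser(nn.Module):
    # Predicts clean per-frame action logits from noisy labels plus video features.
    def __init__(self, num_classes, feat_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Conv1d(num_classes + feat_dim, hidden, 3, padding=1), nn.ReLU(),
                                 nn.Conv1d(hidden, num_classes, 3, padding=1))

    def forward(self, noisy_labels, feats):
        return self.net(torch.cat([noisy_labels, feats], dim=1))

def diffusion_training_step(denoiser, labels_onehot, feats, t_frac):
    # labels_onehot: (B, C, T); feats: (B, D, T); t_frac in (0, 1) sets the noise level.
    noise = torch.randn_like(labels_onehot)
    noisy = (1 - t_frac) * labels_onehot + t_frac * noise   # simple interpolation schedule
    pred = denoiser(noisy, feats)
    return F.cross_entropy(pred, labels_onehot.argmax(dim=1))

# toy usage: 2 clips, 5 action classes, 100 frames, 16-dim frame features
denoiser = TinyDenoiser(num_classes=5, feat_dim=16)
labels = F.one_hot(torch.randint(0, 5, (2, 100)), 5).permute(0, 2, 1).float()
loss = diffusion_training_step(denoiser, labels, torch.randn(2, 16, 100), t_frac=0.5)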
Fioresi, Joseph; Dave, Ishan; Shah, Mubarak
TeD-SPAD: Temporal Distinctiveness for Self-supervised Privacy-preservation for video Anomaly Detection Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Fioresi2023,
title = {TeD-SPAD: Temporal Distinctiveness for Self-supervised Privacy-preservation for video Anomaly Detection},
author = {Joseph Fioresi and Ishan Dave and Mubarak Shah},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2308.11072.pdf
https://arxiv.org/abs/2308.11072
https://github.com/UCF-CRCV/TeD-SPAD
https://joefioresi718.github.io/TeD-SPAD_webpage/
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/TeDSPAD_ICCV_poster.pdf
https://youtu.be/3a9qeJUD1GU},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Wasim, Syed Talal; Khattak, Muhammad Uzair; Naseer, Muzammal; Khan, Salman; Shah, Mubarak; Khan, Fahad Shahbaz
Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Wasim2023,
title = {Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action Recognition },
author = {Syed Talal Wasim and Muhammad Uzair Khattak and Muzammal Naseer and Salman Khan and Mubarak Shah and Fahad Shahbaz Khan },
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2307.06947.pdf
https://arxiv.org/abs/2307.06947
https://talalwasim.github.io/Video-FocalNets/
https://github.com/TalalWasim/Video-FocalNets
https://talalwasim.github.io/Video-FocalNets/#BibTeX},
doi = { https://doi.org/10.48550/arXiv.2307.06947},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Recent video recognition models utilize Transformer models for long-range spatio-temporal context modeling. Video transformer designs are based on self-attention that can model global context at a high computational cost. In comparison, convolutional designs for videos offer an efficient alternative but lack long-range dependency modeling. Towards achieving the best of both designs, this work proposes Video-FocalNet, an effective and efficient architecture for video recognition that models both local and global contexts. Video-FocalNet is based on a spatio-temporal focal modulation architecture that reverses the interaction and aggregation steps of self-attention for better efficiency. Further, the aggregation step and the interaction step are both implemented using efficient convolution and element-wise multiplication operations that are computationally less expensive than their self-attention counterparts on video representations. We extensively explore the design space of focal modulation-based spatio-temporal context modeling and demonstrate our parallel spatial and temporal encoding design to be the optimal choice. Video-FocalNets perform favorably well against the state-of-the-art transformer-based models for video recognition on three large-scale datasets (Kinetics-400, Kinetics-600, and SS-v2) at a lower computational cost. Our code/models are publicly released.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
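The abstract above describes focal modulation, which swaps self-attention's interaction and aggregation order: context is aggregated with inexpensive convolutions and then modulates the query by element-wise multiplication. A simplified, single-level, spatial-only sketch of that idea (the paper's spatio-temporal design adds a parallel temporal path):

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalModulation1Level(nn.Module):
    # Aggregate local context with a depthwise convolution, gate it, and
    # modulate the per-pixel query by element-wise product.
    def __init__(self, dim, kernel_size=7):
        super().__init__()
        self.to_qcg = nn.Conv2d(dim, 2 * dim + 1, 1)   # query, context, gate
        self.context = nn.Conv2d(dim, dim, kernel_size, padding=kernel_size // 2, groups=dim)
        self.proj = nn.Conv2d(dim, dim, 1)

    def forward(self, x):                               # x: (B, C, H, W)
        c_dim = x.shape[1]
        q, c, gate = torch.split(self.to_qcg(x), [c_dim, c_dim, 1], dim=1)
        ctx = torch.sigmoid(gate) * F.gelu(self.context(c))
        return self.proj(q * ctx)                       # element-wise modulation

# toy usage on a small feature map
out = FocalModulation1Level(32)(torch.randn(2, 32, 14, 14))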
Jain, Nishant; Behl, Harkirat; Rawat, Yogesh Singh; Vineet, Vibhav
Efficiently Robustify Pre-Trained Models Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Jain2023,
title = {Efficiently Robustify Pre-Trained Models},
author = {Nishant Jain and Harkirat Behl and Yogesh Singh Rawat and Vibhav Vineet},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/ICCV23_Robust_Learning.pdf},
year = {2023},
date = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {A recent trend in deep learning algorithms has been towards training large scale models, having high parameter count and trained on big dataset. However, robustness of such large scale models towards real-world settings is still a less-explored topic. In this work, we first benchmark the performance of these models under different perturbations and datasets thereby representing real-world shifts, and highlight their degrading performance under these shifts. We then discuss on how complete model fine-tuning based existing robustification schemes might not be a scalable option given very large scale networks and can also lead them to forget some of the desired characteristics. Finally, we propose a simple and cost-effective method to solve this problem, inspired by knowledge transfer literature. It involves robustifying smaller models, at a lower computation cost, and then use them as teachers to tune a fraction of these large scale networks, reducing the overall computational overhead. We evaluate our proposed method under various vision perturbations including ImageNet-C,R,S,A datasets and also for transfer learning, zero-shot evaluation setups on different datasets. Benchmark results show that our method is able to induce robustness to these large scale models efficiently, requiring significantly lower time and also preserves the transfer learning, zero-shot properties of the original model which none of the existing methods are able to achieve. },
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
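The abstract above robustifies a small model cheaply and then uses it as a teacher to tune only a fraction of the large network. A hedged sketch of the two ingredients, a KL distillation loss and a name-based filter for which large-model parameters stay trainable (both are illustrative, not the authors' exact recipe):

import torch
import torch.nn.functional as F

def robust_distill_loss(student_logits, teacher_logits, temperature=2.0):
    # KL distillation from an already-robustified small teacher to the large student.
    t = temperature
    return F.kl_div(F.log_softmax(student_logits / t, dim=-1),
                    F.softmax(teacher_logits / t, dim=-1),
                    reduction="batchmean") * (t * t)

def freeze_all_but(model, trainable_substrings=("head",)):
    # Tune only a small fraction of the large model (name filter is a placeholder).
    for name, p in model.named_parameters():
        p.requires_grad = any(s in name for s in trainable_substrings)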
Zhou, Yifei; Li, Zilu; Shrivastava, Abhinav; Zhao, Hengshuang; Torralba, Antonio; Tian, Taipeng; Lim, Ser-Nam
BT^2 : Backward-compatible Training with Basis Transformation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Zhou2023,
title = {BT^2 : Backward-compatible Training with Basis Transformation},
author = {Yifei Zhou and Zilu Li and Abhinav Shrivastava and Hengshuang Zhao and Antonio Torralba and Taipeng Tian and Ser-Nam Lim},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2211.03989v3.pdf
https://arxiv.org/abs/2211.03989v3},
doi = {https://doi.org/10.48550/arXiv.2211.03989},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Modern retrieval system often requires recomputing the representation of every piece of data in the gallery when updating to a better representation model. This process is known as backfilling and can be especially costly in the real world where the gallery often contains billions of samples. Recently, researchers have proposed the idea of Backward Compatible Training (BCT) where the new representation model can be trained with an auxiliary loss to make it backward compatible with the old representation. In this way, the new representation can be directly compared with the old representation, in principle avoiding the need for any backfilling. However, followup work shows that there is an inherent tradeoff where a backward compatible representation model cannot simultaneously maintain the performance of the new model itself. This paper reports our ``not-so-surprising'' finding that adding extra dimensions to the representation can help here. However, we also found that naively increasing the dimension of the representation did not work. To deal with this, we propose Backward-compatible Training with a novel Basis Transformation (BT2). A basis transformation (BT) is basically a learnable set of parameters that applies an orthonormal transformation. Such a transformation possesses an important property whereby the original information contained in its input is retained in its output. We show in this paper how a BT can be utilized to add only the necessary amount of additional dimensions. We empirically verify the advantage of BT2 over other state-of-the-art methods in a wide range of settings. We then further extend BT2 to other challenging yet more practical settings, including significant change in model architecture (CNN to Transformers), modality change, and even a series of updates in the model architecture mimicking the evolution of deep learning models.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
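The abstract above adds extra embedding dimensions through a learnable orthonormal basis transformation, which preserves the information in its input. A compact sketch of how an orthogonally parametrized linear map can expand the representation while old-gallery features remain comparable (the wiring around the old and new models is illustrative):

import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import orthogonal

old_dim, extra = 128, 32

# Learnable orthonormal map on the expanded space; orthogonality preserves
# inner products, so the original embedding's information survives it.
basis_transform = orthogonal(nn.Linear(old_dim + extra, old_dim + extra, bias=False))

def expand_embedding(old_compatible_feat, extra_feat):
    # Concatenate the backward-compatible part with the extra dimensions,
    # then apply the orthonormal basis transformation.
    z = torch.cat([old_compatible_feat, extra_feat], dim=-1)
    return basis_transform(z)

# toy usage: an old-gallery feature padded with zeros can still be compared in the new space
old_gallery = torch.randn(4, old_dim)
query_new = expand_embedding(torch.randn(4, old_dim), torch.randn(4, extra))
gallery_in_new_space = basis_transform(torch.cat([old_gallery, torch.zeros(4, extra)], dim=-1))
scores = query_new @ gallery_in_new_space.t()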
Hammoud, Hasan Abed Al Kader; Prabhu, Ameya; Lim, Ser-Nam; Torr, Philip; Bibi, Adel; Ghanem, Bernard
Towards a True Evaluation of Rapid Adaptation in Online Continual Learning Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Hammoud2023,
title = {Towards a True Evaluation of Rapid Adaptation in Online Continual Learning},
author = {Hasan Abed Al Kader Hammoud and Ameya Prabhu and Ser-Nam Lim and Philip Torr and Adel Bibi and Bernard Ghanem},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2305.09275.pdf
https://arxiv.org/abs/2305.09275
https://github.com/drimpossible/EvalOCL},
doi = {https://doi.org/10.48550/arXiv.2305.09275},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {We revisit the common practice of evaluating adaptation of Online Continual Learning (OCL) algorithms through the metric of online accuracy, which measures the accuracy of the model on the immediate next few samples. However, we show that this metric is unreliable, as even vacuous blind classifiers, which do not use input images for prediction, can achieve unrealistically high online accuracy by exploiting spurious label correlations in the data stream. Our study reveals that existing OCL algorithms can also achieve high online accuracy, but perform poorly in retaining useful information, suggesting that they unintentionally learn spurious label correlations. To address this issue, we propose a novel metric for measuring adaptation based on the accuracy on the near-future samples, where spurious correlations are removed. We benchmark existing OCL approaches using our proposed metric on large-scale datasets under various computational budgets and find that better generalization can be achieved by retaining and reusing past seen information. We believe that our proposed metric can aid in the development of truly adaptive OCL methods. We provide code to reproduce our results at https://github.com/drimpossible/EvalOCL.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
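The abstract above replaces online accuracy on the immediate next samples with accuracy on near-future samples, past the range where spurious label correlations help even a blind classifier. A minimal sketch of the two measurements side by side (the gap and window sizes are illustrative):

import torch

def online_accuracy(preds, labels):
    # Accuracy on the immediate next samples (the metric the paper critiques).
    return (preds == labels).float().mean().item()

def near_future_accuracy(stream_preds, stream_labels, current_step, gap=100, window=50):
    # Accuracy on samples `gap` steps ahead of the current training position,
    # where short-range spurious label correlations have decayed.
    start = current_step + gap
    end = min(start + window, len(stream_labels))
    if start >= end:
        return float("nan")
    return online_accuracy(stream_preds[start:end], stream_labels[start:end])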
Chen, Xi; Li, Shuang; Lim, Ser-Nam; Torralba, Antonio; Zhao, Hengshuang
Open-vocabulary Panoptic Segmentation with Embedding Modulation Conference
IEEE/CVF International Conference on Computer Vision, 2023.
@conference{Chen2023b,
title = {Open-vocabulary Panoptic Segmentation with Embedding Modulation},
author = {Xi Chen and Shuang Li and Ser-Nam Lim and Antonio Torralba and Hengshuang Zhao},
url = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2303.11324.pdf
https://arxiv.org/abs/2303.11324
https://opsnet-page.github.io/},
doi = {https://doi.org/10.48550/arXiv.2303.11324},
year = {2023},
date = {2023-10-02},
urldate = {2023-10-02},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Open-vocabulary image segmentation is attracting increasing attention due to its critical applications in the real world. Traditional closed-vocabulary segmentation methods are not able to characterize novel objects, whereas several recent open-vocabulary attempts obtain unsatisfactory results, i.e., notable performance reduction on the closed vocabulary and massive demand for extra data. To this end, we propose OPSNet, an omnipotent and data-efficient framework for Open-vocabulary Panoptic Segmentation. Specifically, the exquisitely designed Embedding Modulation module, together with several meticulous components, enables adequate embedding enhancement and information exchange between the segmentation model and the visual-linguistic well-aligned CLIP encoder, resulting in superior segmentation performance under both open- and closed-vocabulary settings with much fewer need of additional data. Extensive experimental evaluations are conducted across multiple datasets (e.g., COCO, ADE20K, Cityscapes, and PascalContext) under various circumstances, where the proposed OPSNet achieves state-of-the-art results, which demonstrates the effectiveness and generality of the proposed approach. The code and trained models will be made publicly available.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}