Leveraging temporal synchronization and association within sight and sound is an essential step towards robust localization of sounding objects. To this end, we propose a space-time memory network for sounding object localization in videos.
% arXiv preprint (CoRR) record; metadata as exported from the dblp bibliography.
@article{DBLP:journals/corr/abs-2111-05526,
  author     = {Li, Sizhe and Tian, Yapeng and Xu, Chenliang},
  title      = {Space-Time Memory Network for Sounding Object Localization in Videos},
  journal    = {CoRR},
  volume     = {abs/2111.05526},
  year       = {2021},
  eprinttype = {arXiv},
  eprint     = {2111.05526},
  url        = {https://arxiv.org/abs/2111.05526},
  timestamp  = {Tue, 16 Nov 2021 12:12:31 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2111-05526.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}