% TrajPrompt (ECCV 2024 conference paper, published in the LNCS series).
% The citation key is kept as exported so existing \cite commands keep working.
% All non-ASCII text is written with LaTeX escapes so the entry also works
% under classic 8-bit BibTeX.
% NOTE(review): `address` holds a country, not the publisher's city, and no
% LNCS `volume` is recorded -- confirm both against the publisher's record.
@inproceedings{62e3aa4c143a4e18bb944b5c83a05454,
  author    = {Tsao, {Li Wu} and Tsui, {Hao Tang} and Tuan, {Yu Rou} and Chen, {Pei Chi} and Wang, {Kuan Lin} and Wu, {Jhih Ciang} and Shuai, {Hong Han} and Cheng, {Wen Huang}},
  title     = {{TrajPrompt}: Aligning Color Trajectory with Vision-Language Representations},
  booktitle = {Computer Vision -- {ECCV} 2024 - 18th European Conference, Proceedings},
  editor    = {Leonardis, Ale{\v s} and Ricci, Elisa and Roth, Stefan and Russakovsky, Olga and Sattler, Torsten and Varol, G{\"u}l},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2025},
  pages     = {275--292},
  doi       = {10.1007/978-3-031-72940-9_16},
  isbn      = {9783031729393},
  language  = {English},
  keywords  = {Bird{\textquoteright}s-Eye-View Scene, Cross-Modal Learning, Efficient Prompt Tuning, Trajectory Prediction, Vision-Language Understanding},
  note      = {Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2025.; 18th European Conference on Computer Vision, ECCV 2024 ; Conference date: 29-09-2024 Through 04-10-2024},
  abstract  = {Cross-modal learning shows promising potential to overcome the limitations of single-modality tasks. However, without proper design for representation alignment between different data sources, the external modality cannot fully exhibit its value. For example, recent trajectory prediction approaches incorporate the Bird{\textquoteright}s-Eye-View (BEV) scene as an additional source but do not significantly improve performance compared to single-source strategies, indicating that the BEV scene and trajectory representations are not effectively combined. To overcome this problem, we propose TrajPrompt, a prompt-based approach that seamlessly incorporates trajectory representation into the vision-language framework, i.e. CLIP, for the BEV scene understanding and future forecasting. We discover that CLIP can attend to the local area of the BEV scene by utilizing our innovative design of text prompts and colored lines. Comprehensive results demonstrate TrajPrompt{\textquoteright}s effectiveness via outperforming the state-of-the-art trajectory predictors by a significant margin (over 35\% improvement for ADE and FDE metrics on SDD and DroneCrowd dataset), using fewer learnable parameters than the previous trajectory modeling approaches with scene information included. Project page: https://trajprompt.github.io/.},
}