@inproceedings{dfdc2e3d03ab4a1abc14c6513c0c8489,
title = "Transformer-based spatial-temporal feature lifting for 3D hand mesh reconstruction",
abstract = "This paper presents a novel model for reconstructing hand meshes in video sequences. The model extends the MobRecon [1] pipeline and incorporates a variant of the Transformer architecture that effectively models both spatial and temporal relationships using distinct positional encodings. The Transformer encoder enhances the feature representation by modeling joint relationships and learning hidden depth information. Leveraging temporal information from consecutive frames, the Transformer decoder further enhances the feature representation for the mesh decoder's final prediction. Additionally, we incorporate techniques such as Twice-LN, confidence-based attention, scaling in place of Softmax, and learnable encodings to improve the feature representation. Experimental results demonstrate the superiority of the proposed method over existing approaches.",
keywords = "attention mechanism, deep learning, hand mesh, hand pose, machine learning, transformer",
author = "Lin, {Meng Xue} and Tsai, {Wen Jiin}",
note = "Publisher Copyright: {\textcopyright} 2023 IEEE. 2023 IEEE International Conference on Visual Communications and Image Processing, VCIP 2023; Conference date: 04-12-2023 through 07-12-2023",
year = "2023",
doi = "10.1109/VCIP59821.2023.10402608",
language = "English",
series = "2023 IEEE International Conference on Visual Communications and Image Processing, VCIP 2023",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
booktitle = "2023 IEEE International Conference on Visual Communications and Image Processing, VCIP 2023",
address = "United States",
}