% Conference paper in the ISCSLP 2022 proceedings (pp. 270--274).
% The citation key is kept unchanged so existing citations keep resolving.
% The former `series` field repeated `booktitle` verbatim and was dropped to
% avoid the proceedings name being printed twice.
% NOTE(review): `address` holds a country, not a publisher city -- confirm and replace.
% NOTE(review): `note` mixes copyright and conference-date text (dates read as
% DD-MM-YYYY); consider dedicated fields if the target style supports them.
@inproceedings{001dab9a09e14cfba4c3cdebfa98b5bd,
  author    = {Liao, {Yuan Fu} and Huang, {Yu Hsuan} and Pleva, Matus and Hladek, Daniel and Su, {Ming Hsiang}},
  title     = {A Preliminary Study on {Taiwanese} {OCR} for Assisting Textual Database Construction from Historical Documents},
  booktitle = {2022 13th International Symposium on {Chinese} Spoken Language Processing, {ISCSLP} 2022},
  editor    = {Lee, {Kong Aik} and Lee, Hung-yi and Lu, Yanfeng and Dong, Minghui},
  pages     = {270--274},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2022},
  doi       = {10.1109/ISCSLP57327.2022.10038277},
  language  = {English},
  keywords  = {Optical Character Recognition, Taiwanese Text Corpus, Written Taiwanese},
  abstract  = {Currently, there is not enough Taiwanese text available to build a proper language model (LM) to support the construction of emerging Taiwanese automatic speech recognition (ASR) and text-to-speech (TTS) systems. Therefore, this paper reports the first Taiwanese optical character recognition (OCR) [1, 2, 3] system to assist human annotators in converting a vast collection of scanned images of Taiwanese historical documents preserved in the 'Memory of the Written Taiwanese' (MoWT) website [4] into a usable textual database for building state-of-the-art Taiwanese ASR and TTS systems in the future. Supplementary information and replication materials are available on GitHub [5].},
  note      = {Publisher Copyright: {\textcopyright} 2022 IEEE.; 13th International Symposium on Chinese Spoken Language Processing, ISCSLP 2022 ; Conference date: 11-12-2022 Through 14-12-2022},
}