From b09428ff9365b080d102ccbc131447d35d5cbd99 Mon Sep 17 00:00:00 2001 From: Koen Farell Date: Wed, 21 Aug 2024 13:03:13 +0300 Subject: [PATCH 1/6] feat: added ukrainian vocab Added Ukrainian vocab to allow the use of handwritten UA datasets --- doctr/datasets/vocabs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index c17a6a5f0..5b96413cc 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -25,6 +25,7 @@ "hindi_punctuation": "।,?!:्ॐ॰॥॰", "bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ", "bangla_digits": "০১২৩৪৫৬৭৮৯", + "general_cyrillyc_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ" } VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"] @@ -59,6 +60,7 @@ VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪" VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"] VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"] +VOCABS["ukrainian"] = VOCABS["general_cyrillyc_letters"] + "ґіїєҐІЇЄ" VOCABS["multilingual"] = "".join( dict.fromkeys( VOCABS["french"] @@ -73,6 +75,7 @@ + VOCABS["danish"] + VOCABS["finnish"] + VOCABS["swedish"] + + VOCABS["ukrainian"] + "§" ) ) From 0aa97ed6007006c726195ca83ca54c8e84d3f5db Mon Sep 17 00:00:00 2001 From: Koen Farell Date: Wed, 21 Aug 2024 13:06:10 +0300 Subject: [PATCH 2/6] feat: updated vocabs with UA Added the Ukrainian language to make it possible to use UA datasets and train OCR models --- doctr/datasets/vocabs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index 5b96413cc..183085628 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -25,7 +25,7 @@ "hindi_punctuation": "।,?!:्ॐ॰॥॰", "bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ", "bangla_digits": "০১২৩৪৫৬৭৮৯", - 
"general_cyrillyc_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ" + "generic_cyrillic_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ", } VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"] @@ -60,7 +60,7 @@ VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪" VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"] VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"] -VOCABS["ukrainian"] = VOCABS["general_cyrillyc_letters"] + "ґіїєҐІЇЄ" +VOCABS["ukrainian"] = VOCABS["generic_cyrillic_letters"] + "ґіїєҐІЇЄ" VOCABS["multilingual"] = "".join( dict.fromkeys( VOCABS["french"] From 1cf5a580b8624ffc9401d448a7707806a151cc31 Mon Sep 17 00:00:00 2001 From: Koen Farell Date: Wed, 21 Aug 2024 13:21:25 +0300 Subject: [PATCH 3/6] feat: fixed currency numbers and punctuation --- doctr/datasets/vocabs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index 183085628..0e01a4756 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -13,7 +13,7 @@ "digits": string.digits, "ascii_letters": string.ascii_letters, "punctuation": string.punctuation, - "currency": "£€¥¢฿", + "currency": "£€¥¢฿₴", "ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ", "arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي", "persian_letters": "پچڢڤگ", @@ -60,7 +60,7 @@ VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪" VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"] VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"] -VOCABS["ukrainian"] = VOCABS["generic_cyrillic_letters"] + "ґіїєҐІЇЄ" +VOCABS["ukrainian"] = VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ" VOCABS["multilingual"] = "".join( 
dict.fromkeys( VOCABS["french"] From 317eb3bdea5d8c7738fb0465da978a097cffadce Mon Sep 17 00:00:00 2001 From: Koen Farell Date: Wed, 21 Aug 2024 18:04:43 +0300 Subject: [PATCH 4/6] fix: moved Hryvnia from currency to ukrainian vocab. removed from multilingual --- doctr/datasets/vocabs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index 0e01a4756..91c5af795 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -13,7 +13,7 @@ "digits": string.digits, "ascii_letters": string.ascii_letters, "punctuation": string.punctuation, - "currency": "£€¥¢฿₴", + "currency": "£€¥¢฿", "ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ", "arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي", "persian_letters": "پچڢڤگ", @@ -60,7 +60,7 @@ VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪" VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"] VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"] -VOCABS["ukrainian"] = VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ" +VOCABS["ukrainian"] = VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ₴" VOCABS["multilingual"] = "".join( dict.fromkeys( VOCABS["french"] @@ -75,7 +75,6 @@ + VOCABS["danish"] + VOCABS["finnish"] + VOCABS["swedish"] - + VOCABS["ukrainian"] + "§" ) ) From ef52d474f3660f3cda07b3e6274f185c89ce8b1e Mon Sep 17 00:00:00 2001 From: Koen Farell Date: Wed, 21 Aug 2024 18:08:28 +0300 Subject: [PATCH 5/6] chore: updated docs for ukrainian vocab --- docs/source/modules/datasets.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst index b4690247c..c676fe8ce 100644 --- a/docs/source/modules/datasets.rst +++ b/docs/source/modules/datasets.rst @@ -151,6 
+151,9 @@ of vocabs. * - swedish - 106 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿åäöÅÄÖ + * - ukrainian + - 114 + - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~£€¥¢฿ґіїєҐІЇЄ₴ * - vietnamese - 236 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ From 17fbab2d035f5c0d3f23d3a83f699f972ccd17f3 Mon Sep 17 00:00:00 2001 From: Koen Farell Date: Wed, 21 Aug 2024 18:27:03 +0300 Subject: [PATCH 6/6] fix: added generic cyrillic letters to docs as well --- docs/source/modules/datasets.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst index c676fe8ce..872212a12 100644 --- a/docs/source/modules/datasets.rst +++ b/docs/source/modules/datasets.rst @@ -94,6 +94,9 @@ of vocabs. * - arabic_letters - 37 - ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي + * - generic_cyrillic_letters + - 58 + - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ * - persian_letters - 5 - پچڢڤگ