diff --git a/data/ca-ba/g2p/model.crf b/data/ca-ba/g2p/model.crf new file mode 100644 index 0000000..5de981e Binary files /dev/null and b/data/ca-ba/g2p/model.crf differ diff --git a/data/ca-ba/language.yml b/data/ca-ba/language.yml new file mode 100644 index 0000000..39f4528 --- /dev/null +++ b/data/ca-ba/language.yml @@ -0,0 +1,50 @@ +--- + +language: + name: "Balear Catalan" + code: "ca-ba" + phonemes: !env "${config_dir}/phonemes.txt" + keep_stress: true + +lexicon: !env "${config_dir}/lexicon.db" + +g2p: + model: !env "${config_dir}/g2p.fst" + +symbols: + casing: "lower" + number_regex: "^-?\\d+([,.]\\d+)*$" + token_split: "\\s+" + token_join: " " + minor_breaks: + - "," + - ":" + - ";" + - "..." + major_breaks: + - "." + - "?" + - "!" + replace: + "[\\<\\>\\(\\)\\[\\]\"]+": "" + "\\B'": "\"" + "'\\B": "\"" + "’": "'" + "'": "" + "-": "" + "l·l": "l" + punctuations: + - "\"" + - "„" + - "“" + - "”" + - "«" + - "»" + - "," + - ":" + - ";" + - "." + - "?" + - "¿" + - "!" + - "¡" \ No newline at end of file diff --git a/data/ca-ba/lexicon.db b/data/ca-ba/lexicon.db new file mode 100644 index 0000000..c961947 Binary files /dev/null and b/data/ca-ba/lexicon.db differ diff --git a/data/ca-ba/phonemes.txt b/data/ca-ba/phonemes.txt new file mode 100644 index 0000000..51bfea1 --- /dev/null +++ b/data/ca-ba/phonemes.txt @@ -0,0 +1,44 @@ +# https://en.wikipedia.org/wiki/Catalan_phonology +# Catalan phonemes + +p [p]ala +b [b]ala +t [t]ela +d [d]onar +k [k]ala +ɡ [g]ala +m [m]ala +ɲ fa[ng] +β aca[b]a +ð ca[d]a +ɣ ama[g]ar +f [f]als +v a[f]ganès +s [s]ala +z ca[s]a +ʃ [x]oc +ʒ mà[g]ic +tʃ co[tx]e +dʒ me[tg]e +l [l]íquid +ʎ [ll]amp +r ca[rr]o +ɾ ca[r]a +w ve[u]en +uw ca[u]re +j ca[i]re +y [i]a[i]a +n [n]ena +ŋ pi[n]güí +ts po[ts]er +dz do[tz]e + +# Vowels +i r[i]c +e c[e]c +ɛ s[e]c +a s[a]c +ɔ f[o]c +o s[ó]c +u s[u]c +ə [a]mor \ No newline at end of file diff --git a/data/ca-ce/g2p/model.crf b/data/ca-ce/g2p/model.crf new file mode 100644 index 0000000..1e8f7a8 Binary files 
/dev/null and b/data/ca-ce/g2p/model.crf differ diff --git a/data/ca-ce/language.yml b/data/ca-ce/language.yml new file mode 100644 index 0000000..4da8506 --- /dev/null +++ b/data/ca-ce/language.yml @@ -0,0 +1,50 @@ +--- + +language: + name: "Central Catalan" + code: "ca-ce" + phonemes: !env "${config_dir}/phonemes.txt" + keep_stress: true + +lexicon: !env "${config_dir}/lexicon.db" + +g2p: + model: !env "${config_dir}/g2p.fst" + +symbols: + casing: "lower" + number_regex: "^-?\\d+([,.]\\d+)*$" + token_split: "\\s+" + token_join: " " + minor_breaks: + - "," + - ":" + - ";" + - "..." + major_breaks: + - "." + - "?" + - "!" + replace: + "[\\<\\>\\(\\)\\[\\]\"]+": "" + "\\B'": "\"" + "'\\B": "\"" + "’": "'" + "'": "" + "-": "" + "l·l": "l" + punctuations: + - "\"" + - "„" + - "“" + - "”" + - "«" + - "»" + - "," + - ":" + - ";" + - "." + - "?" + - "¿" + - "!" + - "¡" \ No newline at end of file diff --git a/data/ca-ce/lexicon.db b/data/ca-ce/lexicon.db new file mode 100644 index 0000000..f3fd9df Binary files /dev/null and b/data/ca-ce/lexicon.db differ diff --git a/data/ca-ce/phonemes.txt b/data/ca-ce/phonemes.txt new file mode 100644 index 0000000..51bfea1 --- /dev/null +++ b/data/ca-ce/phonemes.txt @@ -0,0 +1,44 @@ +# https://en.wikipedia.org/wiki/Catalan_phonology +# Catalan phonemes + +p [p]ala +b [b]ala +t [t]ela +d [d]onar +k [k]ala +ɡ [g]ala +m [m]ala +ɲ fa[ng] +β aca[b]a +ð ca[d]a +ɣ ama[g]ar +f [f]als +v a[f]ganès +s [s]ala +z ca[s]a +ʃ [x]oc +ʒ mà[g]ic +tʃ co[tx]e +dʒ me[tg]e +l [l]íquid +ʎ [ll]amp +r ca[rr]o +ɾ ca[r]a +w ve[u]en +uw ca[u]re +j ca[i]re +y [i]a[i]a +n [n]ena +ŋ pi[n]güí +ts po[ts]er +dz do[tz]e + +# Vowels +i r[i]c +e c[e]c +ɛ s[e]c +a s[a]c +ɔ f[o]c +o s[ó]c +u s[u]c +ə [a]mor \ No newline at end of file diff --git a/data/ca-no/g2p/model.crf b/data/ca-no/g2p/model.crf new file mode 100644 index 0000000..f7cdbdb Binary files /dev/null and b/data/ca-no/g2p/model.crf differ diff --git a/data/ca-no/language.yml b/data/ca-no/language.yml new file 
mode 100644 index 0000000..413ee3b --- /dev/null +++ b/data/ca-no/language.yml @@ -0,0 +1,50 @@ +--- + +language: + name: "Nord-Occidental Catalan" + code: "ca-no" + phonemes: !env "${config_dir}/phonemes.txt" + keep_stress: true + +lexicon: !env "${config_dir}/lexicon.db" + +g2p: + model: !env "${config_dir}/g2p.fst" + +symbols: + casing: "lower" + number_regex: "^-?\\d+([,.]\\d+)*$" + token_split: "\\s+" + token_join: " " + minor_breaks: + - "," + - ":" + - ";" + - "..." + major_breaks: + - "." + - "?" + - "!" + replace: + "[\\<\\>\\(\\)\\[\\]\"]+": "" + "\\B'": "\"" + "'\\B": "\"" + "’": "'" + "'": "" + "-": "" + "l·l": "l" + punctuations: + - "\"" + - "„" + - "“" + - "”" + - "«" + - "»" + - "," + - ":" + - ";" + - "." + - "?" + - "¿" + - "!" + - "¡" \ No newline at end of file diff --git a/data/ca-no/lexicon.db b/data/ca-no/lexicon.db new file mode 100644 index 0000000..6cb9e0e Binary files /dev/null and b/data/ca-no/lexicon.db differ diff --git a/data/ca-no/phonemes.txt b/data/ca-no/phonemes.txt new file mode 100644 index 0000000..51bfea1 --- /dev/null +++ b/data/ca-no/phonemes.txt @@ -0,0 +1,44 @@ +# https://en.wikipedia.org/wiki/Catalan_phonology +# Catalan phonemes + +p [p]ala +b [b]ala +t [t]ela +d [d]onar +k [k]ala +ɡ [g]ala +m [m]ala +ɲ fa[ng] +β aca[b]a +ð ca[d]a +ɣ ama[g]ar +f [f]als +v a[f]ganès +s [s]ala +z ca[s]a +ʃ [x]oc +ʒ mà[g]ic +tʃ co[tx]e +dʒ me[tg]e +l [l]íquid +ʎ [ll]amp +r ca[rr]o +ɾ ca[r]a +w ve[u]en +uw ca[u]re +j ca[i]re +y [i]a[i]a +n [n]ena +ŋ pi[n]güí +ts po[ts]er +dz do[tz]e + +# Vowels +i r[i]c +e c[e]c +ɛ s[e]c +a s[a]c +ɔ f[o]c +o s[ó]c +u s[u]c +ə [a]mor \ No newline at end of file diff --git a/data/ca-va/g2p/model.crf b/data/ca-va/g2p/model.crf new file mode 100644 index 0000000..417e24c Binary files /dev/null and b/data/ca-va/g2p/model.crf differ diff --git a/data/ca-va/language.yml b/data/ca-va/language.yml new file mode 100644 index 0000000..1d2074b --- /dev/null +++ b/data/ca-va/language.yml @@ -0,0 +1,50 @@ +--- + 
+language: + name: "Valencià Catalan" + code: "ca-va" + phonemes: !env "${config_dir}/phonemes.txt" + keep_stress: true + +lexicon: !env "${config_dir}/lexicon.db" + +g2p: + model: !env "${config_dir}/g2p.fst" + +symbols: + casing: "lower" + number_regex: "^-?\\d+([,.]\\d+)*$" + token_split: "\\s+" + token_join: " " + minor_breaks: + - "," + - ":" + - ";" + - "..." + major_breaks: + - "." + - "?" + - "!" + replace: + "[\\<\\>\\(\\)\\[\\]\"]+": "" + "\\B'": "\"" + "'\\B": "\"" + "’": "'" + "'": "" + "-": "" + "l·l": "l" + punctuations: + - "\"" + - "„" + - "“" + - "”" + - "«" + - "»" + - "," + - ":" + - ";" + - "." + - "?" + - "¿" + - "!" + - "¡" \ No newline at end of file diff --git a/data/ca-va/lexicon.db b/data/ca-va/lexicon.db new file mode 100644 index 0000000..5b6518d Binary files /dev/null and b/data/ca-va/lexicon.db differ diff --git a/data/ca-va/phonemes.txt b/data/ca-va/phonemes.txt new file mode 100644 index 0000000..51bfea1 --- /dev/null +++ b/data/ca-va/phonemes.txt @@ -0,0 +1,44 @@ +# https://en.wikipedia.org/wiki/Catalan_phonology +# Catalan phonemes + +p [p]ala +b [b]ala +t [t]ela +d [d]onar +k [k]ala +ɡ [g]ala +m [m]ala +ɲ fa[ng] +β aca[b]a +ð ca[d]a +ɣ ama[g]ar +f [f]als +v a[f]ganès +s [s]ala +z ca[s]a +ʃ [x]oc +ʒ mà[g]ic +tʃ co[tx]e +dʒ me[tg]e +l [l]íquid +ʎ [ll]amp +r ca[rr]o +ɾ ca[r]a +w ve[u]en +uw ca[u]re +j ca[i]re +y [i]a[i]a +n [n]ena +ŋ pi[n]güí +ts po[ts]er +dz do[tz]e + +# Vowels +i r[i]c +e c[e]c +ɛ s[e]c +a s[a]c +ɔ f[o]c +o s[ó]c +u s[u]c +ə [a]mor \ No newline at end of file diff --git a/gruut-lang-ca/LANGUAGE b/gruut-lang-ca/LANGUAGE new file mode 100644 index 0000000..d9e4b40 --- /dev/null +++ b/gruut-lang-ca/LANGUAGE @@ -0,0 +1 @@ +ca-ce Catalan diff --git a/gruut-lang-ca/README.md b/gruut-lang-ca/README.md new file mode 100644 index 0000000..6266c01 --- /dev/null +++ b/gruut-lang-ca/README.md @@ -0,0 +1,3 @@ +# gruut Catalan + +Language-specific files for Catalan (ca) in [gruut](https://github.com/rhasspy/gruut) diff --git 
a/gruut-lang-ca/gruut_lang_ca/VERSION b/gruut-lang-ca/gruut_lang_ca/VERSION new file mode 100644 index 0000000..77d6f4c --- /dev/null +++ b/gruut-lang-ca/gruut_lang_ca/VERSION @@ -0,0 +1 @@ +0.0.0 diff --git a/gruut-lang-ca/gruut_lang_ca/__init__.py b/gruut-lang-ca/gruut_lang_ca/__init__.py new file mode 100644 index 0000000..bed0ab8 --- /dev/null +++ b/gruut-lang-ca/gruut_lang_ca/__init__.py @@ -0,0 +1,22 @@ +"""Catalan language resources""" +import os +import typing +from pathlib import Path + +try: + import importlib.resources + + files = importlib.resources.files +except (ImportError, AttributeError): + # Backport for Python < 3.9 + import importlib_resources # type: ignore + + files = importlib_resources.files + +_PACKAGE = "gruut_lang_ca" +_DIR = Path(typing.cast(os.PathLike, files(_PACKAGE))) + + +def get_lang_dir() -> Path: + """Get directory with language resources""" + return _DIR diff --git a/gruut-lang-ca/gruut_lang_ca/g2p/model.crf b/gruut-lang-ca/gruut_lang_ca/g2p/model.crf new file mode 100644 index 0000000..1e8f7a8 Binary files /dev/null and b/gruut-lang-ca/gruut_lang_ca/g2p/model.crf differ diff --git a/gruut-lang-ca/gruut_lang_ca/lexicon.db b/gruut-lang-ca/gruut_lang_ca/lexicon.db new file mode 100644 index 0000000..f3fd9df Binary files /dev/null and b/gruut-lang-ca/gruut_lang_ca/lexicon.db differ diff --git a/gruut-lang-ca/gruut_lang_ca/phonemes.txt b/gruut-lang-ca/gruut_lang_ca/phonemes.txt new file mode 100644 index 0000000..51bfea1 --- /dev/null +++ b/gruut-lang-ca/gruut_lang_ca/phonemes.txt @@ -0,0 +1,44 @@ +# https://en.wikipedia.org/wiki/Catalan_phonology +# Catalan phonemes + +p [p]ala +b [b]ala +t [t]ela +d [d]onar +k [k]ala +ɡ [g]ala +m [m]ala +ɲ fa[ng] +β aca[b]a +ð ca[d]a +ɣ ama[g]ar +f [f]als +v a[f]ganès +s [s]ala +z ca[s]a +ʃ [x]oc +ʒ mà[g]ic +tʃ co[tx]e +dʒ me[tg]e +l [l]íquid +ʎ [ll]amp +r ca[rr]o +ɾ ca[r]a +w ve[u]en +uw ca[u]re +j ca[i]re +y [i]a[i]a +n [n]ena +ŋ pi[n]güí +ts po[ts]er +dz do[tz]e + +# Vowels +i r[i]c +e c[e]c +ɛ 
s[e]c +a s[a]c +ɔ f[o]c +o s[ó]c +u s[u]c +ə [a]mor \ No newline at end of file diff --git a/gruut-lang-ca/setup.py b/gruut-lang-ca/setup.py new file mode 100644 index 0000000..ce5814b --- /dev/null +++ b/gruut-lang-ca/setup.py @@ -0,0 +1,58 @@ +"""Setup file for gruut_lang_ca""" +from pathlib import Path + +import setuptools + +module_name = "gruut_lang_ca" + +this_dir = Path(__file__).parent +module_dir = this_dir / module_name + +# ----------------------------------------------------------------------------- + +# Load README in as long description +long_description: str = "" +readme_path = this_dir / "README.md" +if readme_path.is_file(): + long_description = readme_path.read_text(encoding="utf-8") + +version_path = module_dir / "VERSION" +with open(version_path, "r", encoding="utf-8") as version_file: + version = version_file.read().strip() + + +# Extra package data files +extra_files = [] +maybe_extra_files = ["pos/model.crf", "pos/postagger.model"] +for maybe_extra_str in maybe_extra_files: + extra_path = module_dir / maybe_extra_str + if extra_path.is_file(): + extra_files.append(maybe_extra_str) + +# ----------------------------------------------------------------------------- + +setuptools.setup( + name=module_name, + description="Catalan language files for gruut tokenizer/phonemizer", + version=version, + author="Michael Hansen", + author_email="mike@rhasspy.org", + url="https://github.com/rhasspy/gruut", + packages=setuptools.find_packages(), + package_data={ + module_name: [ + "VERSION", + "lexicon.db", + "g2p/model.crf", + #"espeak/lexicon.db", + #"espeak/g2p/model.crf", + ] + + extra_files + }, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + ], + long_description=long_description, + long_description_content_type="text/markdown", +) diff --git a/gruut/__main__.py b/gruut/__main__.py index e78d16b..5d907b5 100755 --- a/gruut/__main__.py +++ b/gruut/__main__.py @@ -64,7 +64,6 @@ def main(): 
args.model_prefix = "espeak" # ------------------------------------------------------------------------- - text_processor = TextProcessor( default_lang=args.language, model_prefix=args.model_prefix, ) @@ -132,7 +131,7 @@ def output_sentences(sentences, writer, text_data=None): for sentence in sentences: sentence_dict = dataclasses.asdict(sentence) writer.write(sentence_dict) - + for text, text_data in input_text(lines): try: graph, root = text_processor( @@ -166,8 +165,9 @@ def output_sentences(sentences, writer, text_data=None): punctuations=(not args.no_punctuation), ) ) - + output_sentences(sentences, writer, text_data) + except Exception as e: _LOGGER.exception(text) diff --git a/gruut/const.py b/gruut/const.py index dcc4486..dad195e 100644 --- a/gruut/const.py +++ b/gruut/const.py @@ -16,6 +16,11 @@ # alias -> full language name LANG_ALIASES = { "ar": "ar", + "ca": "ca-ce", + "ca-ce": "ca-ce", + "ca-ba": "ca-ba", + "ca-no": "ca-no", + "ca-va": "ca-va", "cs": "cs-cz", "de": "de-de", "en": "en-us", diff --git a/gruut/lang.py b/gruut/lang.py index cbd54d2..ccfffaa 100644 --- a/gruut/lang.py +++ b/gruut/lang.py @@ -15,7 +15,7 @@ from gruut.text_processor import InterpretAsFormat, TextProcessorSettings from gruut.utils import find_lang_dir, remove_non_word_chars, resolve_lang -_LOGGER = logging.getLogger("gruut") +_LOGGER = logging.getLogger("gruut.lang") # ----------------------------------------------------------------------------- @@ -115,6 +115,10 @@ def get_settings( # Arabic return get_ar_settings(lang_dir, **settings_args) + if lang_only in {"ca-ce", "ca-ba", "ca-no", "ca-va"}: + # Catalan + return get_ca_settings(lang_dir, **settings_args) + if lang_only == "cs-cz": # Czech return get_cs_settings(lang_dir, **settings_args) @@ -828,7 +832,1434 @@ def get_zh_settings(lang_dir=None, **settings_args) -> TextProcessorSettings: # ----------------------------------------------------------------------------- +# Catalan (ca, Catalan) +# 
----------------------------------------------------------------------------- + +# Pre-Process constants +# Same for all accents in this version +VOWEL_CHARS = ['a', 'ä', 'à', 'e', 'ë', 'é', 'è', 'i', 'í', 'ï', 'o', 'ö', 'ó', 'ò', 'u', 'ü', 'ú'] +ACCENTED_VOWEL_CHARS = ['à', 'é', 'è', 'í', 'ó', 'ò', 'ú'] +NUCLITIC_CHARS = ['a', 'à', 'e', 'é', 'è', 'í', 'ï', 'o', 'ó', 'ò', 'ú'] +ACCENT_CHANGES = { + "a" : "à", + "e" : "é", + "i" : "í", + "ï" : "í", + "o" : "ó", + "u" : "ú", + "ü" : "ú", +} +INSEPARABLES = [ + 'bh', 'bl', 'br', 'ch', 'cl', 'cr', 'dh', 'dj', 'dr', 'fh', 'fh', 'fl', 'fr', \ + 'gh', 'gl', 'gr', 'gu', 'gü', 'jh', 'kh', 'kl', 'kr', 'lh', 'll', 'mh', \ + 'nh', 'ny', 'ph', 'pl', 'pr', 'qu', 'qü', 'rh', 'sh', 'th', 'th', 'tr', \ + 'vh', 'wh', 'xh', 'xh', 'yh', 'zh', +] +VOC_IR = ["cuir", "vair"] +EINESGRAM = [ + '-de-', '-en', '-hi', '-ho', '-i', '-i-', '-la', '-les', '-li', '-lo', '-los', '-me', '-ne', '-nos', \ + '-se', '-te', '-us', '-vos', 'a', 'a-', 'al', 'als', 'amb', 'bi-', 'co', 'de', 'de-', 'del', 'dels', \ + 'el', 'els', 'em', 'en', 'ens', 'es', 'et', 'hi', 'ho', 'i', 'i-', 'la', 'les', 'li', 'lo', 'ma', \ + 'me', 'mon', 'na', 'pel', 'pels', 'per', 'que', 're', 'sa', 'se', 'ses', 'si', 'sos', 'sub', \ + 'ta', 'te', 'tes', 'ton', 'un', 'uns', 'us', +] +EXCEP_ACC = { + 'antropologico': 'antropològico', 'arterio': 'artèrio', 'artistico': 'artístico', 'basquet': 'bàsquet', 'cardio': 'càrdio', \ + 'catolico': 'catòlico', 'cientifico': 'científico', 'circum': 'círcum', 'civico': 'cívico', 'democrata': 'demòcrata', \ + 'democratico': 'democràtico', 'dumping': 'dúmping', 'economico': 'econòmico', 'edgar': 'èdgar', 'fenicio': 'fenício', \ + 'filosofico': 'filosòfico', 'fisico': 'físico', 'fisio': 'físio', 'geografico': 'geogràfico', 'hetero': 'hétero', \ + 'higenico': 'higènico', 'higienico': 'higiènico', 'hiper': 'híper', 'historico': 'històrico', 'ibero': 'íbero', \ + 'ideologico': 'ideològico', 'input': 'ínput', 'inter': 'ínter', 'jonatan': 'jònatan', 
'juridico': 'jurídico', 'labio': 'làbio', \ + 'linguo': 'línguo', 'literario': 'literàrio', 'logico': 'lògico', 'magico': 'màgico', 'maniaco': 'maníaco', 'marketing': 'màrketing', \ + 'oxido': 'òxido', 'petroleo': 'petròleo', 'politico': 'político', 'quantum': 'quàntum', 'quimico': 'químico', 'quimio': 'químio', \ + 'radio': 'ràdio', 'romanico': 'romànico', 'simbolico': 'simbòlico', 'socio': 'sòcio', 'super': 'súper', 'tecnico': 'tècnico', \ + 'teorico': 'teòrico', 'tragico': 'tràgico', 'traqueo': 'tràqueo', +} +DIFT_DECR = ["au", "ai", "eu", "ei", "ou", "oi", "iu", "àu", "ui"] +VOC_SOLA = ["a", "e", "i", "o", "u", "ï", "ü"] +VOC_MES_S = ["as", "es", "is", "os", "us", "às", "ès"] +EN_IN = ["en", "in", "àn"] + +# Pre-Process functions and classes + +from collections import deque + +# TODO review all functions, may need refactor +# TODO define depending the dialect +def vocal(carac: str) -> bool: + + return carac in VOWEL_CHARS + +def acaba_en_vocal(prefix: str) -> bool: + darrer = prefix[-1] + return vocal(darrer) + +def post_prefix_ok(resta: str) -> bool: + + mida = len(resta) + primer = resta[0] + segon = '\0' + if mida > 1: + segon = resta[1] + + if primer in "iu": + return True + elif primer in "rs": + if mida > 1 and vocal(segon): + return True + return False + +def nuclitica(carac: str) -> bool: + return carac in NUCLITIC_CHARS + +def gicf_suf(mot: str, pos: int, mots_voc_ir: typing.List[str]) -> bool: + + mida = len(mot) + + if mot[pos:].endswith("isme") and len(mot) - pos == 4: + return True + elif mot[pos:].endswith("ista") and len(mot) - pos == 4: + return True + elif mot[pos:].endswith("ismes") and len(mot) - pos == 5: + return True + elif mot[pos:].endswith("istes") and len(mot) - pos == 5: + return True + + i1 = mot.find("ir") + if i1 == pos and len(mot) - pos == 2: + if mot in mots_voc_ir: + return False + else: + return True + + i1 = mot.find("int") + if i1 == pos and len(mot) - pos == 3: + return True + + i1 = mot.find("iré") + if i1 == pos and 
len(mot) - pos == 3: + return True + + i1 = mot.find("iràs") + if i1 == pos and len(mot) - pos == 4: + return True + + i1 = mot.find("irà") + if i1 == pos and len(mot) - pos == 3: + return True + + i1 = mot.find("irem") + if i1 == pos and len(mot) - pos == 4: + return True + + i1 = mot.find("ireu") + if i1 == pos and len(mot) - pos == 4: + return True + + i1 = mot.find("iran") + if i1 == pos and len(mot) - pos == 4: + return True + + i1 = mot.find("iria") + if i1 == pos and len(mot) - pos == 4: + return True + + i1 = mot.find("iries") + if i1 == pos and len(mot) - pos == 5: + return True + + i1 = mot.find("iríem") + if i1 == pos and len(mot) - pos == 5: + return True + + i1 = mot.find("iríeu") + if i1 == pos and len(mot) - pos == 5: + return True + + i1 = mot.find("irien") + if i1 == pos and len(mot) - pos == 5: + return True + + return False + + +class Sillaba: + + def __init__(self, sil: str): + + self.text_ = sil + self.tonica_ = False + self.grafnuc_ = -1 + self.fonnuc_ = -1 + self.fons_ = deque() + + def grafnuc(self, nuc: int): + self.grafnuc_ = nuc + + def get_grafnuc(self) -> int: + return self.grafnuc_ + + def get_text(self) -> str: + return self.text_ + + def get_text_at_index(self, idx: int) -> str: + return self.text_[idx] + + def sizetext(self) -> int: + return len(self.text_) + + def tonica(self) -> bool: + self.tonica_ = True + + def asktonica(self) -> bool: + return self.tonica_ + + def es_sil_tonica(self) -> bool: + + if self.tonica_: + return "sí" + else: + return "no" + + def numfons(self) -> int: + return len(self.fons_) + + def allofon(self, fonidx: int) -> str: + return self.fons_[fonidx] + + def allofons(self) -> deque: + return self.fons_ + + def push_back(self, fon: str): + self.fons_.append(fon) + + def push_front(self, fon: str): + self.fons_.insert(0, fon) + + def pop_front(self): + self.fons_.popleft() + + def pop_back(self): + self.fons_.pop() + + def empty(self) -> bool: + return len(self.fons_) == 0 + + def fonnuc(self, fnuc: int): + 
self.fonnuc_ = fnuc + + def get_fonnuc(self) -> int: + return self.fonnuc_ + + +class Part: + + def __init__(self, tros: str): + self.text_ = tros + self.transsil_ = deque() # It will be a deque structure with Sillaba instances as elements + + def push_back(self, sil: Sillaba): + self.transsil_.append(sil) + + def pop_back(self): + self.transsil_.pop() + + def pop_front(self): + self.transsil_.popleft() + + def empty(self) -> bool: + return len(self.transsil_) == 0 + + def size(self) -> int: + return len(self.transsil_) + + def tonica(self, silidx: int) -> bool: + # self.transsil_[silidx] is an Sillaba instance, which has the attribute tonica_ + return self.transsil_[silidx].tonica_ + + def idxgrafnucli(self, silidx: int) -> int: + # self.transsil_[silidx] is an Sillaba instance, which has the attribute grafnuc_ + return self.transsil_[silidx].grafnuc_ + + def grafnucli(self, silidx: int) -> str: + # self.transsil_[silidx] is an Sillaba instance, which has an attributes text_ and grafnuc_ + return self.transsil_[silidx].text_[self.transsil_[silidx].grafnuc_] + + def sil(self, silnum: int) -> Sillaba: + return self.transsil_[silnum] + + def sils(self) -> deque: + return self.transsil_ + + def text(self) -> str: + return self.text_ + + def textinici(self, silindex: int, charindex: int) -> str: + + # Gives the text of the previous syllables, and from the syllable silindex to charindex not included + + mot = "" + for i in range(silindex): + mot += self.transsil_[i].text_ + if charindex: + mot += self.transsil_[silindex].text_[:charindex] + return mot + + def textfinal(self, silindex: int, charindex: int) -> str: + + # Gives the text starting from the syllable silindex and the character charindex (included) and up to the end of the word + + mot = self.transsil_[silindex].text_[charindex:] + for i in range(silindex + 1, len(self.transsil_)): + mot += self.transsil_[i].text_ + return mot + + def textsilini(self, silindex: int, charindex: int) -> str: + + # gives the text 
of the syllable silindex, from the beginning to the character charindex not included + return self.transsil_[silindex].text_[:charindex] + + def textsilfinal(self, silindex: int, charindex: int) -> str: + + # Gives the text of the syllable silindex, from charindex inclusive to the end + return self.transsil_[silindex].text_[charindex:] + + def charidxsilini(self, silindex: int) -> int: + + car = self.transsil_[silindex].text_[0] + if car == "'" or car == '-': + return 1 + else: + return 0 + + def charidxsilfinal(self, silindex: int) -> int: + + siltxt = self.transsil_[silindex].text_ + car = siltxt[-1] + if car == "'" or car == '-': + return len(siltxt) - 2 + else: + return len(siltxt) - 1 + + +class MotNuclis: + + def __init__(self, mot: str, es_adverbi: bool): + + self.adverbi_ = es_adverbi + self.el_mot = mot + self.pos_nuclis = [] + + self.load_insep() + + def load_insep(self): + + # Set self.insep_ and self.mots_voc_ir_ + + self.insep_ = INSEPARABLES + self.mots_voc_ir_ = VOC_IR + + def troba_nuclis_mot(self): + + mida = len(self.el_mot) + adjectiu = "" + + if self.adverbi_: + adjectiu = self.el_mot[0:mida - 4] + self.el_mot = adjectiu + mida = len(self.el_mot) + + gr = 0 + while gr < mida: + + car = self.el_mot[gr] + + if nuclitica(car): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif car == 'i': + # gicf o sufix + if gicf_suf(self.el_mot, gr, self.mots_voc_ir_): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + abans = self.el_mot[0:gr] + premida = len(abans) + + if (premida == 0) or (premida == 1 and abans == "h"): + # casos iode o hiena, i, hi + if gr == mida - 1: + # i, hi + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif vocal(self.el_mot[gr+1]): + # hiena iode + gr = gr + 1 + continue + # hissar, ira + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif (premida == 1) and (abans == "u"): + + if gr == mida - 1 or self.el_mot[gr + 1] == 'x': + gr = gr + 1 + continue + else: + 
self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif (premida == 2) and (abans == "hu"): + + if gr == mida - 1: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + if self.el_mot[gr + 1] == 'x': + gr = gr + 1 + continue + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'u': + # tres vocals seguides vocal+u+i, la u es consonant i la "i" es nucli + if (premida > 1) and vocal(self.el_mot[gr - 2]): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif (premida > 1) and (self.el_mot[gr - 2] == 'q' or self.el_mot[gr - 2] == 'g'): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + # ui tot sol + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'ü': + + if (premida > 1) and (self.el_mot[gr - 2] == 'q' or self.el_mot[gr - 2] == 'g'): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + # üi no precedit de g,q + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr - 1]): + # vocal + i, la i no es nucli + gr = gr + 1 + continue + + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif car == 'u': + + abans = self.el_mot[0:gr] + premida = len(abans) + + if (premida == 0) or (premida == 1 and abans == "h"): + # casos uadi o hu+vocal, u, hu + if gr == mida - 1: + # u, hu + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif self.el_mot == "ui" or self.el_mot == "uix": + # potser se n'han d'afegir mes + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif (pos := self.el_mot.find("ix")) != -1 and pos == gr + 1: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif vocal(self.el_mot[gr+1]): + # uadi hu+vocal + gr = gr + 1 + continue + else: + # huns, una + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif (premida == 1) and (abans == "i"): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'i': + # tres vocals seguides vocal+i+u, la i es consonant i la "u" es nucli + if premida > 
2: + boci = self.el_mot[gr - 3 : gr - 1] + + if boci == "gu" or boci == "qu": + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr - 2]): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + else: + gr = gr + 1 + continue + + elif premida == 2: + if vocal(self.el_mot[gr - 2]): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + gr = gr + 1 + continue + else: + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'g' or self.el_mot[gr - 1] == 'q': + if gr == mida - 1: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr + 1]): + gr = gr + 1 + continue + + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'ü': + if (premida > 1) and (self.el_mot[gr - 2] == 'q' or self.el_mot[gr - 2] == 'g'): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + # üu no precedit de g,q + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr - 1]): + # vocal + u, la u no es nucli + gr = gr + 1 + continue + + else: + # tancara l'else de quan no es sufix + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif car == 'ü': + + pos = 0 + + if (pos := self.el_mot.find("argü")) != -1: + if pos + 3 == gr: + self.pos_nuclis.append(gr) + self.pos_nuclis.append(gr + 1) + gr += 1 + gr = gr + 1 + continue + else: + gr = gr + 1 + continue + elif gr > 0: + if self.el_mot[gr - 1] == 'g' or self.el_mot[gr - 1] == 'q': + gr = gr + 1 + continue + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + else: + gr = gr + 1 + continue + + if self.adverbi_: + self.el_mot += "ment" + mida = len(self.el_mot) + self.pos_nuclis.append(mida - 3) + + def inseparable(self, tros: str) -> bool: + return tros in self.insep_ + + def separa_sillabes(self, vec_sil: typing.List[str], els_nuclis: typing.List[int]) -> typing.Tuple[typing.List[str], typing.List[int]]: + + fronteres = [] + + if len(self.pos_nuclis) == 1: + + vec_sil.append(self.el_mot) + 
els_nuclis.append(self.pos_nuclis[0]) + + return vec_sil, els_nuclis + + # Set the fronteres vector + for i in range(len(self.pos_nuclis) - 1): + + longi = self.pos_nuclis[i + 1] - self.pos_nuclis[i] - 1 + tros = self.el_mot[self.pos_nuclis[i] + 1: self.pos_nuclis[i] + 1 + longi] + + # vocals contigues + if longi == 0: + fronteres.append(self.pos_nuclis[i]) + + elif longi == 1: + fronteres.append(self.pos_nuclis[i]) + + elif longi == 2: + if self.inseparable(self.el_mot[self.pos_nuclis[i] + 1: self.pos_nuclis[i] + 1 + 2]): + fronteres.append(self.pos_nuclis[i]) + elif self.el_mot[self.pos_nuclis[i] + 2] == 'h': + fronteres.append(self.pos_nuclis[i]) + else: + fronteres.append(self.pos_nuclis[i] + 1) + + elif longi == 3: + if self.inseparable(self.el_mot[self.pos_nuclis[i] + 2: self.pos_nuclis[i] + 2 + 2]): + if self.el_mot[self.pos_nuclis[i] + 1] == '-': + fronteres.append(self.pos_nuclis[i]) + else: + fronteres.append(self.pos_nuclis[i] + 1) + else: + if self.el_mot[self.pos_nuclis[i] + 3] == '-': + fronteres.append(self.pos_nuclis[i] + 1) + else: + fronteres.append(self.pos_nuclis[i] + 2) + + elif longi == 4: + pos = 0 + + if (pos := tros.find("s")) != -1: + fronteres.append(self.pos_nuclis[i] + pos + 1) + else: + fronteres.append(self.pos_nuclis[i] + 2) + + elif longi == 5: + fronteres.append(self.pos_nuclis[i] + 3) + + else: + _LOGGER.debug(f"No puc separar en sillabes el mot {self.el_mot}, cluster massa gran, de longitud {longi}") + exit(1) + + numsil = len(fronteres) + for i in range(numsil): + if i == 0: + if fronteres[i] != 0: + esta_sil = self.el_mot[0:fronteres[i] + 1] + vec_sil.append(esta_sil) + else: + esta_sil = self.el_mot[0] + vec_sil.append(esta_sil) + else: + esta_sil = self.el_mot[fronteres[i - 1] + 1 : fronteres[i] + 1] + vec_sil.append(esta_sil) + + esta_sil = self.el_mot[fronteres[numsil - 1] + 1:] + vec_sil.append(esta_sil) + + els_nuclis.append(self.pos_nuclis[0]) + longitud = len(vec_sil[0]) + + for i in range(1, len(self.pos_nuclis)): + 
            # Tail of MotNuclis.separa_sillabes (the start of this method is
            # above this chunk): convert each absolute nucleus position into an
            # offset inside its own syllable.
            this_nucli = self.pos_nuclis[i] - longitud
            els_nuclis.append(this_nucli)
            longitud += len(vec_sil[i])

        return vec_sil, els_nuclis

    def empty(self) -> bool:
        """Return True when no vowel nucleus was found in the word."""
        return len(self.pos_nuclis) == 0

    def mot(self) -> str:
        """Return the word this object was built from."""
        return self.el_mot

    def nucli(self, i: int) -> typing.Union[int, None]:
        """Return the i-th nucleus position, or None when i is out of range."""
        if 0 <= i < len(self.pos_nuclis):
            return self.pos_nuclis[i]
        return None

    def size(self) -> int:
        """Return the number of nuclei (one per syllable)."""
        return len(self.pos_nuclis)

    def nuclis(self) -> typing.List[int]:
        """Return all nucleus positions."""
        return self.pos_nuclis


class Transcripcio:
    """Syllabifies a Catalan word and places its tonic (stressed) syllable.

    The word is split on known prefixes, each part is syllabified with
    MotNuclis/Sillaba, the tonic syllable is located (written accent,
    monosyllable rules, adverbs in "-ment", or the twelve-terminations
    paroxytone rule), and finally stress_word() returns the word rewritten
    with an explicit written accent on the tonic vowel.
    """

    def __init__(self, mot: str):

        # Original (unnormalized) word.
        self.motorig_ = mot

        # Tonic and atonic prefix inventories (empty unless loaded elsewhere).
        self.prefixos_ = []
        self.pref_atons = []
        # Words that must NOT be split after a given prefix.
        self.excepcions_prefs = {}
        # Nouns ending in "-ment" that are not adverbs.
        self.excepcions_gen = set()
        # Grammatical tool words (atonic function words).
        self.einesgram_ = set()
        # Accentuation exceptions: word -> replacement spelling.
        self.excep_acc = {}
        # Word parts after prefix segmentation.
        self.trossos_ = []
        # One Part (list of Sillaba) per element of trossos_.
        self.transpart_ = []

        self.carrega_einesgram()
        self.carrega_exc_accent()

    def carrega_einesgram(self):
        # Load the grammatical tool-word set from the module constant.
        self.einesgram_ = EINESGRAM

    def carrega_exc_accent(self):
        # Load the accentuation exceptions from the module constant.
        self.excep_acc = EXCEP_ACC

    def normalize_word(self, word: str) -> str:
        """Normalize a word for processing (currently just lowercasing)."""

        word = word.lower()

        return word

    def segmenta(self, mot: str, final: typing.List[str]) -> typing.List[str]:
        """Recursively split `mot` on known prefixes, appending parts to `final`.

        A word is never split when the prefix covers the whole word, when only
        a hyphen follows the prefix, or when the word is listed as an
        exception for that prefix.
        """

        no_te_prefix = True
        for prefix in self.prefixos_:
            lon = len(prefix)
            pos = mot.find(prefix)
            if pos != -1 and pos == 0:
                no_te_prefix = False

                if lon == len(mot):
                    final.append(mot)
                    return final
                elif lon == len(mot) - 1 and mot[lon] == '-':
                    final.append(mot)
                    return final
                else:
                    # If there are no exceptions, split after the prefix.
                    if prefix not in self.excepcions_prefs:
                        final.append(prefix)
                        resta = mot[lon:]
                        self.segmenta(resta, final)
                        return final
                    # If there are exceptions, check the word is not one of them.
                    else:
                        if mot not in self.excepcions_prefs[prefix]:
                            final.append(prefix)
                            resta = mot[lon:]
                            self.segmenta(resta, final)
                            return final
                        else:
                            final.append(mot)
                            return final

        for prefix in self.pref_atons:
            lon = len(prefix)
            pos = mot.find(prefix)
            if pos != -1 and pos == 0:
                no_te_prefix = False

                if lon == len(mot):
                    final.append(mot)
                    return final
                elif lon == len(mot) - 1 and mot[lon] == '-':
                    final.append(mot)
                    return final
                else:
                    # An atonic prefix only triggers a split when:
                    # - the prefix ends in a vowel AND the remainder starts
                    #   with i, u, r+vowel or s+vowel (post_prefix_ok), or
                    # - the prefix ends in a consonant;
                    # and, in both cases, the word is not a listed exception.
                    if acaba_en_vocal(prefix):
                        resta = mot[lon:]
                        if post_prefix_ok(resta):
                            if prefix not in self.excepcions_prefs:
                                final.append(prefix)
                                self.segmenta(resta, final)
                                return final
                            else:
                                if mot not in self.excepcions_prefs[prefix]:
                                    final.append(prefix)
                                    self.segmenta(resta, final)
                                    return final
                                else:
                                    final.append(mot)
                                    return final
                        else:
                            final.append(mot)
                            return final
                    # Prefix ends in a consonant.
                    else:
                        if prefix not in self.excepcions_prefs:
                            final.append(prefix)
                            queda = mot[lon:]
                            self.segmenta(queda, final)
                            return final
                        else:
                            if mot not in self.excepcions_prefs[prefix]:
                                final.append(prefix)
                                queda = mot[lon:]
                                self.segmenta(queda, final)
                                return final
                            else:
                                final.append(mot)
                                return final

        if no_te_prefix:
            final.append(mot)
            return final

    def tracta_prefixos(self, inici: typing.List[str], final: typing.List[str]) -> typing.List[str]:
        """Segment every word of `inici` on its prefixes into `final`.

        A word is split after a recognized prefix unless the word is in the
        exception list for that prefix, or a hyphen follows the prefix.
        """

        for mot in inici:
            final = self.segmenta(mot, final)

        return final

    def parteix_mot(self):
        """Split the normalized word into parts and create one Part per piece."""

        parts = [self.motnorm_]

        self.trossos_ = self.tracta_prefixos(parts, self.trossos_)

        for tros in self.trossos_:
            partmot = Part(tros)
            self.transpart_.append(partmot)

    def no_es_nom_ment(self, mot: str) -> bool:
        """Return True when `mot` is NOT a noun ending in "-ment" (i.e. it may be an adverb)."""

        if mot not in self.excepcions_gen:
            return True
        else:
            return False

    def es_adverbi(self, mot: str) -> bool:
        """Return True when `mot` is an adverb in "-ment" (ends in "ment" and is not an exception noun)."""

        pos = 0
        tros = "ment"
        pos = mot.rfind(tros)
        if pos != -1:
            if pos == len(mot) - len(tros):
                if self.no_es_nom_ment(mot):
                    return True
                else:
                    return False
            else:
                return False
        else:
            return False

    def es_exc_accent(self, mot: str) -> str:
        """Return the replacement spelling for accentuation-exception words, else `mot` unchanged."""

        if mot in self.excep_acc:
            mot = self.excep_acc[mot]

        return mot

    def troba_nuclis_mot(self):
        """Syllabify every word part, filling transpart_ with Sillaba objects."""

        for i in range(len(self.trossos_)):

            self.trossos_[i] = self.es_exc_accent(self.trossos_[i])

            # Determine whether it is an adverb and pass that on to MotNuclis.
            is_adverb = self.es_adverbi(self.trossos_[i])

            mot_amb_nuclis = MotNuclis(
                mot = self.trossos_[i],
                es_adverbi = is_adverb,
            )

            mot_amb_nuclis.troba_nuclis_mot()

            sillabes, nuclis = [], []
            if not mot_amb_nuclis.empty():
                sillabes, nuclis = mot_amb_nuclis.separa_sillabes(sillabes, nuclis)
                for sil in range(len(sillabes)):
                    sillab = Sillaba(sillabes[sil])
                    sillab.grafnuc(nuclis[sil])
                    self.transpart_[i].push_back(sillab)
            else:
                # Part without any vowel nucleus: keep it as a single syllable.
                sillab = Sillaba(self.trossos_[i])
                self.transpart_[i].push_back(sillab)

    def dotze_term(self, pnum: int) -> bool:
        """Return True when part `pnum` is paroxytone (plana).

        True either because the last syllable matches one of the twelve
        terminations (vowel, vowel+s, -en/-in) or because it ends in a
        decreasing diphthong whose first vowel is not the nucleus.
        """

        dift_decr = DIFT_DECR
        voc_sola = VOC_SOLA
        voc_mes_s = VOC_MES_S
        en_in = EN_IN

        numsil = self.transpart_[pnum].size()
        darsil = self.transpart_[pnum].transsil_[numsil - 1].get_text()
        darsil = darsil.lower()  # Case-insensitive comparison.

        mida = len(darsil)

        # Last syllable of length 2 or more.
        if mida >= 2:
            last_dos = darsil[-2:]

            # Decreasing diphthong (includes gui, qui):
            # - diphthong whose first vowel IS the nucleus -> oxytone (aguda)
            # - diphthong that is not the nucleus (e.g. "preui") -> paroxytone
            for dift in dift_decr:
                es_dift_decr = last_dos == dift
                if es_dift_decr and (self.transpart_[pnum].transsil_[numsil - 1].grafnuc_ == mida - 2):
                    return False
                elif es_dift_decr:
                    return True

            # Ends in a single vowel.
            last_voc = darsil[-1:]
            if last_voc in voc_sola:
                return True

            # Last syllable ends in "s" (length still 2+): same diphthong
            # test one position earlier, then the vowel+s terminations.
            if darsil[-1:] == 's':
                if mida >= 3:
                    last_dos = darsil[-3:-1]
                    for dift in dift_decr:
                        es_dift_decr = last_dos == dift
                        if es_dift_decr and (self.transpart_[pnum].transsil_[numsil - 1].grafnuc_ == mida - 3):
                            return False
                        elif es_dift_decr:
                            return True

                last_dos = darsil[-2:]
                if last_dos in voc_mes_s:
                    return True

            # Terminations -en / -in.
            last_dos = darsil[-2:]
            if last_dos in en_in:
                return True

        # Single-character syllable that is itself a vowel.
        # NOTE(review): in the mangled original this check appears twice; the
        # second occurrence is reconstructed here at this indent so a
        # one-letter final syllable is also covered — confirm against upstream.
        last_voc = darsil[-1:]
        if last_voc in voc_sola:
            return True

        return False

    def accentua_mot(self, pnum: int):
        """Mark the tonic syllable of part `pnum` using the terminations rule."""

        numsil = self.transpart_[pnum].size()

        if self.dotze_term(pnum):
            # Ends in vowel, vowel+s, -en or -in: paroxytone (plana) ->
            # stress the penultimate syllable.
            self.transpart_[pnum].transsil_[numsil - 2].tonica()
        else:
            # Otherwise oxytone (aguda) -> stress the last syllable.
            self.transpart_[pnum].transsil_[numsil - 1].tonica()

    def einagram(self, mot: str) -> bool:
        """Return True when `mot` is a grammatical tool word (atonic)."""

        if mot not in self.einesgram_:
            return False
        else:
            return True

    def troba_accent_tonic_mot(self):
        """Locate and mark the tonic syllable of every word part."""

        vocaccent = ACCENTED_VOWEL_CHARS

        for pnum in range(len(self.trossos_)):

            if not self.transpart_[pnum]:
                # Particle without a vowel: nothing to stress.
                continue

            numsil = self.transpart_[pnum].size()
            accent_grafic = False
            # Scan syllables looking for a written accent.
            for snum in range(numsil):
                sillaba = self.transpart_[pnum].transsil_[snum].get_text()
                pos = 0
                if any(accented_vowel in sillaba for accented_vowel in vocaccent):

                    last_sil = self.transpart_[pnum].transsil_[numsil - 1].get_text()
                    accent_grafic = True

                    if last_sil == "ment":
                        # Adverbs in -ment keep the accent of the base word
                        # plus a stress on the "-ment" itself.
                        self.transpart_[pnum].transsil_[snum].tonica()
                        self.transpart_[pnum].transsil_[numsil - 1].tonica()
                    else:
                        self.transpart_[pnum].transsil_[snum].tonica()

                    break

            if not accent_grafic:

                # No written accent:
                # - a monosyllable is tonic unless it is a grammatical tool
                #   word (tonic because it is a one-syllable lexematic
                #   morpheme);
                # - with more than one syllable, study the termination,
                #   discarding a possible final hyphen first;
                # - prefixes of one or two syllables only take secondary
                #   stress when they are tonic and really act as prefixes.

                if numsil == 1:
                    sillaba = self.transpart_[pnum].transsil_[0].get_text()
                    if (self.transpart_[pnum].transsil_[0].grafnuc_ == -1):
                        # Word part without a nucleus.
                        continue
                    elif self.einagram(sillaba):
                        # Atonic tool word: leave unstressed.
                        continue
                    else:
                        # We used to check for tonic prefix vs lexeme; no
                        # longer needed.
                        self.transpart_[pnum].transsil_[0].tonica()
                else:
                    # Not a monosyllable.

                    last_sil = self.transpart_[pnum].transsil_[numsil - 1].get_text()
                    # Not a reference: we want to keep the value of last_sil.

                    if last_sil == "ment":
                        # Tonic prefixes need no special treatment here.
                        if self.no_es_nom_ment(self.trossos_[pnum]) and self.no_es_nom_ment(self.motnorm_):
                            if numsil - 1 > 1:
                                self.transpart_[pnum].pop_back()  # Remove the last syllable ("ment")
                                self.accentua_mot(pnum)  # Accentuate the remaining syllables
                                darsil = Sillaba(last_sil)  # Re-create the "ment" syllable
                                self.transpart_[pnum].push_back(darsil)  # Re-append it and make it tonic
                                self.transpart_[pnum].transsil_[numsil - 1].tonica()
                                self.transpart_[pnum].transsil_[numsil - 1].grafnuc_ = 1
                                # grafnuc_ 1 is the "e" of "ment".
                            else:
                                self.transpart_[pnum].transsil_[0].tonica()
                                self.transpart_[pnum].transsil_[numsil - 1].tonica()
                    else:
                        # NOTE(review): this else is reconstructed as attaching
                        # to `last_sil == "ment"`; nouns in -ment would then
                        # receive no stress here — confirm against upstream.
                        self.accentua_mot(pnum)

    def sillaba_accentua_mot(self):
        """Run the full pipeline: split, syllabify, then locate the tonic syllable."""

        self.parteix_mot()
        self.troba_nuclis_mot()
        self.troba_accent_tonic_mot()

    def stress_tonic(self) -> str:
        """Return the word rewritten with a written accent on each tonic nucleus.

        Parts that already contain an accented vowel are copied verbatim.
        Unaccented tonic "e"/"o" receive è/ó-ò heuristically by syllable
        position; other vowels use the ACCENT_CHANGES map.
        """

        accent_changes = ACCENT_CHANGES

        all_vowels = VOWEL_CHARS
        accented_vowels = ACCENTED_VOWEL_CHARS
        unaccented_vowels = list(set(all_vowels) - set(accented_vowels))

        original_word = ""
        stressed_word = ""

        for i in range(len(self.transpart_)):

            word = self.transpart_[i].text_

            if any(ext in word for ext in accented_vowels):
                # Already carries a written accent: keep as-is.
                stressed_word = stressed_word + word
            else:
                for j in range(self.transpart_[i].size()):
                    sil = self.transpart_[i].transsil_[j]
                    sillaba_text = sil.get_text()
                    idxgrafnucli = sil.get_grafnuc()
                    graf_nucli = sil.get_text_at_index(idxgrafnucli)
                    is_tonic = sil.es_sil_tonica()

                    if is_tonic == "sí":
                        sillaba_list = list(sillaba_text)
                        if sillaba_list[idxgrafnucli] in unaccented_vowels:
                            if sillaba_list[idxgrafnucli] == "e":
                                if j == self.transpart_[i].size() - 1:
                                    # Oxytone: "è" is almost always correct.
                                    sillaba_list[idxgrafnucli] = "è"
                                elif j == self.transpart_[i].size() - 2:
                                    # Paroxytone: "è" is almost always correct.
                                    sillaba_list[idxgrafnucli] = "è"
                                else:
                                    # Proparoxytone: "è" is almost always correct.
                                    sillaba_list[idxgrafnucli] = "è"
                            elif sillaba_list[idxgrafnucli] == "o":
                                if j == self.transpart_[i].size() - 1:
                                    # Oxytone: "ó" is almost always correct.
                                    sillaba_list[idxgrafnucli] = "ó"
                                elif j == self.transpart_[i].size() - 2:
                                    # Paroxytone: "ò" is almost always correct.
                                    sillaba_list[idxgrafnucli] = "ò"
                                else:
                                    # Proparoxytone: "ò" is almost always correct.
                                    sillaba_list[idxgrafnucli] = "ò"
                            else:
                                sillaba_list[idxgrafnucli] = accent_changes[sillaba_list[idxgrafnucli]]

                        sillaba_text = "".join(sillaba_list)

                    stressed_word = stressed_word + sillaba_text

            original_word = original_word + word

        return stressed_word

    def stress_word(self) -> str:
        """Normalize, syllabify and accentuate the word; return the stressed spelling."""

        self.motnorm_ = self.normalize_word(self.motorig_)

        self.sillaba_accentua_mot()

        self.stressed_word = self.stress_tonic()

        return self.stressed_word


class CatalanPreProcessText:
    """Pre-processes text"""

    # The preprocessing is the same for all accents in this version (variable lang is not used)
class CatalanPreProcessText:
    """Pre-processes text before phonemization.

    Splits the input on break/punctuation characters, and for every token
    that is neither a break nor already in the lexicon, rewrites it with an
    explicit written accent via Transcripcio so that G2P stresses it
    correctly.
    """

    def __init__(self, lookup_phonemes, settings_values: dict, lang: str):
        # lookup_phonemes: callable returning phonemes or None when the token
        # is not in the lexicon.
        # settings_values: dict with the break/punctuation sets used to split.
        # lang: accent code; unused in this version (same preprocessing for
        # all accents).
        self.lookup_phonemes = lookup_phonemes
        self.settings_values = settings_values
        self.lang = lang

    def __call__(self, text: str) -> str:
        """Return `text` with out-of-lexicon tokens rewritten with explicit stress."""

        # All separators we must split on while keeping them in the output.
        breaks = [" "]
        breaks = breaks + list(self.settings_values["major_breaks"])
        breaks = breaks + list(self.settings_values["minor_breaks"])
        breaks = breaks + list(self.settings_values["word_breaks"])
        breaks = breaks + list(self.settings_values["begin_punctuations"])
        breaks = breaks + list(self.settings_values["end_punctuations"])

        tokens = [text.strip()]
        for char_break in breaks:
            # BUGFIX: the previous pattern f"(\{char_break})" only escaped the
            # first character, so multi-character breaks such as "..." became
            # "\." plus two wildcards. re.escape handles every break token,
            # whatever its length.
            pattern = f"({re.escape(char_break)})"
            tokens = [re.split(pattern, item) for item in tokens]
            # Flatten and drop the empty strings re.split produces.
            tokens = [item for sublist in tokens for item in sublist if item != ""]

        preprocessed_tokens = []
        for token in tokens:

            try:
                if token in breaks:
                    # Separators pass through untouched.
                    processed_token = token
                else:
                    is_in_lexicon = self.lookup_phonemes(token) is not None
                    if is_in_lexicon:
                        # Lexicon entries already carry stress information.
                        processed_token = token
                    else:
                        tr = Transcripcio(token)
                        processed_token = tr.stress_word()
            except Exception:
                # Best-effort: keep the original token when stressing fails.
                processed_token = token
                _LOGGER.debug(f"Unable to stress token {token}.")

            preprocessed_tokens.append(processed_token)

        processed_text = "".join(preprocessed_tokens)

        _LOGGER.debug(f"{text} preprocessed obtaining: {processed_text}")

        return processed_text


# Post-Process constants
# Only defined for "ca", "ca-ce" accent.
+# For the rest of accents, not post-processing is done + +PHONEME_VOWELS = ["'a", "'ɛ", "'ɔ", "'e", "'i", "'o", "'u", "ə", "i", "u"] +PHONEME_STRESSED_VOWELS = ["'a", "'ɛ", "'ɔ", "'e", "'i", "'o", "'u"] +PHONEME_HIGH_VOWELS = ["i", "u", "'i", "'u"] +PHONEME_NEUTRAL_VOWELS = ["ə"] + +# Post-Process functions and classes + +from gruut.text_processor import DATA_PROP, WordNode, BreakWordNode, BreakNode, PunctuationWordNode +from gruut.utils import sliding_window + +def identify_lang(nodes: typing.List[typing.Union[WordNode, BreakWordNode, BreakNode, PunctuationWordNode]]) -> str: + + from gruut.text_processor import WordNode + + try: + for node in nodes: + if isinstance(node, WordNode): + lang = node.lang + break + except: + lang = "ca" + + return lang + +def phoneme_is_vowel(phoneme: str) -> bool: + return phoneme in PHONEME_VOWELS + +def phoneme_is_stressed_vowel(phoneme: str) -> bool: + return phoneme in PHONEME_STRESSED_VOWELS + +def phoneme_is_unstressed_vowel(phoneme: str) -> bool: + return phoneme_is_vowel(phoneme) and not phoneme_is_stressed_vowel(phoneme) + +def phoneme_is_high_vowel(phoneme: str) -> bool: + return phoneme in PHONEME_HIGH_VOWELS + +def phoneme_is_high_stressed_vowel(phoneme: str) -> bool: + return phoneme_is_high_vowel(phoneme) and phoneme_is_stressed_vowel(phoneme) +def phoneme_is_high_unstressed_vowel(phoneme: str) -> bool: + return phoneme_is_high_vowel(phoneme) and phoneme_is_unstressed_vowel(phoneme) + +def phoneme_is_neutral_vowel(phoneme: str) -> bool: + return phoneme in PHONEME_NEUTRAL_VOWELS + +def fusion_if_needed(node_1: WordNode, node_2: WordNode, lang: str): + + if lang in ["ca", "ca-ce"]: + if len(node_1.phonemes) == 0 or len(node_2.phonemes) == 0: + return + else: + + last_phoneme_word_1 = node_1.phonemes[-1] + first_phoneme_word_2 = node_2.phonemes[0] + + # Case 1: high unstressed vowel + stressed vowel of the same timbre + if phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and 
phoneme_is_high_stressed_vowel(first_phoneme_word_2) \ + and last_phoneme_word_1 == first_phoneme_word_2.replace("'", ""): + # Case [i] + [i'] = [i'] or [u] + [u'] = [u'] + node_1.phonemes.pop() + _LOGGER.debug(f"FUSION CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + # Case 2: high unstressed vowel + high unstressed vowel of the same timbre + elif phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and phoneme_is_high_unstressed_vowel(first_phoneme_word_2) \ + and last_phoneme_word_1 == first_phoneme_word_2: + # Case [i] + [i] = [i] or [u] + [u] = [u] + node_1.phonemes.pop() + _LOGGER.debug(f"FUSION CASE 2 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + # Case 3: neutral vowel + neutral vowel (except if any of the vowels is the proposition "a") + elif phoneme_is_neutral_vowel(last_phoneme_word_1) and phoneme_is_neutral_vowel(first_phoneme_word_2) \ + and node_1.text != "a" and node_2.text != "a": + node_1.phonemes.pop() + _LOGGER.debug(f"FUSION CASE 3 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + else: + pass + +def elision_if_needed(node_1: WordNode, node_2: WordNode, lang: str): + + if lang in ["ca", "ca-ce"]: + + if len(node_1.phonemes) == 0 or len(node_2.phonemes) == 0: + return + else: + + last_phoneme_word_1 = node_1.phonemes[-1] + first_phoneme_word_2 = node_2.phonemes[0] + + # Case 1: stressed vowel ['a], ['ɛ], ['e], ['o] or ['ɔ] + neutral vowel (except if any of the vowels is the proposition "a") + if (phoneme_is_stressed_vowel(last_phoneme_word_1) and not phoneme_is_high_vowel(last_phoneme_word_1)) \ + and (phoneme_is_neutral_vowel(first_phoneme_word_2) and node_2.text != "a"): + node_2.phonemes.pop(0) + _LOGGER.debug(f"ELISION CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + # Case 2: neutral vowel + stressed vowel ['a], ['ɛ], ['e], ['o] or ['ɔ] + elif phoneme_is_neutral_vowel(last_phoneme_word_1) \ + and 
(phoneme_is_stressed_vowel(first_phoneme_word_2) and not phoneme_is_high_vowel(first_phoneme_word_2)): + node_1.phonemes.pop() + _LOGGER.debug(f"ELISION CASE 2 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + else: + pass + +def diphthong_if_needed(node_1: WordNode, node_2: WordNode, lang: str): + + if lang in ["ca", "ca-ce"]: + + if len(node_1.phonemes) == 0 or len(node_2.phonemes) == 0: + return + else: + + last_phoneme_word_1 = node_1.phonemes[-1] + first_phoneme_word_2 = node_2.phonemes[0] + + # Case 1: stressed vowel + high unstressed vowel + if (phoneme_is_stressed_vowel(last_phoneme_word_1) and not phoneme_is_high_vowel(last_phoneme_word_1)) \ + and phoneme_is_high_unstressed_vowel(first_phoneme_word_2): + if first_phoneme_word_2 == "i": + # Case [stressed vowel] + [i] = [stressed vowel + j], stressed vowel not 'i or 'u + node_2.phonemes[0] = "j" + _LOGGER.debug(f"DIPTHONG CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + elif first_phoneme_word_2 == "u": + # Case [stressed vowel] + [u] = [stressed vowel + uw], stressed vowel not 'i or 'u + node_2.phonemes[0] = "uw" + _LOGGER.debug(f"DIPTHONG CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + # Case 2: high unstressed vowel + stressed vowel + elif phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and phoneme_is_stressed_vowel(first_phoneme_word_2): + if last_phoneme_word_1 == "i" and first_phoneme_word_2 not in ["'i"] and node_1.text in ["hi", "ho", "i"]: + # Case [i] + [stressed] = [y + stressed vowel], i only from "hi", "ho" or "i" + node_1.phonemes[-1] = "y" + _LOGGER.debug(f"DIPTHONG CASE 2 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + elif last_phoneme_word_1 == "u" and first_phoneme_word_2 not in ["'u"] and node_1.text in ["hi", "ho", "i"]: + # Case [u] + [stressed] = [u + stressed vowel], i only from "hi", "ho" or "i" + pass + + # Case 3: unstressed vowel + high unstressed vowel + elif 
phoneme_is_neutral_vowel(last_phoneme_word_1) and phoneme_is_high_unstressed_vowel(first_phoneme_word_2): + if first_phoneme_word_2 == "i": + # Case [neutral vowel] + [i] = [neutral vowel + j] + node_2.phonemes[0] = "j" + _LOGGER.debug(f"DIPTHONG CASE 3 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + elif first_phoneme_word_2 == "u": + # Case [neutral vowel] + [u] = [neutral vowel + uw] + node_2.phonemes[0] = "uw" + _LOGGER.debug(f"DIPTHONG CASE 3 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}") + + # Case 4: unstressed vowel + high unstressed vowel + elif phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and phoneme_is_neutral_vowel(first_phoneme_word_2): + pass + else: + pass + +def ca_post_process_sentence( + graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings +): + + # Create a list of relevant nodes + nodes = [] + for dfs_node in nx.dfs_preorder_nodes(graph, sent_node.node): + + node = graph.nodes[dfs_node][DATA_PROP] + + if not graph.out_degree(dfs_node) == 0: + # Only leave + continue + + node = graph.nodes[dfs_node][DATA_PROP] + if isinstance(node, WordNode): + nodes.append(typing.cast(WordNode, node)) + if isinstance(node, BreakWordNode): + nodes.append(typing.cast(BreakWordNode, node)) + if isinstance(node, BreakNode): + nodes.append(typing.cast(BreakNode, node)) + if isinstance(node, PunctuationWordNode): + nodes.append(typing.cast(PunctuationWordNode, node)) + + lang = identify_lang(nodes) + + # HACK + # Training corpora includes an invalid sequence of phonemes: l ʎ l + # We fix that here, in the next iteration will be properly solved + phonemes_to_fix = "l ʎ l" + fixed_phonemes = "l l" + for node in nodes: + + if node is None: + continue + + if isinstance(node, WordNode): + if not (node.text and node.phonemes): + continue + phonemes_text = " ".join(node.phonemes) + if phonemes_to_fix in phonemes_text: + phonemes_text = phonemes_text.replace(phonemes_to_fix, fixed_phonemes) + 
node.phonemes = phonemes_text.split(" ") + _LOGGER.debug(f"FIX: phoneme sequence '{phonemes_to_fix}' fixed at {node.text}. Fixed transcription: {node.phonemes}") + + # Create a list of contiguous word nodes + contiguous_word_nodes = [] + for node_1, node_2 in sliding_window(nodes, 2): + + if node_1 is None or node_2 is None: + continue + + if isinstance(node_1, WordNode) and isinstance(node_2, WordNode): + if not (node_1.text and node_1.phonemes and node_2.text and node_2.phonemes): + continue + contiguous_word_nodes.append([node_1, node_2]) + + for (node_1, node_2) in contiguous_word_nodes: + + diphthong_if_needed(node_1, node_2, lang) + fusion_if_needed(node_1, node_2, lang) + elision_if_needed(node_1, node_2, lang) + + +# Settings + +def get_ca_settings(lang_dir=None, **settings_args) -> TextProcessorSettings: + + """Create settings for Catalan""" + + try: + lang = str(lang_dir).split("/")[-1] + main_lang, lang_version = lang.split("-") + lang = f"{main_lang.lower()}-{lang_version.upper()}" + except: + lang = "ca" + + lookup_phonemes = settings_args["lookup_phonemes"] + + settings_values = { + "major_breaks": {".", "?", "!"}, + "minor_breaks": {",", ";", ":", "..."}, + "word_breaks": {"_"}, + "begin_punctuations": {'"', "“", "«", "[", "(", "<", "¡", "¿"}, + "end_punctuations": {'"', "”", "»", "]", ")", ">", "!", "?"}, + "default_currency": "EUR", + "default_date_format": InterpretAsFormat.DATE_DMY, + "replacements": [ + ("’", "'"), # normalize apostrophe + ("'", ""), # remove orthographic apostrophe + ("-", ""), + ("l·l", "l"), + ], + } + + settings_args = { + **settings_values, + "pre_process_text": CatalanPreProcessText(lookup_phonemes, settings_values, lang), + "post_process_sentence": ca_post_process_sentence, + **settings_args, + } + + return TextProcessorSettings(lang="ca", **settings_args) + +# ----------------------------------------------------------------------------- class DelayedGraphemesToPhonemes: """Grapheme to phoneme guesser that loads on first 
use""" @@ -897,4 +2328,4 @@ def __call__( self.phonemizer = SqlitePhonemizer(db_conn=db_conn, **self.phonemizer_args) assert self.phonemizer is not None - return self.phonemizer(word, role=role, do_transforms=do_transforms) + return self.phonemizer(word, role=role, do_transforms=do_transforms) \ No newline at end of file diff --git a/setup.py b/setup.py index b24952b..78c4439 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ # Create language-specific extras for lang in [ "ar", + "ca", "cs", "de", "es",