Skip to content

Commit

Permalink
fix: language detector proportion_in_each_language giving results higher than 1
Browse files Browse the repository at this point in the history

(This issue had already been fixed, but the bug was accidentally reintroduced.)
  • Loading branch information
juanjoDiaz committed Sep 17, 2024
1 parent e1fb6a0 commit 0071258
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
20 changes: 12 additions & 8 deletions simplemma/language_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,14 +189,18 @@ def proportion_in_target_languages(
Returns:
float: The proportion of text in the target language(s).
"""
return sum(
percentage
for (
lang_code,
percentage,
) in self.proportion_in_each_language(text).items()
if lang_code != "unk"
)
tokens = self._token_sampler.sample_text(text)
if len(tokens) == 0:
return 0

in_target = 0
for token in tokens:
for lang_code in self._lang:
candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
if candidate is not None:
in_target += 1
break
return in_target / len(tokens)

def main_language(
self,
Expand Down
8 changes: 8 additions & 0 deletions tests/test_language_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,14 @@ def test_in_target_language() -> None:
== 1.0
)

langs = ("en", "de")
text = "It was a true gift"
assert (
LanguageDetector(lang=langs).proportion_in_target_languages(text)
== in_target_language(text, lang=langs)
== 1.0
)


def test_main_language():
text = "Dieser Satz ist auf Deutsch."
Expand Down

0 comments on commit 0071258

Please sign in to comment.