Skip to content

Commit

Permalink
prepare release 0.8.1
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Sep 1, 2022
1 parent a92226d commit 5b5ee9d
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 14 deletions.
8 changes: 8 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@ History
=======


0.8.1
-----

* better rules for English and German
* inconsistencies fixed for cy, de, en, ga, sv (#16)
* docs: added language detection and citation info


0.8.0
-----

Expand Down
6 changes: 3 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ Code Language Words (10³) Acc. Comments
``es`` Spanish 720 0.94 on UD ES-GSD
``et`` Estonian 133 low coverage
``fa`` Persian 10 low coverage, potential issues
``fi`` Finnish 2,106 `here <https://github.com/aajanki/finnish-pos-accuracy>`_ alternatives: `voikko <https://voikko.puimula.org/python.html>`_ or `NLP list <https://blogs.helsinki.fi/language-technology/hi-nlp/morphology/>`_
``fi`` Finnish 2,106 evaluation and alternatives: see `this benchmark <https://github.com/aajanki/finnish-pos-accuracy>`_
``fr`` French 217 0.94 on UD FR-GSD
``ga`` Irish 383
``gd`` Gaelic 48
Expand Down Expand Up @@ -257,8 +257,8 @@ Orders of magnitude given for reference only, measured on an old laptop to give
Installing the most recent Python version can improve speed.


Optional pre-compilation with ``mypyc``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Optional pre-compilation with `mypyc <https://github.com/mypyc/mypyc>`_
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

1. ``pip3 install mypy``
2. clone or download the source code from the repository
Expand Down
2 changes: 1 addition & 1 deletion simplemma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
__author__ = "Adrien Barbaresi"
__email__ = "[email protected]"
__license__ = "MIT"
__version__ = "0.8.0"
__version__ = "0.8.1"


from .simplemma import lemmatize, lemma_iterator, text_lemmatizer, is_known
Expand Down
12 changes: 11 additions & 1 deletion tests/test_langdetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,14 @@ def test_detection():
"Dieser Satz ist auf Deutsch.", lang=("de", "en"), extensive=True
)
assert results[0][0] == "de"
assert in_target_language("Diese Wörter", lang=("de", "en")) == 1
assert lang_detector(
'"Moderní studie narazily na několik tajemství." Extracted from Wikipedia.',
lang=("cs", "sk"),
) == [("cs", 0.625), ("unk", 0.375), ("sk", 0.125)]
# target language
assert (
in_target_language(
"opera post physica posita (τὰ μετὰ τὰ φυσικά)", lang=("la",)
)
== 0.5
)
10 changes: 5 additions & 5 deletions tests/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@ def test_apply_de():
assert apply_de("Achterls") == "Achterl"
assert apply_de("Inspekteurinnen") == "Inspekteurin"
assert apply_de("Zwiebelschneider") == "Zwiebelschneider"
#assert apply_de("Zwiebelschneidern") == "Zwiebelschneider"
# assert apply_de("Zwiebelschneidern") == "Zwiebelschneider"
assert apply_de("Facetten") == "Facette"
assert apply_de("Kazakhstans") == "Kazakhstan"
assert apply_de("Hämatome") == "Hämatom"
assert apply_de("Hämatomen") == "Hämatom"
assert apply_de("Hämatoms") == "Hämatom"
assert apply_de("Ökonomen") == "Ökonom"
#assert apply_de("Theologien") == "Theologie"
#assert apply_de("Zeitschriftenmarken", greedy=True) == "Zeitschriftenmarke"
#assert apply_de("Gesundheitsfreaks", greedy=True) == "Gesundheitsfreak"
# assert apply_de("Theologien") == "Theologie"
# assert apply_de("Zeitschriftenmarken", greedy=True) == "Zeitschriftenmarke"
# assert apply_de("Gesundheitsfreaks", greedy=True) == "Gesundheitsfreak"
# adjectives
assert apply_de("großartiges") == "großartig"
assert apply_de("achtsame") == "achtsam"
Expand All @@ -46,7 +46,7 @@ def test_apply_de():
# Gendersprache normalization
assert apply_de("ZuschauerInnen") == "Zuschauer:innen"
assert apply_de("Zuschauer*innen") == "Zuschauer:innen"
assert apply_de('Zuschauer_innen') == 'Zuschauer:innen'
assert apply_de("Zuschauer_innen") == "Zuschauer:innen"


def test_apply_en():
Expand Down
8 changes: 4 additions & 4 deletions tests/test_simplemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,17 +401,17 @@ def test_subwords():
simplemma.lemmatize("Kapuzenpullis", lang="de", greedy=True) == "Kapuzenpulli"
)
assert simplemma.lemmatize("Pharmagrößen", lang="de", greedy=True) == "Pharmagröße"
assert simplemma.lemmatize('beständigsten', lang='de', greedy=True) == 'beständig'
#assert simplemma.lemmatize('zweitstärkster', lang='de', greedy=True) == 'zweitstärkste'
assert simplemma.lemmatize("beständigsten", lang="de", greedy=True) == "beständig"
# assert simplemma.lemmatize('zweitstärkster', lang='de', greedy=True) == 'zweitstärkste'
# assert simplemma.lemmatize('Abholservices', lang='de', greedy=True) == 'Abholservice'
# assert simplemma.lemmatize('Funktionärsebene', lang='de', greedy=True) == 'Funktionärsebene'
# assert simplemma.lemmatize('strafbewehrte', lang='de', greedy=True) == 'strafbewehrt'
# assert simplemma.lemmatize('fälschungssicheren', lang='de', greedy=True) == 'fälschungssicher'
# assert simplemma.lemmatize('Spargelstangen', lang='de', greedy=True) == 'Spargelstange'
#assert (
# assert (
# simplemma.lemmatize("Bürgerschaftsabgeordneter", lang="de", greedy=True)
# == "Bürgerschaftsabgeordnete"
#)
# )


def test_tokenizer():
Expand Down

0 comments on commit 5b5ee9d

Please sign in to comment.