From 617c8328db8b9b980c5688d5d92d65850d725459 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 14 Aug 2024 10:43:36 +0300 Subject: [PATCH] add more instructions and tests for opus_read --- .../downloading_and_selecting_data.md | 7 +++++- tests/test_opusfilter.py | 22 ++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/docs/functions/downloading_and_selecting_data.md b/docs/functions/downloading_and_selecting_data.md index cad15ae..95d35a0 100644 --- a/docs/functions/downloading_and_selecting_data.md +++ b/docs/functions/downloading_and_selecting_data.md @@ -11,11 +11,16 @@ Parameters: * `source_language`: language code for the source language * `target_language`: language code for the target language * `release`: version of the corpus in OPUS -* `preprocessing`: `raw` for untokenized and `xml` for tokenized segments +* `preprocessing`: `moses` or `raw` for untokenized segments, `xml` for tokenized segments * `src_output`: output file for source language * `tgt_output`: output file for target language * `suppress_prompts`: `false` (default) prompts user to confirm before download, `true` to download without prompting +The `moses` preprocessing type (available with `OpusTools` version +1.6.2 and above) is recommended for corpora where it is +available. The output is equivalent to `raw`, but in some cases it can +significantly reduce the amount of data downloaded in the process. + ## concatenate Concatenate two or more text files. 
diff --git a/tests/test_opusfilter.py b/tests/test_opusfilter.py index 200d2bf..993efb3 100644 --- a/tests/test_opusfilter.py +++ b/tests/test_opusfilter.py @@ -223,6 +223,25 @@ def test_create_output_dir_if_it_does_not_exist(self, mocked_input): shutil.rmtree('test_creating_dir') +class TestOpusRead(unittest.TestCase): + + def test_preprocessing(self): + outfiles = ['RF1_sents.en', 'RF1_sents.sv'] + for pre_type in ['xml', 'raw', 'moses', 'parsed']: + with tempfile.TemporaryDirectory() as tempdir: + opusfilter = OpusFilter( + {'common': {'output_directory': tempdir}, + 'steps': [ + {'type': 'opus_read', 'parameters': { + 'corpus_name': 'RF', 'source_language': 'en', 'target_language': 'sv', 'release': 'latest', + 'preprocessing': pre_type, 'src_output': outfiles[0], 'tgt_output': outfiles[1], + 'suppress_prompts': True}} + ]}) + opusfilter.execute_steps() + for outfile in outfiles: + self.assertTrue(os.path.isfile(os.path.join(tempdir, outfile))) + + class TestExtraKeyErrors(unittest.TestCase): def test_read_from_opus(self): @@ -230,7 +249,8 @@ def test_read_from_opus(self): {'steps': [ {'type': 'opus_read', 'parameters': { 'corpus_name': 'RF', 'source_language': 'en', 'target_language': 'sv', 'relase': 'latest', - 'preprocessing': 'xml', 'src_output': 'RF1_sents.en', 'tgt_output': 'RF1_sents.sv'}} + 'preprocessing': 'xml', 'src_output': 'RF1_sents.en', 'tgt_output': 'RF1_sents.sv', + 'suppress_prompts': True}} ]}) with self.assertRaises(ConfigurationError): opusfilter.execute_steps()