Skip to content

Commit

Permalink
add more instructions and tests for opus_read
Browse files Browse the repository at this point in the history
  • Loading branch information
svirpioj committed Aug 14, 2024
1 parent 1128b75 commit 617c832
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
7 changes: 6 additions & 1 deletion docs/functions/downloading_and_selecting_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,16 @@ Parameters:
* `source_language`: language code for the source language
* `target_language`: language code for the target language
* `release`: version of the corpus in OPUS
* `preprocessing`: `raw` for untokenized and `xml` for tokenized segments
* `preprocessing`: `moses` or `raw` for untokenized and `xml` for tokenized segments
* `src_output`: output file for source language
* `tgt_output`: output file for target language
* `suppress_prompts`: `false` (default) prompts the user to confirm before downloading; `true` downloads without prompting

The `moses` preprocessing type (available with `OpusTools` version
1.6.2 and above) is recommended for those corpora for which it
exists. The output is equivalent to `raw`, but in some cases it can
significantly reduce the amount of data downloaded in the process.

## concatenate

Concatenate two or more text files.
Expand Down
22 changes: 21 additions & 1 deletion tests/test_opusfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,14 +223,34 @@ def test_create_output_dir_if_it_does_not_exist(self, mocked_input):
shutil.rmtree('test_creating_dir')


class TestOpusRead(unittest.TestCase):
    """Tests for the ``opus_read`` step across preprocessing types."""

    def test_preprocessing(self):
        """Check that ``opus_read`` produces both output files for each type.

        Every preprocessing type is run in its own temporary output
        directory and inside its own sub-test, so a failure reports which
        type broke and the remaining types are still exercised.
        """
        outfiles = ['RF1_sents.en', 'RF1_sents.sv']
        for pre_type in ['xml', 'raw', 'moses', 'parsed']:
            with self.subTest(preprocessing=pre_type):
                with tempfile.TemporaryDirectory() as tempdir:
                    opusfilter = OpusFilter(
                        {'common': {'output_directory': tempdir},
                         'steps': [
                             {'type': 'opus_read', 'parameters': {
                                 'corpus_name': 'RF', 'source_language': 'en',
                                 'target_language': 'sv', 'release': 'latest',
                                 'preprocessing': pre_type,
                                 'src_output': outfiles[0], 'tgt_output': outfiles[1],
                                 # Skip the interactive confirmation so the test can run unattended.
                                 'suppress_prompts': True}}
                         ]})
                    opusfilter.execute_steps()
                    # The check must happen while the temporary directory still exists.
                    for outfile in outfiles:
                        path = os.path.join(tempdir, outfile)
                        self.assertTrue(
                            os.path.isfile(path),
                            msg='missing output file %s for preprocessing %r' % (outfile, pre_type))


class TestExtraKeyErrors(unittest.TestCase):

def test_read_from_opus(self):
# NOTE: the 'relase' key below (not 'release') looks intentional -- this test
# expects execute_steps() to raise ConfigurationError, so do not "fix" the spelling.
opusfilter = OpusFilter(
{'steps': [
{'type': 'opus_read', 'parameters': {
'corpus_name': 'RF', 'source_language': 'en', 'target_language': 'sv', 'relase': 'latest',
'preprocessing': 'xml', 'src_output': 'RF1_sents.en', 'tgt_output': 'RF1_sents.sv'}}
'preprocessing': 'xml', 'src_output': 'RF1_sents.en', 'tgt_output': 'RF1_sents.sv',
'suppress_prompts': True}}
]})
with self.assertRaises(ConfigurationError):
opusfilter.execute_steps()
Expand Down

0 comments on commit 617c832

Please sign in to comment.