From 916d32738f21b0e39ca884aa373f1bc893e01850 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:43:56 +0200
Subject: [PATCH 1/4] tokenization: add tokenize_file function to the api

works in the same manner as the clean_file function
---
 preprocessor/api.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/preprocessor/api.py b/preprocessor/api.py
index c2f1053..e8e4de5 100755
--- a/preprocessor/api.py
+++ b/preprocessor/api.py
@@ -94,3 +94,26 @@ def clean_file(input_file_path, add_timestamp=False, *options):
     output_path = write_to_output_file(input_file_path, cleaned_content, add_timestamp)
     print("Saved the cleaned tweets to:" + output_path)
     return output_path
+
+def tokenize_file(input_file_path, add_timestamp=False, *options):
+    """Tokenize the given input file (JSON or .txt) if it can be found at the given path.
+    Prints the output file path to stdout and returns it.
+    :param input_file_path: Absolute path to the tweets. Can be either in JSON or .txt format.
+    :param add_timestamp: If True, adds the current timestamp to the output filename.
+    :return: output file path: str. Returns the file path of the tokenized file.
+    :rtype: str
+    :raises IOError: if the input file is empty
+    Usage::
+      >>> input_file_name = "sample.json"
+      >>> p.tokenize_file(input_file_name, p.OPT.URL, p.OPT.MENTION)
+    """
+    file_contents = get_file_contents(input_file_path)
+    if not file_contents or len(file_contents) == 0:
+        raise IOError("Empty file given at path:" + input_file_path)
+
+    tokenized_content = []
+    for line in file_contents:
+        tokenized_content.append(tokenize(line))
+    output_path = write_to_output_file(input_file_path, tokenized_content, add_timestamp)
+    print("Saved the tokenized tweets to:" + output_path)
+    return output_path
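
A quick usage sketch of the function this patch adds (the input path below is
hypothetical, and the $URL$/$MENTION$ placeholders assume the library's default
tokenize behaviour):

    import preprocessor as p

    # choose which entities should be replaced by tokens, then process the file;
    # the tokenized copy is written next to the input and its path is returned
    p.set_options(p.OPT.URL, p.OPT.MENTION)
    output_path = p.tokenize_file("tweets.json", True)  # True appends a timestamp
    print(output_path)
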
From ed9dc5c0df632c50040907ea6bce05f7edf849d1 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:48:28 +0200
Subject: [PATCH 2/4] tests: add unittest for file tokenization

involved splitting the _get_tokenized_test_data_for_option function
tested with python -m unittest discover tests/
---
 tests/artifacts/tokenize_file_sample.json |  6 +++
 tests/artifacts/tokenize_file_sample.txt  |  4 ++
 tests/test_api.py                         | 45 +++++++++++++++++++++--
 3 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 tests/artifacts/tokenize_file_sample.json
 create mode 100644 tests/artifacts/tokenize_file_sample.txt

diff --git a/tests/artifacts/tokenize_file_sample.json b/tests/artifacts/tokenize_file_sample.json
new file mode 100644
index 0000000..358330a
--- /dev/null
+++ b/tests/artifacts/tokenize_file_sample.json
@@ -0,0 +1,6 @@
+[
+  "Preprocessor now supports files. https://github.com/s/preprocessor",
+  "#preprocessing is a cruical part of @ML projects.",
+  "@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl",
+  "#emoji #smiley 😀😍 https://emojipedia.org"
+]
diff --git a/tests/artifacts/tokenize_file_sample.txt b/tests/artifacts/tokenize_file_sample.txt
new file mode 100644
index 0000000..fca71a8
--- /dev/null
+++ b/tests/artifacts/tokenize_file_sample.txt
@@ -0,0 +1,4 @@
+Preprocessor now supports files. https://github.com/s/preprocessor
+#preprocessing is a cruical part of @ML projects.
+@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl
+#emoji #smiley 😀😍 https://emojipedia.org
diff --git a/tests/test_api.py b/tests/test_api.py
index 43c8786..dd2d70a 100755
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -101,7 +101,7 @@ def test_clean_file(self):
         self.assertIsNotNone(raw_data)
 
         # Test all option
-        check_against = self._get_test_data_for_option(raw_data)
+        check_against = self._get_cleaned_test_data_for_option(raw_data)
         self._test_clean_file(full_input_path, check_against)
 
         # Test individual options
@@ -115,9 +115,36 @@ def test_clean_file(self):
             p.OPT.NUMBER
         ]
         for opt in options:
-            check_against = self._get_test_data_for_option(raw_data, opt)
+            check_against = self._get_cleaned_test_data_for_option(raw_data, opt)
             self._test_clean_file(full_input_path, check_against, opt)
 
+    def test_tokenize_file(self):
+        current_dir = os.path.dirname(__file__)
+        artifacts_dir = os.path.join(current_dir, self._artifacts_dir_name)
+        extensions = [p.InputFileType.json, p.InputFileType.text]
+        for ext in extensions:
+            full_input_path = os.path.join(artifacts_dir, "tokenize_file_sample" + ext)
+            raw_data = p.get_file_contents(full_input_path)
+            self.assertIsNotNone(raw_data)
+
+            # Test all option
+            check_against = self._get_tokenized_test_data_for_option(raw_data)
+            self._test_tokenize_file(full_input_path, check_against)
+
+            # Test individual options
+            options = [
+                p.OPT.URL,
+                p.OPT.MENTION,
+                p.OPT.HASHTAG,
+                p.OPT.RESERVED,
+                p.OPT.EMOJI,
+                p.OPT.SMILEY,
+                p.OPT.NUMBER
+            ]
+            for opt in options:
+                check_against = self._get_tokenized_test_data_for_option(raw_data, opt)
+                self._test_tokenize_file(full_input_path, check_against, opt)
+
+
     def test_escape_chars(self):
         p.set_options(p.OPT.ESCAPE_CHAR)
         input_str = u"\x01\x02\x03\x04I \x05\x06\x07\x10\x11have \x12\x13\x14" \
@@ -130,12 +157,24 @@ def _test_clean_file(self, full_input_path, check_against, *options):
         self.assertTrue(os.path.exists(output_path))
         clean_content = p.get_file_contents(output_path)
         p.are_lists_equal(clean_content, check_against)
+
+    def _test_tokenize_file(self, full_input_path, check_against, *options):
+        output_path = p.tokenize_file(full_input_path, True, options)
+        self.assertTrue(os.path.exists(output_path))
+        tokenized_content = p.get_file_contents(output_path)
+        p.are_lists_equal(tokenized_content, check_against)
 
-    def _get_test_data_for_option(self, raw_data, *options):
+    def _get_cleaned_test_data_for_option(self, raw_data, *options):
         clean_data = []
         for d in raw_data:
             clean_data.append(p.clean(d))
         return clean_data
+
+    def _get_tokenized_test_data_for_option(self, raw_data, *options):
+        tokenized_data = []
+        for d in raw_data:
+            tokenized_data.append(p.tokenize(d))
+        return tokenized_data
 
 
 if __name__ == '__main__':
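
For reference, the expected data built by the new test helpers is just p.tokenize
applied line by line. With, say, the hashtag and mention options set, the second
sample tweet would come back roughly as follows (the $HASHTAG$/$MENTION$ token
strings assume the library defaults):

    import preprocessor as p

    p.set_options(p.OPT.HASHTAG, p.OPT.MENTION)
    p.tokenize("#preprocessing is a cruical part of @ML projects.")
    # -> '$HASHTAG$ is a cruical part of $MENTION$ projects.'
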
From 7af30e072549a44448a946dd16381b85a2fe35f0 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:49:25 +0200
Subject: [PATCH 3/4] misc: update .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 93d0f0c..99a0d5f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
 tests/artifacts/*
 !tests/artifacts/clean_file_sample.json
 !tests/artifacts/clean_file_sample.txt
+!tests/artifacts/tokenize_file_sample.json
+!tests/artifacts/tokenize_file_sample.txt
 .DS_Store
 .python-version
 __pycache__/

From c1ff9608dbf2cc576a78da28b86f8b2c36106946 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:49:53 +0200
Subject: [PATCH 4/4] misc: add tokenize_file to __init__.py

---
 preprocessor/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/preprocessor/__init__.py b/preprocessor/__init__.py
index b8b146c..1f9b669 100755
--- a/preprocessor/__init__.py
+++ b/preprocessor/__init__.py
@@ -1,4 +1,4 @@
-from .api import clean, tokenize, parse, set_options, clean_file
+from .api import clean, tokenize, parse, set_options, clean_file, tokenize_file
 from .defines import Options as OPT
 from .defines import InputFileType, Defines
 from .utils import get_worker_methods,\
@@ -28,4 +28,4 @@
     write_to_text_file,
     generate_random_file_name,
     generate_random_alphanumeric_string,
-    are_lists_equal]
\ No newline at end of file
+    are_lists_equal]
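
Taken together, the series gives files the same clean/tokenize split that already
exists for single tweets: clean_file strips the matched entities, while the new
tokenize_file replaces them with placeholder tokens. A rough sketch of the
difference (paths are hypothetical; token strings assume the library defaults):

    import preprocessor as p

    p.set_options(p.OPT.URL)
    p.clean("Preprocessor now supports files. https://github.com/s/preprocessor")
    # -> 'Preprocessor now supports files.'
    p.tokenize("Preprocessor now supports files. https://github.com/s/preprocessor")
    # -> 'Preprocessor now supports files. $URL$'

    # the file helpers apply the same transforms line by line and, after
    # PATCH 4/4, both are importable straight from the package namespace
    p.clean_file("tweets.txt")
    p.tokenize_file("tweets.txt")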