From 916d32738f21b0e39ca884aa373f1bc893e01850 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:43:56 +0200
Subject: [PATCH 1/4] tokenization: add tokenize_file function to the api

works in the same manner as the clean_file function
---
 preprocessor/api.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/preprocessor/api.py b/preprocessor/api.py
index c2f1053..e8e4de5 100755
--- a/preprocessor/api.py
+++ b/preprocessor/api.py
@@ -94,3 +94,26 @@ def clean_file(input_file_path, add_timestamp=False, *options):
     output_path = write_to_output_file(input_file_path, cleaned_content, add_timestamp)
     print("Saved the cleaned tweets to:" + output_path)
     return output_path
+
+def tokenize_file(input_file_path, add_timestamp=False, *options):
+    """Tokenize the given input file (JSON or .txt) if it can be found at the given path.
+    Prints the output file path to stdout and returns it.
+    :param input_file_path: Absolute path to the tweets. Can be either in JSON or .txt format.
+    :param add_timestamp: If True, adds the current timestamp to the output filename.
+    :return: output file path: str. Returns the file path of the tokenized file.
+    :rtype: str
+    :raises IOError: if the input file is empty
+    Usage::
+      >>> input_file_name = "sample.json"
+      >>> p.tokenize_file(input_file_name, p.OPT.URL, p.OPT.MENTION)
+    """
+    file_contents = get_file_contents(input_file_path)
+    if not file_contents or len(file_contents) == 0:
+        raise IOError("Empty file given at path:" + input_file_path)
+
+    tokenized_content = []
+    for line in file_contents:
+        tokenized_content.append(tokenize(line))
+    output_path = write_to_output_file(input_file_path, tokenized_content, add_timestamp)
+    print("Saved the tokenized tweets to:" + output_path)
+    return output_path
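
A quick usage sketch of the function this patch adds (the input path below is
hypothetical, and the $URL$/$MENTION$ placeholders assume the library's default
tokenize behaviour):

    import preprocessor as p

    # choose which entities should be replaced by tokens, then process the file;
    # the tokenized copy is written next to the input and its path is returned
    p.set_options(p.OPT.URL, p.OPT.MENTION)
    output_path = p.tokenize_file("tweets.json", True)  # True appends a timestamp
    print(output_path)
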
From ed9dc5c0df632c50040907ea6bce05f7edf849d1 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:48:28 +0200
Subject: [PATCH 2/4] tests: add unittest for file tokenization

involved splitting the _get_tokenized_test_data_for_option function
tested with python -m unittest discover tests/
---
 tests/artifacts/tokenize_file_sample.json |  6 +++
 tests/artifacts/tokenize_file_sample.txt  |  4 ++
 tests/test_api.py                         | 45 +++++++++++++++++++++--
 3 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 tests/artifacts/tokenize_file_sample.json
 create mode 100644 tests/artifacts/tokenize_file_sample.txt

diff --git a/tests/artifacts/tokenize_file_sample.json b/tests/artifacts/tokenize_file_sample.json
new file mode 100644
index 0000000..358330a
--- /dev/null
+++ b/tests/artifacts/tokenize_file_sample.json
@@ -0,0 +1,6 @@
+[
+  "Preprocessor now supports files. https://github.com/s/preprocessor",
+  "#preprocessing is a cruical part of @ML projects.",
+  "@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl",
+  "#emoji #smiley 😀😍 https://emojipedia.org"
+]
diff --git a/tests/artifacts/tokenize_file_sample.txt b/tests/artifacts/tokenize_file_sample.txt
new file mode 100644
index 0000000..fca71a8
--- /dev/null
+++ b/tests/artifacts/tokenize_file_sample.txt
@@ -0,0 +1,4 @@
+Preprocessor now supports files. https://github.com/s/preprocessor
+#preprocessing is a cruical part of @ML projects.
+@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl
+#emoji #smiley 😀😍 https://emojipedia.org
diff --git a/tests/test_api.py b/tests/test_api.py
index 43c8786..dd2d70a 100755
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -101,7 +101,7 @@ def test_clean_file(self):
         self.assertIsNotNone(raw_data)
 
         # Test all option
-        check_against = self._get_test_data_for_option(raw_data)
+        check_against = self._get_cleaned_test_data_for_option(raw_data)
         self._test_clean_file(full_input_path, check_against)
 
         # Test individual options
@@ -115,9 +115,36 @@ def test_clean_file(self):
             p.OPT.NUMBER
         ]
         for opt in options:
-            check_against = self._get_test_data_for_option(raw_data, opt)
+            check_against = self._get_cleaned_test_data_for_option(raw_data, opt)
             self._test_clean_file(full_input_path, check_against, opt)
 
+    def test_tokenize_file(self):
+        current_dir = os.path.dirname(__file__)
+        artifacts_dir = os.path.join(current_dir, self._artifacts_dir_name)
+        extensions = [p.InputFileType.json, p.InputFileType.text]
+        for ext in extensions:
+            full_input_path = os.path.join(artifacts_dir, "tokenize_file_sample" + ext)
+            raw_data = p.get_file_contents(full_input_path)
+            self.assertIsNotNone(raw_data)
+
+            # Test all option
+            check_against = self._get_tokenized_test_data_for_option(raw_data)
+            self._test_tokenize_file(full_input_path, check_against)
+
+            # Test individual options
+            options = [
+                p.OPT.URL,
+                p.OPT.MENTION,
+                p.OPT.HASHTAG,
+                p.OPT.RESERVED,
+                p.OPT.EMOJI,
+                p.OPT.SMILEY,
+                p.OPT.NUMBER
+            ]
+            for opt in options:
+                check_against = self._get_tokenized_test_data_for_option(raw_data, opt)
+                self._test_tokenize_file(full_input_path, check_against, opt)
+
+
     def test_escape_chars(self):
         p.set_options(p.OPT.ESCAPE_CHAR)
         input_str = u"\x01\x02\x03\x04I \x05\x06\x07\x10\x11have \x12\x13\x14" \
@@ -130,12 +157,24 @@ def _test_clean_file(self, full_input_path, check_against, *options):
         self.assertTrue(os.path.exists(output_path))
         clean_content = p.get_file_contents(output_path)
         p.are_lists_equal(clean_content, check_against)
+
+    def _test_tokenize_file(self, full_input_path, check_against, *options):
+        output_path = p.tokenize_file(full_input_path, True, options)
+        self.assertTrue(os.path.exists(output_path))
+        tokenized_content = p.get_file_contents(output_path)
+        p.are_lists_equal(tokenized_content, check_against)
 
-    def _get_test_data_for_option(self, raw_data, *options):
+    def _get_cleaned_test_data_for_option(self, raw_data, *options):
         clean_data = []
         for d in raw_data:
             clean_data.append(p.clean(d))
         return clean_data
+
+    def _get_tokenized_test_data_for_option(self, raw_data, *options):
+        tokenized_data = []
+        for d in raw_data:
+            tokenized_data.append(p.tokenize(d))
+        return tokenized_data
 
 
 if __name__ == '__main__':
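
For reference, the expected data built by the new test helpers is just p.tokenize
applied line by line. With, say, the hashtag and mention options set, the second
sample tweet would come back roughly as follows (the $HASHTAG$/$MENTION$ token
strings assume the library defaults):

    import preprocessor as p

    p.set_options(p.OPT.HASHTAG, p.OPT.MENTION)
    p.tokenize("#preprocessing is a cruical part of @ML projects.")
    # -> '$HASHTAG$ is a cruical part of $MENTION$ projects.'
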
From 7af30e072549a44448a946dd16381b85a2fe35f0 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:49:25 +0200
Subject: [PATCH 3/4] misc: update .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 93d0f0c..99a0d5f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
 tests/artifacts/*
 !tests/artifacts/clean_file_sample.json
 !tests/artifacts/clean_file_sample.txt
+!tests/artifacts/tokenize_file_sample.json
+!tests/artifacts/tokenize_file_sample.txt
 .DS_Store
 .python-version
 __pycache__/

From c1ff9608dbf2cc576a78da28b86f8b2c36106946 Mon Sep 17 00:00:00 2001
From: Ellington
Date: Mon, 17 Apr 2023 11:49:53 +0200
Subject: [PATCH 4/4] misc: add tokenize_file to __init__.py

---
 preprocessor/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/preprocessor/__init__.py b/preprocessor/__init__.py
index b8b146c..1f9b669 100755
--- a/preprocessor/__init__.py
+++ b/preprocessor/__init__.py
@@ -1,4 +1,4 @@
-from .api import clean, tokenize, parse, set_options, clean_file
+from .api import clean, tokenize, parse, set_options, clean_file, tokenize_file
 from .defines import Options as OPT
 from .defines import InputFileType, Defines
 from .utils import get_worker_methods,\
@@ -28,4 +28,4 @@
     write_to_text_file,
     generate_random_file_name,
     generate_random_alphanumeric_string,
-    are_lists_equal]
\ No newline at end of file
+    are_lists_equal]
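
Taken together, the series gives files the same clean/tokenize split that already
exists for single tweets: clean_file strips the matched entities, while the new
tokenize_file replaces them with placeholder tokens. A rough sketch of the
difference (paths are hypothetical; token strings assume the library defaults):

    import preprocessor as p

    p.set_options(p.OPT.URL)
    p.clean("Preprocessor now supports files. https://github.com/s/preprocessor")
    # -> 'Preprocessor now supports files.'
    p.tokenize("Preprocessor now supports files. https://github.com/s/preprocessor")
    # -> 'Preprocessor now supports files. $URL$'

    # the file helpers apply the same transforms line by line and, after
    # PATCH 4/4, both are importable straight from the package namespace
    p.clean_file("tweets.txt")
    p.tokenize_file("tweets.txt")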