feature: allow tokenization of files #57

Open · wants to merge 4 commits into master
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
tests/artifacts/*
!tests/artifacts/clean_file_sample.json
!tests/artifacts/clean_file_sample.txt
!tests/artifacts/tokenize_file_sample.json
!tests/artifacts/tokenize_file_sample.txt
.DS_Store
.python-version
__pycache__/
4 changes: 2 additions & 2 deletions preprocessor/__init__.py
@@ -1,4 +1,4 @@
from .api import clean, tokenize, parse, set_options, clean_file
from .api import clean, tokenize, parse, set_options, clean_file, tokenize_file
from .defines import Options as OPT
from .defines import InputFileType, Defines
from .utils import get_worker_methods,\
@@ -28,4 +28,4 @@
write_to_text_file,
generate_random_file_name,
generate_random_alphanumeric_string,
are_lists_equal]
are_lists_equal]
23 changes: 23 additions & 0 deletions preprocessor/api.py
@@ -94,3 +94,26 @@ def clean_file(input_file_path, add_timestamp=False, *options):
output_path = write_to_output_file(input_file_path, cleaned_content, add_timestamp)
print("Saved the cleaned tweets to:" + output_path)
return output_path

def tokenize_file(input_file_path, add_timestamp=False, *options):
"""Tokenize given input file in JSON and txt format if it can be found at the given path.
Returns a stdout for the output file path.
:param input_file_path: Absolute path for the tweets. Could be either in JSON or .txt format.
:param add_timestamp: If True, adds current timestamp to the filename
:return: output file path: str. Returns the file path of the cleaned file.
:rtype: str
:raises IOError if the input file empty
Usage::
>>> input_file_name = "sample.json"
>>> p.tokenize_file(file_name, p.OPT.URL, p.OPT.MENTION)
"""
file_contents = get_file_contents(input_file_path)
if not file_contents or len(file_contents) == 0:
raise IOError("Empty file given at path:" + input_file_path)

tokenized_content = []
for line in file_contents:
tokenized_content.append(tokenize(line))
output_path = write_to_output_file(input_file_path, tokenized_content, add_timestamp)
print("Saved the tokenized tweets to:" + output_path)
return output_path
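
A minimal usage sketch of the new function, for reviewers. It assumes the conventional import preprocessor as p alias used in the docstring and the tests, and a hypothetical tweets.json holding a JSON array of tweet strings like the sample artifact below; the options are applied via p.set_options, since tokenize_file delegates each line to tokenize.

# Minimal sketch, not part of the diff. "tweets.json" is a hypothetical input
# file containing a JSON array of tweet strings.
import preprocessor as p

# Limit tokenization to URLs and mentions for this run.
p.set_options(p.OPT.URL, p.OPT.MENTION)

# Writes the tokenized tweets next to the input file; add_timestamp=True
# appends the current timestamp to the output file name.
output_path = p.tokenize_file("tweets.json", True)
print(output_path)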
6 changes: 6 additions & 0 deletions tests/artifacts/tokenize_file_sample.json
@@ -0,0 +1,6 @@
[
"Preprocessor now supports files. https://github.com/s/preprocessor",
"#preprocessing is a cruical part of @ML projects.",
"@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl",
"#emoji #smiley 😀😍 https://emojipedia.org"
]
4 changes: 4 additions & 0 deletions tests/artifacts/tokenize_file_sample.txt
@@ -0,0 +1,4 @@
Preprocessor now supports files. https://github.com/s/preprocessor
#preprocessing is a crucial part of @ML projects.
@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl
#emoji #smiley 😀😍 https://emojipedia.org
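
These artifacts are tokenized line by line in the new test below. As a rough illustration (the $URL$ placeholder form is an assumption about what p.tokenize produces, not something shown in this diff), the first line of the .txt sample would come back with its URL replaced by a token:

# Rough illustration only; the exact placeholder format produced by
# p.tokenize is assumed here, not taken from this diff.
import preprocessor as p

line = "Preprocessor now supports files. https://github.com/s/preprocessor"
print(p.tokenize(line))  # expected to resemble: "Preprocessor now supports files. $URL$"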
45 changes: 42 additions & 3 deletions tests/test_api.py
@@ -101,7 +101,7 @@ def test_clean_file(self):
self.assertIsNotNone(raw_data)

# Test all option
check_against = self._get_test_data_for_option(raw_data)
check_against = self._get_cleaned_test_data_for_option(raw_data)
self._test_clean_file(full_input_path, check_against)

# Test individual options
@@ -115,9 +115,36 @@ def test_clean_file(self):
p.OPT.NUMBER
]
for opt in options:
check_against = self._get_test_data_for_option(raw_data, opt)
check_against = self._get_cleaned_test_data_for_option(raw_data, opt)
self._test_clean_file(full_input_path, check_against, opt)

def test_tokenize_file(self):
current_dir = os.path.dirname(__file__)
artifacts_dir = os.path.join(current_dir, self._artifacts_dir_name)
extensions = [p.InputFileType.json, p.InputFileType.text]
for ext in extensions:
full_input_path = os.path.join(artifacts_dir, "tokenize_file_sample" + ext)
raw_data = p.get_file_contents(full_input_path)
self.assertIsNotNone(raw_data)

# Test all option
check_against = self._get_tokenized_test_data_for_option(raw_data)
self._test_tokenize_file(full_input_path, check_against)

# Test individual options
options = [
p.OPT.URL,
p.OPT.MENTION,
p.OPT.HASHTAG,
p.OPT.RESERVED,
p.OPT.EMOJI,
p.OPT.SMILEY,
p.OPT.NUMBER
]
for opt in options:
check_against = self._get_tokenized_test_data_for_option(raw_data, opt)
self._test_tokenize_file(full_input_path, check_against, opt)

def test_escape_chars(self):
p.set_options(p.OPT.ESCAPE_CHAR)
input_str = u"\x01\x02\x03\x04I \x05\x06\x07\x10\x11have \x12\x13\x14" \
Expand All @@ -130,12 +157,24 @@ def _test_clean_file(self, full_input_path, check_against, *options):
self.assertTrue(os.path.exists(output_path))
clean_content = p.get_file_contents(output_path)
p.are_lists_equal(clean_content, check_against)

def _test_tokenize_file(self, full_input_path, check_against, *options):
output_path = p.tokenize_file(full_input_path, True, options)
self.assertTrue(os.path.exists(output_path))
tokenized_content = p.get_file_contents(output_path)
p.are_lists_equal(tokenized_content, check_against)

def _get_test_data_for_option(self, raw_data, *options):
def _get_cleaned_test_data_for_option(self, raw_data, *options):
clean_data = []
for d in raw_data:
clean_data.append(p.clean(d))
return clean_data

def _get_tokenized_test_data_for_option(self, raw_data, *options):
tokenized_data = []
for d in raw_data:
tokenized_data.append(p.tokenize(d))
return tokenized_data


if __name__ == '__main__':
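
To run the new test along with the rest of the suite, the standard unittest discovery entry point is enough; the sketch below assumes the tests/ directory sits at the repository root.

# Assumes the working directory is the repository root so that "tests/" is
# discoverable; runs test_tokenize_file together with the existing tests.
import unittest

suite = unittest.defaultTestLoader.discover("tests")
unittest.TextTestRunner(verbosity=2).run(suite)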