feature: allow tokenization of files #57

Open · wants to merge 4 commits into master
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
tests/artifacts/*
!tests/artifacts/clean_file_sample.json
!tests/artifacts/clean_file_sample.txt
!tests/artifacts/tokenize_file_sample.json
!tests/artifacts/tokenize_file_sample.txt
.DS_Store
.python-version
__pycache__/
4 changes: 2 additions & 2 deletions preprocessor/__init__.py
@@ -1,4 +1,4 @@
from .api import clean, tokenize, parse, set_options, clean_file
from .api import clean, tokenize, parse, set_options, clean_file, tokenize_file
from .defines import Options as OPT
from .defines import InputFileType, Defines
from .utils import get_worker_methods,\
@@ -28,4 +28,4 @@
write_to_text_file,
generate_random_file_name,
generate_random_alphanumeric_string,
are_lists_equal]
are_lists_equal]
23 changes: 23 additions & 0 deletions preprocessor/api.py
@@ -94,3 +94,26 @@ def clean_file(input_file_path, add_timestamp=False, *options):
output_path = write_to_output_file(input_file_path, cleaned_content, add_timestamp)
print("Saved the cleaned tweets to:" + output_path)
return output_path

def tokenize_file(input_file_path, add_timestamp=False, *options):
"""Tokenize given input file in JSON and txt format if it can be found at the given path.
Returns a stdout for the output file path.
:param input_file_path: Absolute path for the tweets. Could be either in JSON or .txt format.
:param add_timestamp: If True, adds current timestamp to the filename
:return: output file path: str. Returns the file path of the cleaned file.
:rtype: str
:raises IOError if the input file empty
Usage::
>>> input_file_name = "sample.json"
>>> p.tokenize_file(file_name, p.OPT.URL, p.OPT.MENTION)
"""
file_contents = get_file_contents(input_file_path)
if not file_contents or len(file_contents) == 0:
raise IOError("Empty file given at path:" + input_file_path)

tokenized_content = []
for line in file_contents:
tokenized_content.append(tokenize(line))
output_path = write_to_output_file(input_file_path, tokenized_content, add_timestamp)
print("Saved the tokenized tweets to:" + output_path)
return output_path
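
A minimal usage sketch of the new function, for reviewers. It assumes the conventional import preprocessor as p alias used in the docstring and the tests, and a hypothetical tweets.json holding a JSON array of tweet strings like the sample artifact below; the options are applied via p.set_options, since tokenize_file delegates each line to tokenize.

# Minimal sketch, not part of the diff. "tweets.json" is a hypothetical input
# file containing a JSON array of tweet strings.
import preprocessor as p

# Limit tokenization to URLs and mentions for this run.
p.set_options(p.OPT.URL, p.OPT.MENTION)

# Writes the tokenized tweets next to the input file; add_timestamp=True
# appends the current timestamp to the output file name.
output_path = p.tokenize_file("tweets.json", True)
print(output_path)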
6 changes: 6 additions & 0 deletions tests/artifacts/tokenize_file_sample.json
@@ -0,0 +1,6 @@
[
"Preprocessor now supports files. https://github.com/s/preprocessor",
"#preprocessing is a cruical part of @ML projects.",
"@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl",
"#emoji #smiley 😀😍 https://emojipedia.org"
]
4 changes: 4 additions & 0 deletions tests/artifacts/tokenize_file_sample.txt
@@ -0,0 +1,4 @@
Preprocessor now supports files. https://github.com/s/preprocessor
#preprocessing is a crucial part of @ML projects.
@RT @Twitter raw text data usually has lots of #residue. http://t.co/g00gl
#emoji #smiley 😀😍 https://emojipedia.org
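
These artifacts are tokenized line by line in the new test below. As a rough illustration (the $URL$ placeholder form is an assumption about what p.tokenize produces, not something shown in this diff), the first line of the .txt sample would come back with its URL replaced by a token:

# Rough illustration only; the exact placeholder format produced by
# p.tokenize is assumed here, not taken from this diff.
import preprocessor as p

line = "Preprocessor now supports files. https://github.com/s/preprocessor"
print(p.tokenize(line))  # expected to resemble: "Preprocessor now supports files. $URL$"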
45 changes: 42 additions & 3 deletions tests/test_api.py
@@ -101,7 +101,7 @@ def test_clean_file(self):
self.assertIsNotNone(raw_data)

# Test all option
check_against = self._get_test_data_for_option(raw_data)
check_against = self._get_cleaned_test_data_for_option(raw_data)
self._test_clean_file(full_input_path, check_against)

# Test individual options
@@ -115,9 +115,36 @@ def test_clean_file(self):
p.OPT.NUMBER
]
for opt in options:
check_against = self._get_test_data_for_option(raw_data, opt)
check_against = self._get_cleaned_test_data_for_option(raw_data, opt)
self._test_clean_file(full_input_path, check_against, opt)

def test_tokenize_file(self):
current_dir = os.path.dirname(__file__)
artifacts_dir = os.path.join(current_dir, self._artifacts_dir_name)
extensions = [p.InputFileType.json, p.InputFileType.text]
for ext in extensions:
full_input_path = os.path.join(artifacts_dir, "tokenize_file_sample" + ext)
raw_data = p.get_file_contents(full_input_path)
self.assertIsNotNone(raw_data)

# Test all option
check_against = self._get_tokenized_test_data_for_option(raw_data)
self._test_tokenize_file(full_input_path, check_against)

# Test individual options
options = [
p.OPT.URL,
p.OPT.MENTION,
p.OPT.HASHTAG,
p.OPT.RESERVED,
p.OPT.EMOJI,
p.OPT.SMILEY,
p.OPT.NUMBER
]
for opt in options:
check_against = self._get_tokenized_test_data_for_option(raw_data, opt)
self._test_tokenize_file(full_input_path, check_against, opt)

def test_escape_chars(self):
p.set_options(p.OPT.ESCAPE_CHAR)
input_str = u"\x01\x02\x03\x04I \x05\x06\x07\x10\x11have \x12\x13\x14" \
Expand All @@ -130,12 +157,24 @@ def _test_clean_file(self, full_input_path, check_against, *options):
self.assertTrue(os.path.exists(output_path))
clean_content = p.get_file_contents(output_path)
p.are_lists_equal(clean_content, check_against)

def _test_tokenize_file(self, full_input_path, check_against, *options):
output_path = p.tokenize_file(full_input_path, True, options)
self.assertTrue(os.path.exists(output_path))
tokenized_content = p.get_file_contents(output_path)
p.are_lists_equal(tokenized_content, check_against)

def _get_test_data_for_option(self, raw_data, *options):
def _get_cleaned_test_data_for_option(self, raw_data, *options):
clean_data = []
for d in raw_data:
clean_data.append(p.clean(d))
return clean_data

def _get_tokenized_test_data_for_option(self, raw_data, *options):
tokenized_data = []
for d in raw_data:
tokenized_data.append(p.tokenize(d))
return tokenized_data


if __name__ == '__main__':
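
To run the new test along with the rest of the suite, the standard unittest discovery entry point is enough; the sketch below assumes the tests/ directory sits at the repository root.

# Assumes the working directory is the repository root so that "tests/" is
# discoverable; runs test_tokenize_file together with the existing tests.
import unittest

suite = unittest.defaultTestLoader.discover("tests")
unittest.TextTestRunner(verbosity=2).run(suite)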