Skip to content

Commit

Permalink
complete data processing helpers for autofill and range
Browse files Browse the repository at this point in the history
  • Loading branch information
amva13 committed Mar 18, 2024
1 parent 2c92a9b commit 2bb5d4e
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 0 deletions.
86 changes: 86 additions & 0 deletions tdc/test/test_data_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-

from __future__ import division
from __future__ import print_function

import os
import sys

import unittest
import shutil

import pandas as pd

# temporary solution for relative imports in case TDC is not installed
# if TDC is installed, no need to use the following line
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))

from tdc.utils.data_processing_utils import DataParser


class TestDataParser(unittest.TestCase):
    """Unit tests for ``DataParser.autofill_identifier`` and ``DataParser.create_range``."""

    def setUp(self):
        # Log the working directory the test runs from.
        print(os.getcwd())

    def _assert_all_close(self, actual, expected):
        """Assert two equal-length numeric sequences match element-wise.

        Uses ``assertAlmostEqual`` so that bounds computed with floating-point
        arithmetic (e.g. 7.7 + 4.5) are not compared bit-for-bit.
        """
        self.assertEqual(len(actual), len(expected))
        for got, want in zip(actual, expected):
            self.assertAlmostEqual(got, want)

    def testAutofill(self):
        """Missing identifiers are filled from rows sharing the same key."""
        test_entries = [
            [0, "x", 8],
            [1, "y", 4],
            [None, "x", 9],
            [None, "y", 8],
            [2, "z", 12],
        ]
        col_names = ["autofill", "index", "value"]
        df = pd.DataFrame(test_entries, columns=col_names)
        df2 = DataParser.autofill_identifier(df, "autofill", "index")
        # The input frame is modified in place, so both views must agree.
        self.assertEqual(df["autofill"].tolist(), [0, 1, 0, 1, 2])
        self.assertEqual(df2["autofill"].tolist(), [0, 1, 0, 1, 2])
        self.assertEqual(df2["index"].tolist(), ["x", "y", "x", "y", "z"])
        self.assertEqual(df2["value"].tolist(), [8, 4, 9, 8, 12])
        self.assertEqual(df2.shape[0], 5)
        self.assertEqual(df2.shape[1], 3)

    def testCreateRange(self):
        """Plus-minus strings are expanded into lower/expected/upper columns."""
        test_entries = [
            ["7.7±4.5", 0],
            ["10±2.3", 1],
            ["Putative binder", 5],
        ]
        col_names = ["num", "some_value"]
        keys = ["Putative binder"]
        subs = [0]
        df = pd.DataFrame(test_entries, columns=col_names)
        df2 = DataParser.create_range(df, "num", keys, subs)
        # unittest assertions (unlike bare ``assert``) survive ``python -O``
        # and report the offending container on failure.
        self.assertIn("expected", df.columns)
        self.assertIn("expected", df2.columns)
        self.assertIn("lower", df2.columns)
        self.assertIn("upper", df2.columns)
        self._assert_all_close(df2["expected"].tolist(), [7.7, 10, 0])
        self._assert_all_close(df2["lower"].tolist(), [3.2, 7.7, 0])
        self._assert_all_close(df2["upper"].tolist(), [12.2, 12.3, 0])
        self.assertEqual(
            df2["num"].tolist(), ["7.7±4.5", "10±2.3", "Putative binder"])
        self.assertEqual(df2["some_value"].tolist(), [0, 1, 5])
        self.assertEqual(df2.shape[0], 3)
        self.assertEqual(df2.shape[1], 5)

    def tearDown(self):
        print(os.getcwd())

        # Remove the "data" and "oracle" directories from the working
        # directory if they exist (presumably download caches left behind by
        # TDC code -- confirm against the rest of the test suite).
        for folder in ("data", "oracle"):
            target = os.path.join(os.getcwd(), folder)
            if os.path.exists(target):
                shutil.rmtree(target)


# Run the unittest test runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
64 changes: 64 additions & 0 deletions tdc/utils/data_processing_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
Class encapsulating general data processing functions. Also supports running them in sequence.
Goal is to make it easier to integrate custom datasets not yet in TDC format.
"""

from pandas import Series, DataFrame

class DataParser(object):
    """Collection of general data processing helpers.

    The goal is to make it easier to integrate custom datasets that are not
    yet in TDC format. Every helper is a classmethod, modifies the DataFrame
    it receives in place, and returns that same DataFrame so calls can be
    run in sequence.
    """

    # Multi-character / non-ASCII spellings of the plus-minus separator that
    # are recognised between the expected value and its spread.
    _PLUS_MINUS_SEPARATORS = ("+/-", "+-", "±")

    def __init__(self):
        pass

    @classmethod
    def autofill_identifier(cls, dataset, autofill_column, key_column):
        """Fill missing values of one column using another column as key.

        Assumes a one-to-one mapping between both columns. For every key the
        first non-null value seen in ``autofill_column`` is used to fill rows
        with the same key whose ``autofill_column`` is missing. Values that
        are already present are left untouched. Rows whose key never occurs
        with a non-null value stay missing.

        Modifications are done in-place.

        Args:
            dataset (pandas.DataFrame): dataset to modify.
            autofill_column (str): name of the column to autofill.
            key_column (str): name of the column used for indexing.

        Returns:
            pandas.DataFrame: The modified dataset (same object as ``dataset``).
        """
        # key -> first known (non-null) autofill value for that key
        known = dataset.dropna(subset=[autofill_column]).drop_duplicates(
            subset=[key_column])
        mapping = known.set_index(key_column)[autofill_column].to_dict()

        # Only fill the gaps; never overwrite data that is already there.
        looked_up = dataset[key_column].map(mapping)
        dataset[autofill_column] = dataset[autofill_column].fillna(looked_up)

        return dataset

    @staticmethod
    def _split_plus_minus(entry):
        """Parse ``"<expected><sep><spread>"`` into ``(expected, spread)`` floats.

        ``<sep>`` is one of ``_PLUS_MINUS_SEPARATORS`` (optionally surrounded
        by whitespace) or any other single character. An entry that holds only
        a number is treated as having a spread of ``0.0``.

        Raises:
            TypeError: if ``entry`` is not a string.
            ValueError: if the entry does not start with an unsigned number or
                the spread is not a valid number.
        """
        if not isinstance(entry, str):
            raise TypeError(
                "expected a string entry, got {!r}".format(entry))
        text = entry.strip()

        # Length of the leading run of digits and dots (the expected value).
        head_len = 0
        for char in text:
            if not (char.isdigit() or char == "."):
                break
            head_len += 1
        head, remainder = text[:head_len], text[head_len:]
        if not head:
            raise ValueError(
                "entry {!r} does not start with a number".format(entry))
        expected = float(head)

        stripped = remainder.lstrip()
        if not stripped:
            # Bare number without an uncertainty part.
            return expected, 0.0
        for separator in DataParser._PLUS_MINUS_SEPARATORS:
            if stripped.startswith(separator):
                spread_text = stripped[len(separator):]
                break
        else:
            # Unknown separator: skip exactly one character.
            spread_text = remainder[1:]
        return expected, float(spread_text)

    @classmethod
    def create_range(cls, dataset, column, keys=None, subs=None):
        """Create ``lower``, ``expected`` and ``upper`` columns from a +/- column.

        Each entry of ``column`` is expected to look like ``"7.7±4.5"``
        (``"+/-"`` and ``"+-"`` are accepted as separators too), producing
        ``lower = 7.7 - 4.5``, ``expected = 7.7`` and ``upper = 7.7 + 4.5``.
        Entries equal to one of ``keys`` are not parsed; the matching value
        from ``subs`` is written to all three new columns instead.

        Modifies dataset in-place.

        Args:
            dataset (pandas.DataFrame): dataset to modify.
            column (str): name of the column holding the +/- strings.
            keys (list, optional): special entries to substitute.
            subs (list, optional): substitution values, parallel to ``keys``.

        Returns:
            pandas.DataFrame: The modified dataset (same object as ``dataset``).

        Raises:
            AssertionError: if ``keys``/``subs`` are not lists of equal length.
            TypeError: if a non-key entry is not a string.
            ValueError: if a non-key entry cannot be parsed.
        """
        keys = [] if keys is None else keys
        subs = [] if subs is None else subs
        assert isinstance(keys, list), "keys must be a list"
        assert isinstance(subs, list), "subs must be a list"
        assert len(keys) == len(subs), "keys and subs must have equal length"
        subs_dict = dict(zip(keys, subs))

        lower, expected, upper = [], [], []
        for entry in dataset[column]:
            if entry in subs_dict:
                # Special entry: the substitute is used verbatim for all bounds.
                low = mid = high = subs_dict[entry]
            else:
                mid, spread = cls._split_plus_minus(entry)
                low, high = mid - spread, mid + spread
            lower.append(low)
            expected.append(mid)
            upper.append(high)

        # Plain lists are assigned positionally, so the result is correct even
        # when ``dataset`` does not carry a default 0..n-1 index.
        dataset["lower"] = lower
        dataset["expected"] = expected
        dataset["upper"] = upper
        return dataset

0 comments on commit 2bb5d4e

Please sign in to comment.