# -*- coding: utf-8 -*-
"""General data-processing helpers (``DataParser``) together with their unit tests.

The patch this text was recovered from adds two new files. Their contents are
kept here as two clearly marked sections so they can be split apart again:

* ``tdc/utils/data_processing_utils.py`` -- the ``DataParser`` class.
* ``tdc/test/test_data_process.py`` -- the ``TestDataParser`` test case.

The goal of ``DataParser`` is to make it easier to integrate custom datasets
that are not yet in TDC format.
"""

import numbers
import os
import shutil
import unittest

import pandas as pd
from pandas import DataFrame, Series  # noqa: F401  (Series kept from the original import line)


# ---------------------------------------------------------------------------
# Section 1 -- tdc/utils/data_processing_utils.py
# ---------------------------------------------------------------------------

# Multi-character spellings of "plus or minus" that may follow the value.
# Any other single character is still accepted as a separator (see
# ``_split_value_and_spread``) so previously valid inputs keep working.
_SPREAD_SEPARATORS = ("±", "+/-", "+-")


def _split_value_and_spread(entry):
    """Split one ``<value><separator><spread>`` entry into two floats.

    Args:
        entry: A string such as ``"7.7±4.5"``, ``"7.7 +/- 4.5"`` or ``"10"``,
            or an already numeric value.

    Returns:
        tuple: ``(value, spread)`` as floats. ``spread`` is ``0.0`` when the
        entry carries no spread.

    Raises:
        ValueError: If the entry cannot be parsed.
    """
    if isinstance(entry, numbers.Real):
        # Already numeric: there is no spread to extract.
        return float(entry), 0.0

    text = str(entry).strip()

    # The value is the longest prefix made of an optional sign, digits and dots.
    end = 1 if text[:1] in ("+", "-") else 0
    while end < len(text) and (text[end].isdigit() or text[end] == "."):
        end += 1

    remainder = text[end:]
    tail = remainder.lstrip()
    if not tail:
        # A plain number such as "10": no uncertainty was given.
        spread_text = "0"
    else:
        for separator in _SPREAD_SEPARATORS:
            if tail.startswith(separator):
                spread_text = tail[len(separator):]
                break
        else:
            # Unknown separator: skip exactly one character, as the original
            # parser did, so inputs like "7.7x4.5" are still understood.
            spread_text = remainder[1:]

    try:
        return float(text[:end]), float(spread_text)
    except ValueError:
        raise ValueError(
            "cannot parse {!r} as '<value>' or '<value>±<spread>'".format(entry)
        )


class DataParser(object):
    """Collection of general data-processing functions.

    Every operation is a classmethod that takes a ``pandas.DataFrame``,
    modifies it in place and returns the same object.
    """

    def __init__(self):
        """No instance state is needed; all functionality is on classmethods."""
        pass

    @classmethod
    def autofill_identifier(cls, dataset, autofill_column, key_column):
        """Fill missing values of one column from rows sharing the same key.

        For every value of ``key_column`` the first non-missing value of
        ``autofill_column`` is looked up and written into the rows where
        ``autofill_column`` is missing. Values that are already present are
        left untouched, so a violated one-to-one assumption never overwrites
        existing data. Rows whose key has no known value stay missing.
        Modifications are done in place.

        Args:
            dataset (pandas.DataFrame): dataset to modify.
            autofill_column (str): name of the column to autofill.
            key_column (str): name of the column used for indexing.

        Returns:
            pandas.DataFrame: The modified dataset (the same object).
        """
        # Rows that can teach us a key -> value association; keep the first
        # association seen for each key.
        known = dataset.dropna(subset=[autofill_column, key_column])
        known = known.drop_duplicates(subset=[key_column])
        mapping = known.set_index(key_column)[autofill_column].to_dict()

        # Only fill the holes instead of replacing the whole column.
        inferred = dataset[key_column].map(mapping)
        dataset[autofill_column] = dataset[autofill_column].fillna(inferred)

        return dataset

    @classmethod
    def create_range(cls, dataset, column, keys=None, subs=None):
        """Create ``lower``, ``expected`` and ``upper`` columns from ``value±spread`` text.

        Each entry of ``column`` is parsed into a value and a spread, giving
        ``lower = value - spread``, ``expected = value`` and
        ``upper = value + spread``. An entry without a spread yields three
        equal bounds. Entries equal to one of ``keys`` are not parsed; all
        three bounds are set to the corresponding element of ``subs``.
        Modifications are done in place and ``column`` itself is left as is.

        Args:
            dataset (pandas.DataFrame): dataset to modify.
            column (str): name of the column holding the ``value±spread`` entries.
            keys (list, optional): special entries that must not be parsed.
            subs (list, optional): numeric substitutes, one per element of ``keys``.

        Returns:
            pandas.DataFrame: The modified dataset (the same object).

        Raises:
            TypeError: If ``keys`` or ``subs`` is not a list.
            ValueError: If ``keys`` and ``subs`` differ in length, or if an
                entry that is not a key cannot be parsed.
        """
        keys = [] if keys is None else keys
        subs = [] if subs is None else subs
        if not isinstance(keys, list) or not isinstance(subs, list):
            raise TypeError("keys and subs must be lists")
        if len(keys) != len(subs):
            raise ValueError(
                "keys and subs must have the same length "
                "(got {} and {})".format(len(keys), len(subs))
            )
        substitutions = dict(zip(keys, subs))

        bounds = []
        for entry in dataset[column]:
            if entry in substitutions:
                # A substitute is an exact value: no spread around it.
                value = substitutions[entry]
                bounds.append([value, value, value])
            else:
                value, spread = _split_value_and_spread(entry)
                bounds.append([value - spread, value, value + spread])

        # Build on the dataset's own index so the assignments below line up
        # row by row even when the index is not 0..n-1.
        df_bounds = DataFrame(
            bounds, columns=["lower", "expected", "upper"], index=dataset.index
        )
        dataset["lower"] = df_bounds["lower"]
        dataset["expected"] = df_bounds["expected"]
        dataset["upper"] = df_bounds["upper"]
        return dataset


# ---------------------------------------------------------------------------
# Section 2 -- tdc/test/test_data_process.py
#
# NOTE: as a separate file this section needs
# ``from tdc.utils.data_processing_utils import DataParser`` (plus the
# ``sys.path`` shim used when TDC is not installed). Here ``DataParser`` is
# defined above, so no import is required.
# ---------------------------------------------------------------------------


class TestDataParser(unittest.TestCase):
    """Unit tests for ``DataParser.autofill_identifier`` and ``create_range``."""

    def testAutofill(self):
        test_entries = [
            [0, "x", 8],
            [1, "y", 4],
            [None, "x", 9],
            [None, "y", 8],
            [2, "z", 12],
        ]
        col_names = ["autofill", "index", "value"]
        df = pd.DataFrame(test_entries, columns=col_names)
        df2 = DataParser.autofill_identifier(df, "autofill", "index")
        # The input frame is modified in place and also returned.
        self.assertEqual(df["autofill"].tolist(), [0, 1, 0, 1, 2])
        self.assertEqual(df2["autofill"].tolist(), [0, 1, 0, 1, 2])
        self.assertEqual(df2["index"].tolist(), ["x", "y", "x", "y", "z"])
        self.assertEqual(df2["value"].tolist(), [8, 4, 9, 8, 12])
        self.assertEqual(df2.shape[0], 5)
        self.assertEqual(df2.shape[1], 3)

    def testAutofillKeepsExistingValues(self):
        # Key "x" maps to both 0 and 7: only the missing cell may be filled.
        df = pd.DataFrame({"id": [0, 1, None, 7], "key": ["x", "y", "x", "x"]})
        df2 = DataParser.autofill_identifier(df, "id", "key")
        self.assertEqual(df2["id"].tolist(), [0, 1, 0, 7])

    def testAutofillUnknownKeyStaysMissing(self):
        df = pd.DataFrame({"id": [None, 3], "key": ["q", "r"]})
        df2 = DataParser.autofill_identifier(df, "id", "key")
        self.assertTrue(pd.isna(df2["id"].iloc[0]))
        self.assertEqual(df2["id"].iloc[1], 3)

    def testCreateRange(self):
        test_entries = [
            ["7.7±4.5", 0],
            ["10±2.3", 1],
            ["Putative binder", 5],
        ]
        col_names = ["num", "some_value"]
        keys = ["Putative binder"]
        subs = [0]
        df = pd.DataFrame(test_entries, columns=col_names)
        df2 = DataParser.create_range(df, "num", keys, subs)
        self.assertIn("expected", df.columns)
        self.assertIn("expected", df2.columns)
        self.assertIn("lower", df2.columns)
        self.assertIn("upper", df2.columns)
        self.assertEqual(df2["expected"].tolist(), [7.7, 10, 0])
        self.assertEqual(df2["lower"].tolist(), [3.2, 7.7, 0])
        self.assertEqual(df2["upper"].tolist(), [12.2, 12.3, 0])
        self.assertEqual(
            df2["num"].tolist(), ["7.7±4.5", "10±2.3", "Putative binder"]
        )
        self.assertEqual(df2["some_value"].tolist(), [0, 1, 5])
        self.assertEqual(df2.shape[0], 3)
        self.assertEqual(df2.shape[1], 5)

    def testCreateRangeNonZeroSubstitution(self):
        df = pd.DataFrame({"num": ["1±0.5", "Putative binder"]})
        df2 = DataParser.create_range(df, "num", ["Putative binder"], [5])
        self.assertEqual(df2["lower"].tolist(), [0.5, 5])
        self.assertEqual(df2["expected"].tolist(), [1, 5])
        self.assertEqual(df2["upper"].tolist(), [1.5, 5])

    def testCreateRangeWithoutSpread(self):
        df = pd.DataFrame({"num": ["10", "1 +/- 0.5"]})
        df2 = DataParser.create_range(df, "num")
        self.assertEqual(df2["lower"].tolist(), [10, 0.5])
        self.assertEqual(df2["expected"].tolist(), [10, 1])
        self.assertEqual(df2["upper"].tolist(), [10, 1.5])

    def testCreateRangeCustomIndex(self):
        df = pd.DataFrame({"num": ["1±0.5", "2±0.5"]}, index=[10, 20])
        df2 = DataParser.create_range(df, "num")
        self.assertEqual(df2["lower"].tolist(), [0.5, 1.5])
        self.assertEqual(df2["upper"].tolist(), [1.5, 2.5])

    def testCreateRangeRejectsBadArguments(self):
        df = pd.DataFrame({"num": ["1±0.5"]})
        with self.assertRaises(ValueError):
            DataParser.create_range(df, "num", ["a"], [])
        with self.assertRaises(TypeError):
            DataParser.create_range(df, "num", "a", [0])
        with self.assertRaises(ValueError):
            DataParser.create_range(pd.DataFrame({"num": ["n/a"]}), "num")

    def tearDown(self):
        # Remove the "data" and "oracle" directories from the working
        # directory if they exist after a test.
        for leftover in ("data", "oracle"):
            path = os.path.join(os.getcwd(), leftover)
            if os.path.exists(path):
                shutil.rmtree(path)


if __name__ == "__main__":
    unittest.main()