complete data processing helpers for autofill and range

mims-harvard · Mar 18, 2024 · 2bb5d4e · 2bb5d4e
1 parent 2c92a9b
commit 2bb5d4e
Show file tree

Hide file tree

Showing 2 changed files with 150 additions and 0 deletions.
diff --git a/tdc/test/test_data_process.py b/tdc/test/test_data_process.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+import unittest
+import shutil
+
+import pandas as pd
+
+# temporary solution for relative imports in case TDC is not installed
+# if TDC is installed, no need to use the following line
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from tdc.utils.data_processing_utils import DataParser
+
+
+class TestDataParser(unittest.TestCase):
+    """Unit tests for the ``DataParser`` helpers."""
+
+    def setUp(self):
+        print(os.getcwd())
+
+    def _assert_floats_almost_equal(self, actual, expected):
+        """Compare two sequences of numbers element-wise with a float tolerance."""
+        self.assertEqual(len(actual), len(expected))
+        for got, want in zip(actual, expected):
+            self.assertAlmostEqual(got, want)
+
+    def testAutofill(self):
+        """Missing identifiers are filled from rows sharing the same key."""
+        test_entries = [
+            [0, "x", 8],
+            [1, "y", 4],
+            [None, "x", 9],
+            [None, "y", 8],
+            [2, "z", 12],
+        ]
+        col_names = ["autofill", "index", "value"]
+        df = pd.DataFrame(test_entries, columns=col_names)
+        df2 = DataParser.autofill_identifier(df, "autofill", "index")
+        # The helper works in place, so the input frame is filled as well.
+        self.assertEqual(df["autofill"].tolist(), [0, 1, 0, 1, 2])
+        self.assertEqual(df2["autofill"].tolist(), [0, 1, 0, 1, 2])
+        self.assertEqual(df2["index"].tolist(), ["x", "y", "x", "y", "z"])
+        self.assertEqual(df2["value"].tolist(), [8, 4, 9, 8, 12])
+        self.assertEqual(df2.shape[0], 5)
+        self.assertEqual(df2.shape[1], 3)
+
+    def testCreateRange(self):
+        """"value±spread" strings are expanded into lower/expected/upper."""
+        test_entries = [
+            ["7.7±4.5", 0],
+            ["10±2.3", 1],
+            ["Putative binder", 5],
+        ]
+        col_names = ["num", "some_value"]
+        keys = ["Putative binder"]
+        subs = [0]
+        df = pd.DataFrame(test_entries, columns=col_names)
+        df2 = DataParser.create_range(df, "num", keys, subs)
+        # unittest assertions (unlike bare ``assert``) still run under ``-O``.
+        self.assertIn("expected", df.columns)
+        self.assertIn("expected", df2.columns)
+        self.assertIn("lower", df2.columns)
+        self.assertIn("upper", df2.columns)
+        # The bounds come from float arithmetic, so compare with a tolerance.
+        self._assert_floats_almost_equal(df2["expected"].tolist(), [7.7, 10, 0])
+        self._assert_floats_almost_equal(df2["lower"].tolist(), [3.2, 7.7, 0])
+        self._assert_floats_almost_equal(df2["upper"].tolist(), [12.2, 12.3, 0])
+        self.assertEqual(
+            df2["num"].tolist(), ["7.7±4.5", "10±2.3", "Putative binder"])
+        self.assertEqual(df2["some_value"].tolist(), [0, 1, 5])
+        self.assertEqual(df2.shape[0], 3)
+        self.assertEqual(df2.shape[1], 5)
+
+    def tearDown(self):
+        print(os.getcwd())
+
+        # Remove the "data" and "oracle" directories from the working
+        # directory if they exist.
+        for leftover in ("data", "oracle"):
+            path = os.path.join(os.getcwd(), leftover)
+            if os.path.exists(path):
+                shutil.rmtree(path)
+
+
+# Run the tests in this file when it is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tdc/utils/data_processing_utils.py b/tdc/utils/data_processing_utils.py
@@ -0,0 +1,64 @@
+"""
+General data processing helpers, encapsulated in the DataParser class. Also supports running them in sequence.
+The goal is to make it easier to integrate custom datasets that are not yet in TDC format.
+"""
+
+from pandas import Series, DataFrame
+
+class DataParser(object):
+    """General-purpose helpers for cleaning up tabular datasets.
+
+    Each public helper is a classmethod that takes a ``pandas.DataFrame``,
+    modifies it in place and returns that same frame. The goal is to make it
+    easier to integrate custom datasets that are not yet in TDC format.
+    """
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def autofill_identifier(cls, dataset, autofill_column, key_column):
+        """Fill missing values of a column using the values of a key column.
+
+        Assumes a one-to-one mapping between both columns: every row with the
+        same ``key_column`` value is expected to carry the same
+        ``autofill_column`` value. Modifications are done in place.
+
+        Args:
+            dataset (pandas.DataFrame): dataset to modify.
+            autofill_column (str): name of the column to autofill.
+            key_column (str): name of the column used for indexing.
+
+        Returns:
+            pandas.DataFrame: The modified dataset (same object as ``dataset``).
+        """
+        # Build key -> value from rows where both cells are present; the first
+        # occurrence of each key wins.
+        known = dataset.dropna(subset=[autofill_column, key_column])
+        mapping = (known.drop_duplicates(subset=[key_column])
+                   .set_index(key_column)[autofill_column]
+                   .to_dict())
+
+        # Only fill the gaps: values that are already present are kept, even
+        # for rows whose key is missing or unknown.
+        filler = dataset[key_column].map(mapping)
+        dataset[autofill_column] = dataset[autofill_column].fillna(filler)
+
+        return dataset
+
+    @staticmethod
+    def _parse_plus_minus(entry):
+        """Split a "center±spread" entry into ``(center, spread)`` floats.
+
+        Accepts "±", "+/-" and "+-" as separators (with optional surrounding
+        whitespace). Otherwise falls back to the legacy format: leading
+        digits/dots, one arbitrary separator character, then the spread. An
+        entry without a spread, e.g. "10", yields a spread of 0.
+
+        Raises:
+            ValueError: if either part cannot be converted to ``float``.
+        """
+        text = str(entry).strip()
+        for separator in ("±", "+/-", "+-"):
+            center, found, spread = text.partition(separator)
+            if found:
+                return float(center), float(spread)
+
+        # Legacy format: consume the leading run of digits and dots, skip the
+        # single separator character that follows, and parse the remainder.
+        end = 0
+        while end < len(text) and (text[end].isdigit() or text[end] == "."):
+            end += 1
+        spread = text[end + 1:].strip()
+        return float(text[:end]), (float(spread) if spread else 0.0)
+
+    @classmethod
+    def create_range(cls, dataset, column, keys=None, subs=None):
+        """Create lower, expected and upper columns from a "value±spread" column.
+
+        Every entry of ``column`` is parsed as ``center±spread`` and expanded
+        to ``lower = center - spread``, ``expected = center`` and
+        ``upper = center + spread``. Entries equal to one of ``keys`` are not
+        parsed; the matching element of ``subs`` is used for all three
+        columns instead. Modifies ``dataset`` in place.
+
+        Args:
+            dataset (pandas.DataFrame): dataset to modify.
+            column (str): name of the column holding the "value±spread" entries.
+            keys (list, optional): special entries to replace.
+            subs (list, optional): numeric substitutes, parallel to ``keys``.
+
+        Returns:
+            pandas.DataFrame: The modified dataset (same object as ``dataset``).
+
+        Raises:
+            AssertionError: if ``keys``/``subs`` are not lists of equal length.
+            ValueError: if an entry cannot be parsed as numbers.
+        """
+        keys = [] if keys is None else keys
+        subs = [] if subs is None else subs
+        # Raised explicitly so the validation is not stripped under ``-O``;
+        # AssertionError is kept so existing callers see the same exception.
+        if not isinstance(keys, list) or not isinstance(subs, list):
+            raise AssertionError("keys and subs must be lists")
+        if len(keys) != len(subs):
+            raise AssertionError("keys and subs must have the same length")
+        subs_dict = dict(zip(keys, subs))
+
+        bounds = []
+        for raw in dataset[column]:
+            if raw in subs_dict:
+                # Substituted entries have no spread: all bounds are the same.
+                value = subs_dict[raw]
+                bounds.append([value, value, value])
+            else:
+                center, spread = cls._parse_plus_minus(raw)
+                bounds.append([center - spread, center, center + spread])
+
+        # Reuse the dataset's index so the assignment lines up row by row even
+        # when the dataset does not have a default 0..n-1 index.
+        df_bounds = DataFrame(bounds,
+                              columns=["lower", "expected", "upper"],
+                              index=dataset.index)
+        for name in ("lower", "expected", "upper"):
+            dataset[name] = df_bounds[name]
+        return dataset