Sage-Bionetworks · BWMac · Sep 24, 2024 · Sep 19, 2024 · Sep 20, 2024 · Sep 20, 2024
@@ -34,7 +34,7 @@
 import re
 import tempfile
 from builtins import zip
-from typing import Dict, List, Tuple, TypeVar, Union
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
 
 from synapseclient.core.constants import concrete_types
 from synapseclient.core.exceptions import SynapseError
@@ -397,16 +397,16 @@ def _convert_df_date_cols_to_datetime(
 
 
 def _csv_to_pandas_df(
-    filepath,
-    separator=DEFAULT_SEPARATOR,
-    quote_char=DEFAULT_QUOTE_CHARACTER,
-    escape_char=DEFAULT_ESCAPSE_CHAR,
-    contain_headers=True,
-    lines_to_skip=0,
-    date_columns=None,
-    list_columns=None,
-    rowIdAndVersionInIndex=True,
-    dtype=None,
+    filepath: str,
+    separator: str = DEFAULT_SEPARATOR,
+    quote_char: str = DEFAULT_QUOTE_CHARACTER,
+    escape_char: str = DEFAULT_ESCAPSE_CHAR,
+    contain_headers: bool = True,
+    lines_to_skip: int = 0,
+    date_columns: Optional[List[str]] = None,
+    list_columns: Optional[List[str]] = None,
+    rowIdAndVersionInIndex: bool = True,
+    dtype: Optional[Dict[str, Any]] = None,
     **kwargs,
 ):
     """
@@ -440,21 +440,26 @@ def _csv_to_pandas_df(
 
     line_terminator = str(os.linesep)
 
+    pandas_args = {
+        "dtype": dtype,
+        "sep": separator,
+        "quotechar": quote_char,
+        "escapechar": escape_char,
+        "header": 0 if contain_headers else None,
+        "skiprows": lines_to_skip,
+    }
+    pandas_args.update(kwargs)
+
     # assign line terminator only if for single character
     # line terminators (e.g. not '\r\n') 'cause pandas doesn't
     # longer line terminators. See: <https://github.com/pydata/pandas/issues/3501>
     # "ValueError: Only length-1 line terminators supported"
     df = pd.read_csv(
         filepath,
-        dtype=dtype,
-        sep=separator,
         lineterminator=line_terminator if len(line_terminator) == 1 else None,
-        quotechar=quote_char,
-        escapechar=escape_char,
-        header=0 if contain_headers else None,
-        skiprows=lines_to_skip,
-        **kwargs,
+        **pandas_args,
     )
+
     # parse date columns if exists
     if date_columns:
         df = _convert_df_date_cols_to_datetime(df, date_columns)

@@ -1,6 +1,7 @@
 """Unit test for synapseclient.table"""
 import csv
 import io
+import json
 import math
 import os
 import shutil
@@ -40,6 +41,7 @@
     Table,
     TableQueryResult,
     _convert_df_date_cols_to_datetime,
+    _csv_to_pandas_df,
     _get_view_type_mask,
     _get_view_type_mask_for_deprecated_type,
     as_table_columns,
@@ -251,6 +253,198 @@ def test_convert_df_date_cols_to_datetime() -> None:
     assert_frame_equal(test_df2, expected_date_df)
 
 
+def test_csv_to_pandas_df_no_kwargs():
+    # GIVEN a pandas DataFrame (CSV file stand-in)
+    expected_df = pd.DataFrame(
+        {"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [True, False, True]}
+    )
+
+    with patch.object(
+        pd, "read_csv", return_value=expected_df
+    ) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
+        # WHEN I call _csv_to_pandas_df with default parameters
+        df = _csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None,
+        )
+
+        # THEN I expect pandas.read_csv was called with default arguments
+        mock_read_csv.assert_called_once_with(
+            "dummy_path.csv",
+            dtype=None,
+            sep=synapseclient.table.DEFAULT_SEPARATOR,
+            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            header=0,
+            skiprows=0,
+            lineterminator=None,
+        )
+
+        # AND I expect the returned DataFrame to be
+        # the same as the original DataFrame (file)
+        pd.testing.assert_frame_equal(df, expected_df)
+
+
+def test_csv_to_pandas_df_with_kwargs() -> None:
+    # GIVEN a pandas DataFrame (CSV file stand-in)
+    expected_df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
+
+    with patch.object(
+        pd, "read_csv", return_value=expected_df
+    ) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
+        # WHEN I call _csv_to_pandas_df with custom keyword arguments
+        kwargs = {"escapechar": "\\", "keep_default_na": False}
+        df = _csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None,
+            **kwargs,
+        )
+
+        # THEN I expect pandas.read_csv was called with the keyword arguments
+        mock_read_csv.assert_called_once_with(
+            "dummy_path.csv",
+            dtype=None,
+            sep=synapseclient.table.DEFAULT_SEPARATOR,
+            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escapechar="\\",
+            header=0,
+            skiprows=0,
+            keep_default_na=False,
+            lineterminator=None,
+        )
+
+        # AND I expect the returned DataFrame to match the expected DataFrame
+        pd.testing.assert_frame_equal(df, expected_df)
+
+
+def test_csv_to_pandas_df_calls_convert_date_cols():
+    # GIVEN a pandas DataFrame (CSV file stand-in) with a date column
+    expected_df = pd.DataFrame(
+        {"col1": [1, 2, 3], "date_col": ["2021-01-01", "2021-01-02", "2021-01-03"]}
+    )
+
+    with patch.object(pd, "read_csv", return_value=expected_df), patch.object(
+        synapseclient.table, "_convert_df_date_cols_to_datetime"
+    ) as mock_convert_dates:
+        # WHEN I call _csv_to_pandas_df with date_columns specified
+        _csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=["date_col"],  # Specify date column
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None,
+        )
+
+        # THEN I expect _convert_df_date_cols_to_datetime to be
+        # called with the expected DataFrame and date columns
+        mock_convert_dates.assert_called_once_with(expected_df, ["date_col"])
+
+
+def test_csv_to_pandas_df_handles_list_columns():
+    # GIVEN a pandas DataFrame (CSV file stand-in) with a list column
+    initial_df = pd.DataFrame(
+        {"col1": [1, 2, 3], "list_col": ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"]}
+    )
+
+    # AND a pandas DataFrame (expected result) with the list column converted to a list
+    expected_final_df = pd.DataFrame(
+        {"col1": [1, 2, 3], "list_col": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}
+    )
+
+    with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
+        synapseclient.table, "_convert_df_date_cols_to_datetime"
+    ), patch.object(
+        pd.Series, "apply", return_value=expected_final_df["list_col"]
+    ) as mock_apply:
+        # WHEN I call _csv_to_pandas_df with list_columns specified
+        result_df = synapseclient.table._csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=["list_col"],  # Specify list column
+            rowIdAndVersionInIndex=True,
+            dtype=None,
+        )
+
+        # THEN I expect json.loads to be applied to the list column
+        mock_apply.assert_called_once_with(json.loads)
+
+        # AND I expect the returned DataFrame to match the expected DataFrame
+        pd.testing.assert_frame_equal(result_df, expected_final_df)
+
+
+def test_csv_to_pandas_df_handles_row_id_and_version():
+    # GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
+    initial_df = pd.DataFrame(
+        {
+            "ROW_ID": [1, 2, 3],
+            "ROW_VERSION": [1, 1, 2],
+            "col1": ["a", "b", "c"],
+            "col2": [10, 20, 30],
+        }
+    )
+
+    # AND a pandas DataFrame (expected result)
+    # with the ROW_ID and ROW_VERSION columns removed
+    expected_final_df = pd.DataFrame(
+        {"col1": ["a", "b", "c"], "col2": [10, 20, 30]}, index=["1_1", "2_1", "3_2"]
+    )  # Index format: ROW_ID_ROW_VERSION
+
+    with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
+        synapseclient.table,
+        "row_labels_from_id_and_version",
+        return_value=["1_1", "2_1", "3_2"],
+    ) as mock_row_labels:
+        # WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True
+        result_df = synapseclient.table._csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None,
+        )
+
+        # THEN I expect row_labels_from_id_and_version to be called once
+        mock_row_labels.assert_called_once()
+
+        # AND I expect the returned DataFrame to match the expected
+        # DataFrame with the ROW_ID and ROW_VERSION columns removed
+        pd.testing.assert_frame_equal(result_df, expected_final_df)
+
+        # AND I expect the index of the result_df to be as expected
+        assert list(result_df.index) == ["1_1", "2_1", "3_2"]
+
+
 def test_schema() -> None:
     schema = Schema(name="My Table", parent="syn1000001")