From 51bab6a659623d04b4717ca400d95cc39bc2f843 Mon Sep 17 00:00:00 2001
From: bwmac <bwm03570@gmail.com>
Date: Fri, 20 Sep 2024 11:35:19 -0600
Subject: [PATCH] adds unit tests for _csv_to_pandas_df

---
 tests/unit/synapseclient/unit_test_tables.py | 182 +++++++++++++++++++
 1 file changed, 182 insertions(+)

diff --git a/tests/unit/synapseclient/unit_test_tables.py b/tests/unit/synapseclient/unit_test_tables.py
index 65334f366..2ffb00bdc 100644
--- a/tests/unit/synapseclient/unit_test_tables.py
+++ b/tests/unit/synapseclient/unit_test_tables.py
@@ -3,6 +3,7 @@
 import io
 import math
 import os
+import json
 import shutil
 import tempfile
 import time
@@ -42,6 +43,7 @@
     _convert_df_date_cols_to_datetime,
     _get_view_type_mask,
     _get_view_type_mask_for_deprecated_type,
+    _csv_to_pandas_df,
     as_table_columns,
     build_table,
     cast_values,
@@ -250,6 +252,186 @@ def test_convert_df_date_cols_to_datetime() -> None:
     test_df2 = _convert_df_date_cols_to_datetime(test_df, date_columns)
     assert_frame_equal(test_df2, expected_date_df)
 
+def test_csv_to_pandas_df_no_kwargs():
+    # GIVEN a pandas DataFrame (CSV file stand-in)
+    expected_df = pd.DataFrame({
+        'col1': [1, 2, 3],
+        'col2': ['a', 'b', 'c'],
+        'col3': [True, False, True]
+    })
+
+    with patch.object(pd, 'read_csv', return_value=expected_df) as mock_read_csv:        
+        # WHEN I call _csv_to_pandas_df with default parameters
+        df = _csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None
+        )
+        
+        # THEN I expect pandas.read_csv was called with default arguments
+        mock_read_csv.assert_called_once_with(
+            "dummy_path.csv",
+            dtype=None,
+            sep=synapseclient.table.DEFAULT_SEPARATOR,
+            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            header=0,
+            skiprows=0,
+            lineterminator='\n'
+        )
+        
+        # AND I expect the returned DataFrame to be the same as the original DataFrame (file)
+        pd.testing.assert_frame_equal(df, expected_df)
+
+
+def test_csv_to_pandas_df_with_kwargs() -> None:    
+    # GIVEN a pandas DataFrame (CSV file stand-in)
+    expected_df = pd.DataFrame({
+        'col1': [1, 2, 3],
+        'col2': ['a', 'b', 'c']
+    })
+
+    with patch.object(pd, 'read_csv', return_value=expected_df) as mock_read_csv:
+        # WHEN I call _csv_to_pandas_df with custom keyword arguments
+        kwargs = {
+            "escapechar": "\\",
+            "keep_default_na": False
+        }
+        df = _csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None,
+            **kwargs
+        )
+        
+        # THEN I expect pandas.read_csv was called with the keyword arguments
+        mock_read_csv.assert_called_once_with(
+            "dummy_path.csv",
+            dtype=None,
+            sep=synapseclient.table.DEFAULT_SEPARATOR,
+            quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escapechar='\\',
+            header=0,
+            skiprows=0,
+            keep_default_na=False,
+            lineterminator='\n'
+        )
+        
+        # AND I expect the returned DataFrame to match the expected DataFrame
+        pd.testing.assert_frame_equal(df, expected_df)
+
+def test_csv_to_pandas_df_calls_convert_date_cols():    
+    # GIVEN a pandas DataFrame (CSV file stand-in) with a date column
+    expected_df = pd.DataFrame({
+        'col1': [1, 2, 3],
+        'date_col': ['2021-01-01', '2021-01-02', '2021-01-03']
+    })
+    
+    with patch.object(pd, 'read_csv', return_value=expected_df), patch.object(synapseclient.table, '_convert_df_date_cols_to_datetime') as mock_convert_dates:
+        # WHEN I call _csv_to_pandas_df with date_columns specified
+        _csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=['date_col'],  # Specify date column
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None
+        )
+        
+        # THEN I expect _convert_df_date_cols_to_datetime to be called with the expected DataFrame and date columns
+        mock_convert_dates.assert_called_once_with(expected_df, ['date_col'])
+
+def test_csv_to_pandas_df_handles_list_columns():
+    # GIVEN a pandas DataFrame (CSV file stand-in) with a list column
+    initial_df = pd.DataFrame({
+        'col1': [1, 2, 3],
+        'list_col': ['[1, 2, 3]', '[4, 5, 6]', '[7, 8, 9]']
+    })
+
+    # AND a pandas DataFrame (expected result) with the list column converted to a list
+    expected_final_df = pd.DataFrame({
+        'col1': [1, 2, 3],
+        'list_col': [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+    })
+
+    with patch.object(pd, 'read_csv', return_value=initial_df), patch.object(synapseclient.table, '_convert_df_date_cols_to_datetime'), patch.object(pd.Series, 'apply', return_value=expected_final_df['list_col']) as mock_apply:
+        # WHEN I call _csv_to_pandas_df with list_columns specified
+        result_df = synapseclient.table._csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=['list_col'],  # Specify list column
+            rowIdAndVersionInIndex=True,
+            dtype=None
+        )
+
+        # THEN I expect json.loads to be applied to the list column
+        mock_apply.assert_called_once_with(json.loads)
+
+        # AND I expect the returned DataFrame to match the expected DataFrame
+        pd.testing.assert_frame_equal(result_df, expected_final_df)
+
+def test_csv_to_pandas_df_handles_row_id_and_version():
+    # GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
+    initial_df = pd.DataFrame({
+        'ROW_ID': [1, 2, 3],
+        'ROW_VERSION': [1, 1, 2],
+        'col1': ['a', 'b', 'c'],
+        'col2': [10, 20, 30]
+    })
+
+    # AND a pandas DataFrame (expected result) with the ROW_ID and ROW_VERSION columns removed
+    expected_final_df = pd.DataFrame({
+        'col1': ['a', 'b', 'c'],
+        'col2': [10, 20, 30]
+    }, index=['1_1', '2_1', '3_2'])  # Index format: ROW_ID_ROW_VERSION
+
+    with patch.object(pd, 'read_csv', return_value=initial_df), patch.object(synapseclient.table, 'row_labels_from_id_and_version', return_value=['1_1', '2_1', '3_2']) as mock_row_labels:
+        # WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True
+        result_df = synapseclient.table._csv_to_pandas_df(
+            filepath="dummy_path.csv",
+            separator=synapseclient.table.DEFAULT_SEPARATOR,
+            quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
+            escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
+            contain_headers=True,
+            lines_to_skip=0,
+            date_columns=None,
+            list_columns=None,
+            rowIdAndVersionInIndex=True,
+            dtype=None
+        )
+
+        # THEN I expect row_labels_from_id_and_version to be called once
+        mock_row_labels.assert_called_once()
+
+        # AND I expect the returned DataFrame to match the expected DataFrame with the ROW_ID and ROW_VERSION columns removed
+        pd.testing.assert_frame_equal(result_df, expected_final_df)
+
+        # AND I expect the index of the result_df to be as expected
+        assert list(result_df.index) == ['1_1', '2_1', '3_2']
+
 
 def test_schema() -> None:
     schema = Schema(name="My Table", parent="syn1000001")