From 51bab6a659623d04b4717ca400d95cc39bc2f843 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 20 Sep 2024 11:35:19 -0600 Subject: [PATCH] adds unit tests for _csv_to_pandas_df --- tests/unit/synapseclient/unit_test_tables.py | 182 +++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/tests/unit/synapseclient/unit_test_tables.py b/tests/unit/synapseclient/unit_test_tables.py index 65334f366..2ffb00bdc 100644 --- a/tests/unit/synapseclient/unit_test_tables.py +++ b/tests/unit/synapseclient/unit_test_tables.py @@ -3,6 +3,7 @@ import io import math import os +import json import shutil import tempfile import time @@ -42,6 +43,7 @@ _convert_df_date_cols_to_datetime, _get_view_type_mask, _get_view_type_mask_for_deprecated_type, + _csv_to_pandas_df, as_table_columns, build_table, cast_values, @@ -250,6 +252,186 @@ def test_convert_df_date_cols_to_datetime() -> None: test_df2 = _convert_df_date_cols_to_datetime(test_df, date_columns) assert_frame_equal(test_df2, expected_date_df) +def test_csv_to_pandas_df_no_kwargs(): + # GIVEN a pandas DataFrame (CSV file stand-in) + expected_df = pd.DataFrame({ + 'col1': [1, 2, 3], + 'col2': ['a', 'b', 'c'], + 'col3': [True, False, True] + }) + + with patch.object(pd, 'read_csv', return_value=expected_df) as mock_read_csv: + # WHEN I call _csv_to_pandas_df with default parameters + df = _csv_to_pandas_df( + filepath="dummy_path.csv", + separator=synapseclient.table.DEFAULT_SEPARATOR, + quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER, + escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR, + contain_headers=True, + lines_to_skip=0, + date_columns=None, + list_columns=None, + rowIdAndVersionInIndex=True, + dtype=None + ) + + # THEN I expect pandas.read_csv was called with default arguments + mock_read_csv.assert_called_once_with( + "dummy_path.csv", + dtype=None, + sep=synapseclient.table.DEFAULT_SEPARATOR, + quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER, + escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR, + header=0, + skiprows=0, + lineterminator='\n' + ) + + # AND I expect the returned DataFrame to be the same as the original DataFrame (file) + pd.testing.assert_frame_equal(df, expected_df) + + +def test_csv_to_pandas_df_with_kwargs() -> None: + # GIVEN a pandas DataFrame (CSV file stand-in) + expected_df = pd.DataFrame({ + 'col1': [1, 2, 3], + 'col2': ['a', 'b', 'c'] + }) + + with patch.object(pd, 'read_csv', return_value=expected_df) as mock_read_csv: + # WHEN I call _csv_to_pandas_df with custom keyword arguments + kwargs = { + "escapechar": "\\", + "keep_default_na": False + } + df = _csv_to_pandas_df( + filepath="dummy_path.csv", + separator=synapseclient.table.DEFAULT_SEPARATOR, + quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER, + escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR, + contain_headers=True, + lines_to_skip=0, + date_columns=None, + list_columns=None, + rowIdAndVersionInIndex=True, + dtype=None, + **kwargs + ) + + # THEN I expect pandas.read_csv was called with the keyword arguments + mock_read_csv.assert_called_once_with( + "dummy_path.csv", + dtype=None, + sep=synapseclient.table.DEFAULT_SEPARATOR, + quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER, + escapechar='\\', + header=0, + skiprows=0, + keep_default_na=False, + lineterminator='\n' + ) + + # AND I expect the returned DataFrame to match the expected DataFrame + pd.testing.assert_frame_equal(df, expected_df) + +def test_csv_to_pandas_df_calls_convert_date_cols(): + # GIVEN a pandas DataFrame (CSV file stand-in) with a date column + expected_df = pd.DataFrame({ + 'col1': [1, 2, 3], + 'date_col': ['2021-01-01', '2021-01-02', '2021-01-03'] + }) + + with patch.object(pd, 'read_csv', return_value=expected_df), patch.object(synapseclient.table, '_convert_df_date_cols_to_datetime') as mock_convert_dates: + # WHEN I call _csv_to_pandas_df with date_columns specified + _csv_to_pandas_df( + filepath="dummy_path.csv", + separator=synapseclient.table.DEFAULT_SEPARATOR, + quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER, + escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR, + contain_headers=True, + lines_to_skip=0, + date_columns=['date_col'], # Specify date column + list_columns=None, + rowIdAndVersionInIndex=True, + dtype=None + ) + + # THEN I expect _convert_df_date_cols_to_datetime to be called with the expected DataFrame and date columns + mock_convert_dates.assert_called_once_with(expected_df, ['date_col']) + +def test_csv_to_pandas_df_handles_list_columns(): + # GIVEN a pandas DataFrame (CSV file stand-in) with a list column + initial_df = pd.DataFrame({ + 'col1': [1, 2, 3], + 'list_col': ['[1, 2, 3]', '[4, 5, 6]', '[7, 8, 9]'] + }) + + # AND a pandas DataFrame (expected result) with the list column converted to a list + expected_final_df = pd.DataFrame({ + 'col1': [1, 2, 3], + 'list_col': [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + }) + + with patch.object(pd, 'read_csv', return_value=initial_df), patch.object(synapseclient.table, '_convert_df_date_cols_to_datetime'), patch.object(pd.Series, 'apply', return_value=expected_final_df['list_col']) as mock_apply: + # WHEN I call _csv_to_pandas_df with list_columns specified + result_df = synapseclient.table._csv_to_pandas_df( + filepath="dummy_path.csv", + separator=synapseclient.table.DEFAULT_SEPARATOR, + quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER, + escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR, + contain_headers=True, + lines_to_skip=0, + date_columns=None, + list_columns=['list_col'], # Specify list column + rowIdAndVersionInIndex=True, + dtype=None + ) + + # THEN I expect json.loads to be applied to the list column + mock_apply.assert_called_once_with(json.loads) + + # AND I expect the returned DataFrame to match the expected DataFrame + pd.testing.assert_frame_equal(result_df, expected_final_df) + +def test_csv_to_pandas_df_handles_row_id_and_version(): + # GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns + initial_df = pd.DataFrame({ + 'ROW_ID': [1, 2, 3], + 'ROW_VERSION': [1, 1, 2], + 'col1': ['a', 'b', 'c'], + 'col2': [10, 20, 30] + }) + + # AND a pandas DataFrame (expected result) with the ROW_ID and ROW_VERSION columns removed + expected_final_df = pd.DataFrame({ + 'col1': ['a', 'b', 'c'], + 'col2': [10, 20, 30] + }, index=['1_1', '2_1', '3_2']) # Index format: ROW_ID_ROW_VERSION + + with patch.object(pd, 'read_csv', return_value=initial_df), patch.object(synapseclient.table, 'row_labels_from_id_and_version', return_value=['1_1', '2_1', '3_2']) as mock_row_labels: + # WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True + result_df = synapseclient.table._csv_to_pandas_df( + filepath="dummy_path.csv", + separator=synapseclient.table.DEFAULT_SEPARATOR, + quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER, + escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR, + contain_headers=True, + lines_to_skip=0, + date_columns=None, + list_columns=None, + rowIdAndVersionInIndex=True, + dtype=None + ) + + # THEN I expect row_labels_from_id_and_version to be called once + mock_row_labels.assert_called_once() + + # AND I expect the returned DataFrame to match the expected DataFrame with the ROW_ID and ROW_VERSION columns removed + pd.testing.assert_frame_equal(result_df, expected_final_df) + + # AND I expect the index of the result_df to be as expected + assert list(result_df.index) == ['1_1', '2_1', '3_2'] + def test_schema() -> None: schema = Schema(name="My Table", parent="syn1000001")