Skip to content

Commit

Permalink
adds unit tests for _csv_to_pandas_df
Browse files Browse the repository at this point in the history
  • Loading branch information
BWMac committed Sep 20, 2024
1 parent 667a918 commit 51bab6a
Showing 1 changed file with 182 additions and 0 deletions.
182 changes: 182 additions & 0 deletions tests/unit/synapseclient/unit_test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io
import math
import os
import json
import shutil
import tempfile
import time
Expand Down Expand Up @@ -42,6 +43,7 @@
_convert_df_date_cols_to_datetime,
_get_view_type_mask,
_get_view_type_mask_for_deprecated_type,
_csv_to_pandas_df,
as_table_columns,
build_table,
cast_values,
Expand Down Expand Up @@ -250,6 +252,186 @@ def test_convert_df_date_cols_to_datetime() -> None:
test_df2 = _convert_df_date_cols_to_datetime(test_df, date_columns)
assert_frame_equal(test_df2, expected_date_df)

def test_csv_to_pandas_df_no_kwargs():
# GIVEN a pandas DataFrame (CSV file stand-in)
expected_df = pd.DataFrame({
'col1': [1, 2, 3],
'col2': ['a', 'b', 'c'],
'col3': [True, False, True]
})

with patch.object(pd, 'read_csv', return_value=expected_df) as mock_read_csv:
# WHEN I call _csv_to_pandas_df with default parameters
df = _csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None
)

# THEN I expect pandas.read_csv was called with default arguments
mock_read_csv.assert_called_once_with(
"dummy_path.csv",
dtype=None,
sep=synapseclient.table.DEFAULT_SEPARATOR,
quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
header=0,
skiprows=0,
lineterminator='\n'
)

# AND I expect the returned DataFrame to be the same as the original DataFrame (file)
pd.testing.assert_frame_equal(df, expected_df)


def test_csv_to_pandas_df_with_kwargs() -> None:
# GIVEN a pandas DataFrame (CSV file stand-in)
expected_df = pd.DataFrame({
'col1': [1, 2, 3],
'col2': ['a', 'b', 'c']
})

with patch.object(pd, 'read_csv', return_value=expected_df) as mock_read_csv:
# WHEN I call _csv_to_pandas_df with custom keyword arguments
kwargs = {
"escapechar": "\\",
"keep_default_na": False
}
df = _csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
**kwargs
)

# THEN I expect pandas.read_csv was called with the keyword arguments
mock_read_csv.assert_called_once_with(
"dummy_path.csv",
dtype=None,
sep=synapseclient.table.DEFAULT_SEPARATOR,
quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escapechar='\\',
header=0,
skiprows=0,
keep_default_na=False,
lineterminator='\n'
)

# AND I expect the returned DataFrame to match the expected DataFrame
pd.testing.assert_frame_equal(df, expected_df)

def test_csv_to_pandas_df_calls_convert_date_cols():
# GIVEN a pandas DataFrame (CSV file stand-in) with a date column
expected_df = pd.DataFrame({
'col1': [1, 2, 3],
'date_col': ['2021-01-01', '2021-01-02', '2021-01-03']
})

with patch.object(pd, 'read_csv', return_value=expected_df), patch.object(synapseclient.table, '_convert_df_date_cols_to_datetime') as mock_convert_dates:
# WHEN I call _csv_to_pandas_df with date_columns specified
_csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=['date_col'], # Specify date column
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None
)

# THEN I expect _convert_df_date_cols_to_datetime to be called with the expected DataFrame and date columns
mock_convert_dates.assert_called_once_with(expected_df, ['date_col'])

def test_csv_to_pandas_df_handles_list_columns():
# GIVEN a pandas DataFrame (CSV file stand-in) with a list column
initial_df = pd.DataFrame({
'col1': [1, 2, 3],
'list_col': ['[1, 2, 3]', '[4, 5, 6]', '[7, 8, 9]']
})

# AND a pandas DataFrame (expected result) with the list column converted to a list
expected_final_df = pd.DataFrame({
'col1': [1, 2, 3],
'list_col': [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
})

with patch.object(pd, 'read_csv', return_value=initial_df), patch.object(synapseclient.table, '_convert_df_date_cols_to_datetime'), patch.object(pd.Series, 'apply', return_value=expected_final_df['list_col']) as mock_apply:
# WHEN I call _csv_to_pandas_df with list_columns specified
result_df = synapseclient.table._csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=['list_col'], # Specify list column
rowIdAndVersionInIndex=True,
dtype=None
)

# THEN I expect json.loads to be applied to the list column
mock_apply.assert_called_once_with(json.loads)

# AND I expect the returned DataFrame to match the expected DataFrame
pd.testing.assert_frame_equal(result_df, expected_final_df)

def test_csv_to_pandas_df_handles_row_id_and_version():
# GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
initial_df = pd.DataFrame({
'ROW_ID': [1, 2, 3],
'ROW_VERSION': [1, 1, 2],
'col1': ['a', 'b', 'c'],
'col2': [10, 20, 30]
})

# AND a pandas DataFrame (expected result) with the ROW_ID and ROW_VERSION columns removed
expected_final_df = pd.DataFrame({
'col1': ['a', 'b', 'c'],
'col2': [10, 20, 30]
}, index=['1_1', '2_1', '3_2']) # Index format: ROW_ID_ROW_VERSION

with patch.object(pd, 'read_csv', return_value=initial_df), patch.object(synapseclient.table, 'row_labels_from_id_and_version', return_value=['1_1', '2_1', '3_2']) as mock_row_labels:
# WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True
result_df = synapseclient.table._csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None
)

# THEN I expect row_labels_from_id_and_version to be called once
mock_row_labels.assert_called_once()

# AND I expect the returned DataFrame to match the expected DataFrame with the ROW_ID and ROW_VERSION columns removed
pd.testing.assert_frame_equal(result_df, expected_final_df)

# AND I expect the index of the result_df to be as expected
assert list(result_df.index) == ['1_1', '2_1', '3_2']


def test_schema() -> None:
schema = Schema(name="My Table", parent="syn1000001")
Expand Down

0 comments on commit 51bab6a

Please sign in to comment.