Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYNPY-1521] Fixes asDataFrame kwarg Collision Issue #1132

Merged
merged 11 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 23 additions & 18 deletions synapseclient/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import re
import tempfile
from builtins import zip
from typing import Dict, List, Tuple, TypeVar, Union
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union

from synapseclient.core.constants import concrete_types
from synapseclient.core.exceptions import SynapseError
Expand Down Expand Up @@ -397,16 +397,16 @@ def _convert_df_date_cols_to_datetime(


def _csv_to_pandas_df(
filepath,
separator=DEFAULT_SEPARATOR,
quote_char=DEFAULT_QUOTE_CHARACTER,
escape_char=DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
filepath: str,
separator: str = DEFAULT_SEPARATOR,
quote_char: str = DEFAULT_QUOTE_CHARACTER,
escape_char: str = DEFAULT_ESCAPSE_CHAR,
contain_headers: bool = True,
lines_to_skip: int = 0,
date_columns: Optional[List[str]] = None,
list_columns: Optional[List[str]] = None,
rowIdAndVersionInIndex: bool = True,
dtype: Optional[Dict[str, Any]] = None,
**kwargs,
):
jaymedina marked this conversation as resolved.
Show resolved Hide resolved
"""
Expand Down Expand Up @@ -440,21 +440,26 @@ def _csv_to_pandas_df(

line_terminator = str(os.linesep)

pandas_args = {
"dtype": dtype,
"sep": separator,
"quotechar": quote_char,
"escapechar": escape_char,
"header": 0 if contain_headers else None,
"skiprows": lines_to_skip,
}
pandas_args.update(kwargs)
BWMac marked this conversation as resolved.
Show resolved Hide resolved

# assign line terminator only if for single character
# line terminators (e.g. not '\r\n') 'cause pandas doesn't
# longer line terminators. See: <https://github.com/pydata/pandas/issues/3501>
# "ValueError: Only length-1 line terminators supported"
df = pd.read_csv(
filepath,
dtype=dtype,
sep=separator,
lineterminator=line_terminator if len(line_terminator) == 1 else None,
quotechar=quote_char,
escapechar=escape_char,
header=0 if contain_headers else None,
skiprows=lines_to_skip,
**kwargs,
**pandas_args,
)

# parse date columns if exists
if date_columns:
df = _convert_df_date_cols_to_datetime(df, date_columns)
Expand Down
194 changes: 194 additions & 0 deletions tests/unit/synapseclient/unit_test_tables.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit test for synapseclient.table"""
import csv
import io
import json
import math
import os
import shutil
Expand Down Expand Up @@ -40,6 +41,7 @@
Table,
TableQueryResult,
_convert_df_date_cols_to_datetime,
_csv_to_pandas_df,
_get_view_type_mask,
_get_view_type_mask_for_deprecated_type,
as_table_columns,
Expand Down Expand Up @@ -251,6 +253,198 @@ def test_convert_df_date_cols_to_datetime() -> None:
assert_frame_equal(test_df2, expected_date_df)


def test_csv_to_pandas_df_no_kwargs():
# GIVEN a pandas DataFrame (CSV file stand-in)
expected_df = pd.DataFrame(
{"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [True, False, True]}
)

with patch.object(
pd, "read_csv", return_value=expected_df
) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
# WHEN I call _csv_to_pandas_df with default parameters
df = _csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
)

# THEN I expect pandas.read_csv was called with default arguments
mock_read_csv.assert_called_once_with(
"dummy_path.csv",
dtype=None,
sep=synapseclient.table.DEFAULT_SEPARATOR,
quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escapechar=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
header=0,
skiprows=0,
lineterminator=None,
)

# AND I expect the returned DataFrame to be
# the same as the original DataFrame (file)
pd.testing.assert_frame_equal(df, expected_df)


def test_csv_to_pandas_df_with_kwargs() -> None:
# GIVEN a pandas DataFrame (CSV file stand-in)
expected_df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
jaymedina marked this conversation as resolved.
Show resolved Hide resolved

with patch.object(
pd, "read_csv", return_value=expected_df
) as mock_read_csv, patch.object(os, "linesep", "\r\n"):
# WHEN I call _csv_to_pandas_df with custom keyword arguments
kwargs = {"escapechar": "\\", "keep_default_na": False}
df = _csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
**kwargs,
)

# THEN I expect pandas.read_csv was called with the keyword arguments
mock_read_csv.assert_called_once_with(
"dummy_path.csv",
dtype=None,
sep=synapseclient.table.DEFAULT_SEPARATOR,
quotechar=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escapechar="\\",
header=0,
skiprows=0,
keep_default_na=False,
lineterminator=None,
)

# AND I expect the returned DataFrame to match the expected DataFrame
pd.testing.assert_frame_equal(df, expected_df)


def test_csv_to_pandas_df_calls_convert_date_cols():
# GIVEN a pandas DataFrame (CSV file stand-in) with a date column
expected_df = pd.DataFrame(
{"col1": [1, 2, 3], "date_col": ["2021-01-01", "2021-01-02", "2021-01-03"]}
)

with patch.object(pd, "read_csv", return_value=expected_df), patch.object(
synapseclient.table, "_convert_df_date_cols_to_datetime"
) as mock_convert_dates:
# WHEN I call _csv_to_pandas_df with date_columns specified
_csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=["date_col"], # Specify date column
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
)

# THEN I expect _convert_df_date_cols_to_datetime to be
# called with the expected DataFrame and date columns
mock_convert_dates.assert_called_once_with(expected_df, ["date_col"])


def test_csv_to_pandas_df_handles_list_columns():
# GIVEN a pandas DataFrame (CSV file stand-in) with a list column
initial_df = pd.DataFrame(
{"col1": [1, 2, 3], "list_col": ["[1, 2, 3]", "[4, 5, 6]", "[7, 8, 9]"]}
)

# AND a pandas DataFrame (expected result) with the list column converted to a list
expected_final_df = pd.DataFrame(
{"col1": [1, 2, 3], "list_col": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}
)

with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
synapseclient.table, "_convert_df_date_cols_to_datetime"
), patch.object(
pd.Series, "apply", return_value=expected_final_df["list_col"]
) as mock_apply:
# WHEN I call _csv_to_pandas_df with list_columns specified
result_df = synapseclient.table._csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=["list_col"], # Specify list column
rowIdAndVersionInIndex=True,
dtype=None,
)

# THEN I expect json.loads to be applied to the list column
mock_apply.assert_called_once_with(json.loads)

# AND I expect the returned DataFrame to match the expected DataFrame
pd.testing.assert_frame_equal(result_df, expected_final_df)


def test_csv_to_pandas_df_handles_row_id_and_version():
# GIVEN a pandas DataFrame (CSV file stand-in) with ROW_ID and ROW_VERSION columns
initial_df = pd.DataFrame(
{
"ROW_ID": [1, 2, 3],
"ROW_VERSION": [1, 1, 2],
"col1": ["a", "b", "c"],
"col2": [10, 20, 30],
}
)

# AND a pandas DataFrame (expected result)
# with the ROW_ID and ROW_VERSION columns removed
expected_final_df = pd.DataFrame(
{"col1": ["a", "b", "c"], "col2": [10, 20, 30]}, index=["1_1", "2_1", "3_2"]
) # Index format: ROW_ID_ROW_VERSION

with patch.object(pd, "read_csv", return_value=initial_df), patch.object(
synapseclient.table,
"row_labels_from_id_and_version",
return_value=["1_1", "2_1", "3_2"],
) as mock_row_labels:
# WHEN I call _csv_to_pandas_df with rowIdAndVersionInIndex=True
result_df = synapseclient.table._csv_to_pandas_df(
filepath="dummy_path.csv",
separator=synapseclient.table.DEFAULT_SEPARATOR,
quote_char=synapseclient.table.DEFAULT_QUOTE_CHARACTER,
escape_char=synapseclient.table.DEFAULT_ESCAPSE_CHAR,
contain_headers=True,
lines_to_skip=0,
date_columns=None,
list_columns=None,
rowIdAndVersionInIndex=True,
dtype=None,
)

# THEN I expect row_labels_from_id_and_version to be called once
mock_row_labels.assert_called_once()

# AND I expect the returned DataFrame to match the expected
# DataFrame with the ROW_ID and ROW_VERSION columns removed
pd.testing.assert_frame_equal(result_df, expected_final_df)

# AND I expect the index of the result_df to be as expected
assert list(result_df.index) == ["1_1", "2_1", "3_2"]


def test_schema() -> None:
schema = Schema(name="My Table", parent="syn1000001")

Expand Down
Loading