diff --git a/synapseutils/sync.py b/synapseutils/sync.py index c5f0bc064..0f65e74bc 100644 --- a/synapseutils/sync.py +++ b/synapseutils/sync.py @@ -60,6 +60,8 @@ ARRAY_BRACKET_PATTERN = re.compile(r"^\[.*\]$") SINGLE_OPEN_BRACKET_PATTERN = re.compile(r"^\[") SINGLE_CLOSING_BRACKET_PATTERN = re.compile(r"\]$") +# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes +COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)") tracer = trace.get_tracer("synapseclient") @@ -968,6 +970,13 @@ def _write_manifest_data( Write a number of keys and a list of data to a manifest file. This will write the data out as a tab separated file. + For the data we are writing to the TSV file we are not quoting the content with any + characters. This is because the syncToSynapse function does not require strings to + be quoted. When quote characters were included extra double quotes were being added + to the strings when they were written to the manifest file. This was not causing + errors, however, it was changing the content of the manifest file when changes + were not required. + Args: filename: The name of the file to write to. keys: The keys of the manifest. @@ -976,7 +985,13 @@ def _write_manifest_data( """ with io.open(filename, "w", encoding="utf8") if filename else sys.stdout as fp: csv_writer = csv.DictWriter( - fp, keys, restval="", extrasaction="ignore", delimiter="\t" + fp, + keys, + restval="", + extrasaction="ignore", + delimiter="\t", + quotechar=None, + quoting=csv.QUOTE_NONE, ) csv_writer.writeheader() for row in data: @@ -1215,9 +1230,8 @@ def syncToSynapse( def _split_string(input_string: str) -> typing.List[str]: """ - Use StringIO to create a file-like object for the csv.reader. + Use regex to split a string apart by commas that are not inside of double quotes. - Extract the first row (there should only be one row) Args: input_string: A string to split apart. @@ -1225,11 +1239,14 @@ def _split_string(input_string: str) -> typing.List[str]: Returns: The list of split items as strings. """ - csv_reader = csv.reader(io.StringIO(input_string)) + row = COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN.split(input_string) - row = next(csv_reader) + modified_row = [] + for item in row: + modified_item = item.strip() + modified_row.append(modified_item) - return row + return modified_row def _convert_cell_in_manifest_to_python_types( @@ -1247,6 +1264,7 @@ def _convert_cell_in_manifest_to_python_types( all that is present. """ values_to_return = [] + cell = cell.strip() if ARRAY_BRACKET_PATTERN.match(string=cell): # Replace the first '[' with an empty string diff --git a/tests/integration/synapseutils/test_synapseutils_sync.py b/tests/integration/synapseutils/test_synapseutils_sync.py index 81f5c1d5c..f46a33163 100644 --- a/tests/integration/synapseutils/test_synapseutils_sync.py +++ b/tests/integration/synapseutils/test_synapseutils_sync.py @@ -37,7 +37,7 @@ def __init__(self): self.header = "path parent used executed activityName synapseStore foo date_1 datetime_1 datetime_2 datetime_3 multiple_strings multiple_dates multiple_bools multiple_ints multiple_floats annotation_with_escaped_commas\n" self.row1 = ( - '%s %s %s "%s;https://www.example.com" provName bar 2020-01-01 2023-12-04T07:00:00Z 2023-12-05 23:37:02.995+00:00 2023-12-05 07:00:00+00:00 [a,b,c,d] [2020-01-01,2023-12-04T07:00:00.111Z,2023-12-05 23:37:02.333+00:00,2023-12-05 07:00:00+00:00] [fAlSe,False,tRuE,True] [1,2,3,4] [1.2,3.4,5.6,7.8] ["my, string with a comma","another, string with a comma"]\n' + '%s %s %s "%s;https://www.example.com" provName bar 2020-01-01 2023-12-04T07:00:00Z 2023-12-05 23:37:02.995+00:00 2023-12-05 07:00:00+00:00 [a, b,c, d] [2020-01-01,2023-12-04T07:00:00.111Z, 2023-12-05 23:37:02.333+00:00,2023-12-05 07:00:00+00:00] [fAlSe,False, tRuE,True ] [1,2,3,4] [1.2,3.4,5.6, 7.8] ["my, string with a comma", "another, string with a comma" ]\n' % ( self.f1, self.project.id, diff --git a/tests/unit/synapseutils/unit_test_synapseutils_sync.py b/tests/unit/synapseutils/unit_test_synapseutils_sync.py index 0439e3602..c1558f25b 100644 --- a/tests/unit/synapseutils/unit_test_synapseutils_sync.py +++ b/tests/unit/synapseutils/unit_test_synapseutils_sync.py @@ -8,6 +8,7 @@ import random import tempfile import threading +import math import pytest from unittest.mock import ANY, patch, create_autospec, Mock, call, MagicMock @@ -1415,3 +1416,217 @@ def test_generate_sync_manifest(syn): patch_write_manifest_data.assert_called_with( manifest_path, ["path", "parent"], patch_walk_directory_tree.return_value ) + + +class TestConvertCellInManifestToPythonTypes(object): + """ + Test the _convert_cell_in_manifest_to_python_types function for each type + and single/multiple values. + """ + + def test_datetime_single_item(self) -> None: + # GIVEN a single item datetime string + datetime_string = "2020-01-01T00:00:00.000Z" + datetime_string_in_brackets = "[2020-01-01T00:00:00.000Z]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a datetime object + assert synapseutils.sync._convert_cell_in_manifest_to_python_types( + datetime_string + ) == datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc) + assert synapseutils.sync._convert_cell_in_manifest_to_python_types( + datetime_string_in_brackets + ) == datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc) + + def test_datetime_multiple_items(self) -> None: + # GIVEN a multiple item datetime string + datetime_string = "[2020-01-01T00:00:00.000Z, 2020-01-02T00:00:00.000Z]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a list of datetime objects + assert synapseutils.sync._convert_cell_in_manifest_to_python_types( + datetime_string + ) == [ + datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(2020, 1, 2, 0, 0, 0, 0, tzinfo=datetime.timezone.utc), + ] + + def test_bool_single_item(self) -> None: + # GIVEN a single item bool string + bool_string = "TrUe" + bool_string_in_brackets = "[tRue]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a bool object + assert ( + synapseutils.sync._convert_cell_in_manifest_to_python_types(bool_string) + is True + ) + assert ( + synapseutils.sync._convert_cell_in_manifest_to_python_types( + bool_string_in_brackets + ) + is True + ) + + def test_bool_multiple_items(self) -> None: + # GIVEN a multiple item bool string + bool_string = "[TrUe, fAlse]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a list of bool objects + assert synapseutils.sync._convert_cell_in_manifest_to_python_types( + bool_string + ) == [True, False] + + def test_int_single_item(self) -> None: + # GIVEN a single item int string + int_string = "123" + int_string_in_brackets = "[123]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a int object + assert ( + synapseutils.sync._convert_cell_in_manifest_to_python_types(int_string) + == 123 + ) + assert ( + synapseutils.sync._convert_cell_in_manifest_to_python_types( + int_string_in_brackets + ) + == 123 + ) + + def test_int_multiple_items(self) -> None: + # GIVEN a multiple item int string + int_string = "[123, 456]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a list of int objects + assert synapseutils.sync._convert_cell_in_manifest_to_python_types( + int_string + ) == [123, 456] + + def test_float_single_item(self) -> None: + # GIVEN a single item float string + float_string = "123.456" + float_string_in_brackets = "[123.456]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a float object + assert math.isclose( + synapseutils.sync._convert_cell_in_manifest_to_python_types(float_string), + 123.456, + ) + assert math.isclose( + synapseutils.sync._convert_cell_in_manifest_to_python_types( + float_string_in_brackets + ), + 123.456, + ) + + def test_float_multiple_items(self) -> None: + # GIVEN a multiple item float string + float_string = " [123.456, 789.012]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a list of float objects + result = synapseutils.sync._convert_cell_in_manifest_to_python_types( + float_string + ) + assert math.isclose(result[0], 123.456) + assert math.isclose(result[1], 789.012) + + def test_string_single_item(self) -> None: + # GIVEN a single item string + string = " foo" + string_in_brackets = " [foo]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a string object + assert ( + synapseutils.sync._convert_cell_in_manifest_to_python_types(string) == "foo" + ) + assert ( + synapseutils.sync._convert_cell_in_manifest_to_python_types( + string_in_brackets + ) + == "foo" + ) + + def test_string_multiple_items(self) -> None: + # GIVEN a multiple item string + string = " [foo, bar]" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a list of string objects + assert synapseutils.sync._convert_cell_in_manifest_to_python_types(string) == [ + "foo", + "bar", + ] + + def test_string_single_item_with_comma(self) -> None: + # GIVEN a single item string + string = "my, sentence, with, commas" + + # WHEN _convert_cell_in_manifest_to_python_types is called + # THEN I expect the output to be a string object + assert ( + synapseutils.sync._convert_cell_in_manifest_to_python_types(string) + == string + ) + + +class TestSplitString(object): + """ + Test the _split_string function. Note as a pre-check to calling this function + the string would have started with brackets `[]`, but they were removed before + calling this function. + """ + + def test_single_item(self) -> None: + # GIVEN single item strings + string_with_no_quotes = "foo" + string_with_quotes = '"foo"' + string_with_quotes_inside_string = 'foo "bar" baz' + string_with_commas_inside_string = '"foo, bar, baz"' + + # WHEN split_strings is called + + # THEN I expect the output to treat all values as a single item + assert synapseutils.sync._split_string(string_with_no_quotes) == [ + string_with_no_quotes + ] + assert synapseutils.sync._split_string(string_with_quotes) == [ + string_with_quotes + ] + assert synapseutils.sync._split_string(string_with_quotes_inside_string) == [ + string_with_quotes_inside_string + ] + assert synapseutils.sync._split_string(string_with_commas_inside_string) == [ + string_with_commas_inside_string + ] + + def test_multiple_item(self) -> None: + # GIVEN multiple item strings + string_with_no_quotes = "foo, bar, baz" + string_with_quotes = '"foo", "bar", "baz"' + string_with_commas_in_some_items = '"foo, bar", baz, "foo, bar, baz"' + + # WHEN split_strings is called + # THEN I expect the output to split the string into multiple items + assert synapseutils.sync._split_string(string_with_no_quotes) == [ + "foo", + "bar", + "baz", + ] + assert synapseutils.sync._split_string(string_with_quotes) == [ + '"foo"', + '"bar"', + '"baz"', + ] + assert synapseutils.sync._split_string(string_with_commas_in_some_items) == [ + '"foo, bar"', + "baz", + '"foo, bar, baz"', + ]