Skip to content

Commit

Permalink
[SYNPY-1357] Correction to strip spaces before and after values (#1045)
Browse files Browse the repository at this point in the history
* Correction to strip spaces before and after values
  • Loading branch information
BryanFauble authored Jan 16, 2024
1 parent 018d3eb commit 66a4b30
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 7 deletions.
30 changes: 24 additions & 6 deletions synapseutils/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
ARRAY_BRACKET_PATTERN = re.compile(r"^\[.*\]$")
SINGLE_OPEN_BRACKET_PATTERN = re.compile(r"^\[")
SINGLE_CLOSING_BRACKET_PATTERN = re.compile(r"\]$")
# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes
COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")

tracer = trace.get_tracer("synapseclient")

Expand Down Expand Up @@ -968,6 +970,13 @@ def _write_manifest_data(
Write a number of keys and a list of data to a manifest file. This will write
the data out as a tab separated file.
For the data we are writing to the TSV file we are not quoting the content with any
characters. This is because the syncToSynapse function does not require strings to
be quoted. When quote characters were included extra double quotes were being added
to the strings when they were written to the manifest file. This was not causing
errors, however, it was changing the content of the manifest file when changes
were not required.
Args:
filename: The name of the file to write to.
keys: The keys of the manifest.
Expand All @@ -976,7 +985,13 @@ def _write_manifest_data(
"""
with io.open(filename, "w", encoding="utf8") if filename else sys.stdout as fp:
csv_writer = csv.DictWriter(
fp, keys, restval="", extrasaction="ignore", delimiter="\t"
fp,
keys,
restval="",
extrasaction="ignore",
delimiter="\t",
quotechar=None,
quoting=csv.QUOTE_NONE,
)
csv_writer.writeheader()
for row in data:
Expand Down Expand Up @@ -1215,21 +1230,23 @@ def syncToSynapse(

def _split_string(input_string: str) -> typing.List[str]:
"""
Use StringIO to create a file-like object for the csv.reader.
Use regex to split a string apart by commas that are not inside of double quotes.
Extract the first row (there should only be one row)
Args:
input_string: A string to split apart.
Returns:
The list of split items as strings.
"""
csv_reader = csv.reader(io.StringIO(input_string))
row = COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN.split(input_string)

row = next(csv_reader)
modified_row = []
for item in row:
modified_item = item.strip()
modified_row.append(modified_item)

return row
return modified_row


def _convert_cell_in_manifest_to_python_types(
Expand All @@ -1247,6 +1264,7 @@ def _convert_cell_in_manifest_to_python_types(
all that is present.
"""
values_to_return = []
cell = cell.strip()

if ARRAY_BRACKET_PATTERN.match(string=cell):
# Replace the first '[' with an empty string
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/synapseutils/test_synapseutils_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self):

self.header = "path parent used executed activityName synapseStore foo date_1 datetime_1 datetime_2 datetime_3 multiple_strings multiple_dates multiple_bools multiple_ints multiple_floats annotation_with_escaped_commas\n"
self.row1 = (
'%s %s %s "%s;https://www.example.com" provName bar 2020-01-01 2023-12-04T07:00:00Z 2023-12-05 23:37:02.995+00:00 2023-12-05 07:00:00+00:00 [a,b,c,d] [2020-01-01,2023-12-04T07:00:00.111Z,2023-12-05 23:37:02.333+00:00,2023-12-05 07:00:00+00:00] [fAlSe,False,tRuE,True] [1,2,3,4] [1.2,3.4,5.6,7.8] ["my, string with a comma","another, string with a comma"]\n'
'%s %s %s "%s;https://www.example.com" provName bar 2020-01-01 2023-12-04T07:00:00Z 2023-12-05 23:37:02.995+00:00 2023-12-05 07:00:00+00:00 [a, b,c, d] [2020-01-01,2023-12-04T07:00:00.111Z, 2023-12-05 23:37:02.333+00:00,2023-12-05 07:00:00+00:00] [fAlSe,False, tRuE,True ] [1,2,3,4] [1.2,3.4,5.6, 7.8] ["my, string with a comma", "another, string with a comma" ]\n'
% (
self.f1,
self.project.id,
Expand Down
215 changes: 215 additions & 0 deletions tests/unit/synapseutils/unit_test_synapseutils_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import random
import tempfile
import threading
import math

import pytest
from unittest.mock import ANY, patch, create_autospec, Mock, call, MagicMock
Expand Down Expand Up @@ -1415,3 +1416,217 @@ def test_generate_sync_manifest(syn):
patch_write_manifest_data.assert_called_with(
manifest_path, ["path", "parent"], patch_walk_directory_tree.return_value
)


class TestConvertCellInManifestToPythonTypes(object):
"""
Test the _convert_cell_in_manifest_to_python_types function for each type
and single/multiple values.
"""

def test_datetime_single_item(self) -> None:
# GIVEN a single item datetime string
datetime_string = "2020-01-01T00:00:00.000Z"
datetime_string_in_brackets = "[2020-01-01T00:00:00.000Z]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a datetime object
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
datetime_string
) == datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
datetime_string_in_brackets
) == datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)

def test_datetime_multiple_items(self) -> None:
# GIVEN a multiple item datetime string
datetime_string = "[2020-01-01T00:00:00.000Z, 2020-01-02T00:00:00.000Z]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a list of datetime objects
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
datetime_string
) == [
datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc),
datetime.datetime(2020, 1, 2, 0, 0, 0, 0, tzinfo=datetime.timezone.utc),
]

def test_bool_single_item(self) -> None:
# GIVEN a single item bool string
bool_string = "TrUe"
bool_string_in_brackets = "[tRue]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a bool object
assert (
synapseutils.sync._convert_cell_in_manifest_to_python_types(bool_string)
is True
)
assert (
synapseutils.sync._convert_cell_in_manifest_to_python_types(
bool_string_in_brackets
)
is True
)

def test_bool_multiple_items(self) -> None:
# GIVEN a multiple item bool string
bool_string = "[TrUe, fAlse]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a list of bool objects
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
bool_string
) == [True, False]

def test_int_single_item(self) -> None:
# GIVEN a single item int string
int_string = "123"
int_string_in_brackets = "[123]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a int object
assert (
synapseutils.sync._convert_cell_in_manifest_to_python_types(int_string)
== 123
)
assert (
synapseutils.sync._convert_cell_in_manifest_to_python_types(
int_string_in_brackets
)
== 123
)

def test_int_multiple_items(self) -> None:
# GIVEN a multiple item int string
int_string = "[123, 456]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a list of int objects
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
int_string
) == [123, 456]

def test_float_single_item(self) -> None:
# GIVEN a single item float string
float_string = "123.456"
float_string_in_brackets = "[123.456]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a float object
assert math.isclose(
synapseutils.sync._convert_cell_in_manifest_to_python_types(float_string),
123.456,
)
assert math.isclose(
synapseutils.sync._convert_cell_in_manifest_to_python_types(
float_string_in_brackets
),
123.456,
)

def test_float_multiple_items(self) -> None:
# GIVEN a multiple item float string
float_string = " [123.456, 789.012]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a list of float objects
result = synapseutils.sync._convert_cell_in_manifest_to_python_types(
float_string
)
assert math.isclose(result[0], 123.456)
assert math.isclose(result[1], 789.012)

def test_string_single_item(self) -> None:
# GIVEN a single item string
string = " foo"
string_in_brackets = " [foo]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a string object
assert (
synapseutils.sync._convert_cell_in_manifest_to_python_types(string) == "foo"
)
assert (
synapseutils.sync._convert_cell_in_manifest_to_python_types(
string_in_brackets
)
== "foo"
)

def test_string_multiple_items(self) -> None:
# GIVEN a multiple item string
string = " [foo, bar]"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a list of string objects
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(string) == [
"foo",
"bar",
]

def test_string_single_item_with_comma(self) -> None:
# GIVEN a single item string
string = "my, sentence, with, commas"

# WHEN _convert_cell_in_manifest_to_python_types is called
# THEN I expect the output to be a string object
assert (
synapseutils.sync._convert_cell_in_manifest_to_python_types(string)
== string
)


class TestSplitString(object):
"""
Test the _split_string function. Note as a pre-check to calling this function
the string would have started with brackets `[]`, but they were removed before
calling this function.
"""

def test_single_item(self) -> None:
# GIVEN single item strings
string_with_no_quotes = "foo"
string_with_quotes = '"foo"'
string_with_quotes_inside_string = 'foo "bar" baz'
string_with_commas_inside_string = '"foo, bar, baz"'

# WHEN split_strings is called

# THEN I expect the output to treat all values as a single item
assert synapseutils.sync._split_string(string_with_no_quotes) == [
string_with_no_quotes
]
assert synapseutils.sync._split_string(string_with_quotes) == [
string_with_quotes
]
assert synapseutils.sync._split_string(string_with_quotes_inside_string) == [
string_with_quotes_inside_string
]
assert synapseutils.sync._split_string(string_with_commas_inside_string) == [
string_with_commas_inside_string
]

def test_multiple_item(self) -> None:
# GIVEN multiple item strings
string_with_no_quotes = "foo, bar, baz"
string_with_quotes = '"foo", "bar", "baz"'
string_with_commas_in_some_items = '"foo, bar", baz, "foo, bar, baz"'

# WHEN split_strings is called
# THEN I expect the output to split the string into multiple items
assert synapseutils.sync._split_string(string_with_no_quotes) == [
"foo",
"bar",
"baz",
]
assert synapseutils.sync._split_string(string_with_quotes) == [
'"foo"',
'"bar"',
'"baz"',
]
assert synapseutils.sync._split_string(string_with_commas_in_some_items) == [
'"foo, bar"',
"baz",
'"foo, bar, baz"',
]

0 comments on commit 66a4b30

Please sign in to comment.