From 887b0cc1bd971ce61d768b82706c7b48df5534d6 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 8 Aug 2024 22:36:36 -0400 Subject: [PATCH] to_integer/to_float/to_number refinement --- dcicutils/misc_utils.py | 126 +++++++++++++++++++---------------- dcicutils/structured_data.py | 14 ++-- pyproject.toml | 2 +- test/test_misc_utils.py | 73 ++++++++++---------- 4 files changed, 113 insertions(+), 102 deletions(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 80e1d0e45..b5a72fb47 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -986,53 +986,61 @@ def str_to_bool(x: Optional[str]) -> Optional[bool]: raise ValueError(f"An argument to str_or_bool must be a string or None: {x!r}") -def to_integer(value: str, fallback: Optional[Any] = None, strict: bool = False) -> Optional[Any]: - try: - return int(value) - except Exception: - if strict is not True: - try: - return int(float(value)) - except Exception: - pass - return fallback - - -_MULTIPLIER_K = 1000 -_MULTIPLIER_M = 1000 * _MULTIPLIER_K -_MULTIPLIER_G = 1000 * _MULTIPLIER_M -_MULTIPLIER_T = 1000 * _MULTIPLIER_G - -_MULTIPLIER_SUFFIXES = { - "K": _MULTIPLIER_K, - "KB": _MULTIPLIER_K, - "M": _MULTIPLIER_M, - "MB": _MULTIPLIER_M, - "G": _MULTIPLIER_G, - "GB": _MULTIPLIER_G, - "T": _MULTIPLIER_T, - "TB": _MULTIPLIER_T +def to_integer(value: str, + allow_commas: bool = False, + allow_multiplier_suffix: bool = False, + fallback: Optional[Union[int, float]] = None) -> Optional[int]: + return to_number(value, fallback=fallback, as_float=False, + allow_commas=allow_commas, + allow_multiplier_suffix=allow_multiplier_suffix) + + +def to_float(value: str, + allow_commas: bool = False, + allow_multiplier_suffix: bool = False, + fallback: Optional[Union[int, float]] = None) -> Optional[int]: + return to_number(value, fallback=fallback, as_float=True, + allow_commas=allow_commas, + allow_multiplier_suffix=allow_multiplier_suffix) + + +_TO_NUMBER_MULTIPLIER_K = 1000 +_TO_NUMBER_MULTIPLIER_M = 1000 * _TO_NUMBER_MULTIPLIER_K +_TO_NUMBER_MULTIPLIER_G = 1000 * _TO_NUMBER_MULTIPLIER_M +_TO_NUMBER_MULTIPLIER_T = 1000 * _TO_NUMBER_MULTIPLIER_G + +_TO_NUMBER_MULTIPLIER_SUFFIXES = { + "K": _TO_NUMBER_MULTIPLIER_K, + "KB": _TO_NUMBER_MULTIPLIER_K, + "M": _TO_NUMBER_MULTIPLIER_M, + "MB": _TO_NUMBER_MULTIPLIER_M, + "G": _TO_NUMBER_MULTIPLIER_G, + "GB": _TO_NUMBER_MULTIPLIER_G, + "T": _TO_NUMBER_MULTIPLIER_T, + "TB": _TO_NUMBER_MULTIPLIER_T } def to_number(value: str, + as_float: bool = False, allow_commas: bool = False, allow_multiplier_suffix: bool = False, - allow_float: bool = False, fallback: Optional[Union[int, float]] = None) -> Optional[Union[int, float]]: - """ - Converts the give string value to an int, or possibly float if allow_float is True. - If allow_commas is True (default: False) then allow commas (only) every three digits. - If allow_multiplier_suffix is True (default: False) allow any of K, Kb, KB; or M, Mb, MB; - or G, Gb, or GB; or T, Tb, TB, to mean multiply value by one thousand; one million; - one billion; or one trillion; respectively. If allow_float is True (default: False) - allow the value to be floating point (i.e. with a decimal point and a fractional part), - in which case the returned value will be of type float, if it needs to be, and not int. - If the string is not well formated then returns None. + Converts the given string value to an int, or float if as_float is True, or None if malformed. + If allow_commas is True then allow commas (only) every three digits. If allow_multiplier_suffix + is True allow any of K, KB; or M, MB; or G, or GB; or T, TB (case-insensitive), to mean multiply + value by one thousand; one million; one billion; or one trillion; respectively. If as_float is True + then value is parsed and returned as a float rather than int. Note that even if as_float is False, + values that might look like a float, can be an int, for example, "1.5K", returns 1500 as an int; + but "1.5002K" returns None, i.e. since 1.5002K is 1500.2 which is not an int. """ + if not (isinstance(value, str) and (value := value.strip())): - return value if isinstance(value, (int, float)) else fallback + if as_float is True: + return float(value) if isinstance(value, (float, int)) else fallback + else: + return value if isinstance(value, int) else fallback value_multiplier = 1 value_negative = False @@ -1048,21 +1056,28 @@ def to_number(value: str, if allow_multiplier_suffix is True: value_upper = value.upper() - for suffix in _MULTIPLIER_SUFFIXES: + for suffix in _TO_NUMBER_MULTIPLIER_SUFFIXES: if value_upper.endswith(suffix): - value_multiplier *= _MULTIPLIER_SUFFIXES[suffix] + value_multiplier *= _TO_NUMBER_MULTIPLIER_SUFFIXES[suffix] if not (value := value[:-len(suffix)].strip()): return fallback break - if allow_float is True: + if (allow_multiplier_suffix is True) or (as_float is True): + # Allow for example "1.5K" to mean 1500 (integer). if (dot_index := value.rfind(".")) >= 0: if value_fraction := value[dot_index + 1:].strip(): try: value_fraction = float(f"0.{value_fraction}") except Exception: return fallback - value = value[:dot_index].strip() + if not (value := value[:dot_index].strip()): + if not value_fraction: + return fallback + value = "0" + elif (as_float is not True) and (value_dot_zero_suffix := re.search(r"\.0*$", value)): + # Allow for example "123.00" to mean 123 (integer). + value = value[:value_dot_zero_suffix.start()] if (allow_commas is True) and ("," in value): if not re.fullmatch(r"(-?\d{1,3}(,\d{3})*)", value): @@ -1072,28 +1087,23 @@ def to_number(value: str, if not value.isdigit(): return fallback - result = int(value) + value = float(value) if as_float is True else int(value) if value_fraction: - result += value_fraction - - result *= value_multiplier + value_float = (float(value) + value_fraction) * float(value_multiplier) + if as_float is True: + value = value_float + else: + value = int(value_float) + if value_float != value: + return fallback + else: + value *= value_multiplier if value_negative: - result = -result - - if allow_float is True: - if result == int(result): - result = int(result) - - return result + value = -value - -def to_float(value: str, fallback: Optional[Any] = None) -> Optional[Any]: - try: - return float(value) - except Exception: - return fallback + return value def to_boolean(value: str, fallback: Optional[Any]) -> Optional[Any]: diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py index b1d57c600..dd00a8900 100644 --- a/dcicutils/structured_data.py +++ b/dcicutils/structured_data.py @@ -13,7 +13,7 @@ from dcicutils.datetime_utils import normalize_date_string, normalize_datetime_string from dcicutils.misc_utils import (create_dict, create_readonly_object, is_uuid, load_json_if, merge_objects, remove_empty_properties, right_trim, split_string, - to_boolean, to_enum, to_integer, to_number, VirtualApp) + to_boolean, to_enum, to_float, to_integer, VirtualApp) from dcicutils.portal_object_utils import PortalObject from dcicutils.portal_utils import Portal as PortalBase from dcicutils.submitr.progress_constants import PROGRESS_PARSE as PROGRESS @@ -727,9 +727,9 @@ def _map_function_integer(self, typeinfo: dict) -> Callable: allow_multiplier_suffix = typeinfo.get("allow_multiplier_suffix") is True def map_integer(value: str, src: Optional[str]) -> Any: # noqa nonlocal allow_commas, allow_multiplier_suffix - return to_number(value, fallback=value, allow_float=False, - allow_commas=allow_commas, - allow_multiplier_suffix=allow_multiplier_suffix) + return to_integer(value, fallback=value, + allow_commas=allow_commas, + allow_multiplier_suffix=allow_multiplier_suffix) return map_integer def _map_function_number(self, typeinfo: dict) -> Callable: @@ -737,9 +737,9 @@ def _map_function_number(self, typeinfo: dict) -> Callable: allow_multiplier_suffix = typeinfo.get("allow_multiplier_suffix") is True def map_number(value: str, src: Optional[str]) -> Any: # noqa nonlocal allow_commas, allow_multiplier_suffix - return to_number(value, fallback=value, allow_float=True, - allow_commas=allow_commas, - allow_multiplier_suffix=allow_multiplier_suffix) + return to_float(value, fallback=value, + allow_commas=allow_commas, + allow_multiplier_suffix=allow_multiplier_suffix) return map_number def _map_function_string(self, typeinfo: dict) -> Callable: diff --git a/pyproject.toml b/pyproject.toml index 8155b6297..e44a3352b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.13.3.1b25" # TODO: To become 8.14.0 +version = "8.13.3.1b26" # TODO: To become 8.14.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index 5a4262d47..8898dae6f 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -31,7 +31,7 @@ ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents, - pad_to, JsonLinesReader, split_string, merge_objects, to_integer, to_number, + pad_to, JsonLinesReader, split_string, merge_objects, to_float, to_integer, load_json_from_file_expanding_environment_variables, create_readonly_object ) from dcicutils.qa_utils import ( @@ -3703,12 +3703,45 @@ def test_merge_objects_9(): def test_to_integer(): assert to_integer("17") == 17 assert to_integer("17.0") == 17 - assert to_integer("17.1") == 17 - assert to_integer("17.9", "123") == 17 + assert to_integer("17.1") is None + assert to_integer("17.9", fallback="123") == "123" assert to_integer("0") == 0 assert to_integer("0.0") == 0 assert to_integer("asdf") is None - assert to_integer("asdf", "myfallback") == "myfallback" + assert to_integer("asdf", fallback="myfallback") == "myfallback" + + assert to_integer("1234") == 1234 + assert to_integer("1,234,567") is None + assert to_integer("27500") == 27500 + assert to_integer("27500", allow_commas=True) == 27500 + assert to_integer("1,234,567", allow_commas=True) == 1234567 + assert to_integer("1K", allow_multiplier_suffix=True) == 1000 + assert to_integer("1Kb", allow_multiplier_suffix=True) == 1000 + assert to_integer("1kB", allow_multiplier_suffix=True) == 1000 + assert to_integer("2M", allow_multiplier_suffix=True) == 2000000 + assert to_integer("2Mb", allow_multiplier_suffix=True) == 2000000 + assert to_integer("2MB", allow_multiplier_suffix=True) == 2000000 + assert to_integer("3G", allow_multiplier_suffix=True) == 3000000000 + assert to_integer("3Gb", allow_multiplier_suffix=True) == 3000000000 + assert to_integer("3GB", allow_multiplier_suffix=True) == 3000000000 + assert to_integer("4T", allow_multiplier_suffix=True) == 4000000000000 + assert to_integer("4Tb", allow_multiplier_suffix=True) == 4000000000000 + assert to_integer("4TB", allow_multiplier_suffix=True) == 4000000000000 + assert to_integer("1,234,567K", allow_commas=True) is None + assert to_integer("1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == 1234567000 + assert to_integer("-1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == -1234567000 + assert to_integer(4321) == 4321 + # TODO: More ... + pass + + +def test_to_float(): + assert to_float("789") == 789.0 + assert type(to_float("789")) == float + assert to_float("1234.0567") == 1234.0567 + assert to_float("1.5K", allow_multiplier_suffix=True) == 1500 + assert type(to_float("1.5K", allow_multiplier_suffix=True)) == float + assert to_float(4321.1234) == 4321.1234 def test_load_json_from_file_expanding_environment_variables(): @@ -3727,35 +3760,3 @@ def test_create_readonly_object(): assert a.ghi == 456 assert a.jk == "xyzzy" assert a.lmnop == {"greeting": "Hello, world!"} - - -def test_to_number(): - assert to_number("1234") == 1234 - assert to_number("1,234,567") is None - assert to_number("27500") == 27500 - assert to_number("789", allow_float=True) == 789.0 - assert type(to_number("789", allow_float=True)) == int - assert to_number("27500", allow_commas=True) == 27500 - assert to_number("1,234,567", allow_commas=True) == 1234567 - assert to_number("1234.0567", allow_float=True) == 1234.0567 - assert to_number("1K", allow_multiplier_suffix=True) == 1000 - assert to_number("1Kb", allow_multiplier_suffix=True) == 1000 - assert to_number("1kB", allow_multiplier_suffix=True) == 1000 - assert to_number("2M", allow_multiplier_suffix=True) == 2000000 - assert to_number("2Mb", allow_multiplier_suffix=True) == 2000000 - assert to_number("2MB", allow_multiplier_suffix=True) == 2000000 - assert to_number("3G", allow_multiplier_suffix=True) == 3000000000 - assert to_number("3Gb", allow_multiplier_suffix=True) == 3000000000 - assert to_number("3GB", allow_multiplier_suffix=True) == 3000000000 - assert to_number("4T", allow_multiplier_suffix=True) == 4000000000000 - assert to_number("4Tb", allow_multiplier_suffix=True) == 4000000000000 - assert to_number("4TB", allow_multiplier_suffix=True) == 4000000000000 - assert to_number("1,234,567K", allow_commas=True) is None - assert to_number("1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == 1234567000 - assert to_number("-1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == -1234567000 - assert to_number(4321) == 4321 - assert to_number(4321.1234) == 4321.1234 - assert to_number("1.5K", allow_multiplier_suffix=True, allow_float=True) == 1500 - assert type(to_number("1.5K", allow_multiplier_suffix=True, allow_float=True)) == int - # TODO: More ... - pass