Skip to content

Commit

Permalink
to_integer/to_float/to_number refinement
Browse files Browse the repository at this point in the history
  • Loading branch information
dmichaels-harvard committed Aug 9, 2024
1 parent 83e1214 commit 887b0cc
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 102 deletions.
126 changes: 68 additions & 58 deletions dcicutils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -986,53 +986,61 @@ def str_to_bool(x: Optional[str]) -> Optional[bool]:
raise ValueError(f"An argument to str_or_bool must be a string or None: {x!r}")


def to_integer(value: str, fallback: Optional[Any] = None, strict: bool = False) -> Optional[Any]:
    """
    Converts the given value to an int, returning the given fallback if it cannot be converted.
    The value is first passed directly to int(); if that fails, and strict is not True, then the
    value is converted via float() and truncated toward zero, for example, "17.9" yields 17.
    If strict is True then only values which int() itself accepts are converted.
    """
    # Only the errors which int() and float() raise for unconvertible input are treated
    # as "not a number" (OverflowError covers an infinite float); anything else propagates.
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass
    if strict is not True:
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            pass
    return fallback


_MULTIPLIER_K = 1000
_MULTIPLIER_M = 1000 * _MULTIPLIER_K
_MULTIPLIER_G = 1000 * _MULTIPLIER_M
_MULTIPLIER_T = 1000 * _MULTIPLIER_G

_MULTIPLIER_SUFFIXES = {
"K": _MULTIPLIER_K,
"KB": _MULTIPLIER_K,
"M": _MULTIPLIER_M,
"MB": _MULTIPLIER_M,
"G": _MULTIPLIER_G,
"GB": _MULTIPLIER_G,
"T": _MULTIPLIER_T,
"TB": _MULTIPLIER_T
def to_integer(value: str,
               allow_commas: bool = False,
               allow_multiplier_suffix: bool = False,
               fallback: Optional[Union[int, float]] = None) -> Optional[int]:
    """
    Converts the given value to an int by delegating to to_number with as_float set to False;
    the allow_commas, allow_multiplier_suffix, and fallback arguments are passed through as-is.
    """
    return to_number(
        value,
        as_float=False,
        allow_commas=allow_commas,
        allow_multiplier_suffix=allow_multiplier_suffix,
        fallback=fallback,
    )


def to_float(value: str,
             allow_commas: bool = False,
             allow_multiplier_suffix: bool = False,
             fallback: Optional[Union[int, float]] = None) -> Optional[float]:
    """
    Converts the given value to a float by delegating to to_number with as_float set to True;
    the allow_commas, allow_multiplier_suffix, and fallback arguments are passed through as-is.
    """
    return to_number(value, fallback=fallback, as_float=True,
                     allow_commas=allow_commas,
                     allow_multiplier_suffix=allow_multiplier_suffix)


# Decimal (power-of-1000) multipliers for the suffixes recognized by to_number.
_TO_NUMBER_MULTIPLIER_K = 10 ** 3
_TO_NUMBER_MULTIPLIER_M = 10 ** 6
_TO_NUMBER_MULTIPLIER_G = 10 ** 9
_TO_NUMBER_MULTIPLIER_T = 10 ** 12

# Maps each upper-case suffix to its multiplier; every single-letter
# suffix also has a variant with a trailing "B" which has the same value.
_TO_NUMBER_MULTIPLIER_SUFFIXES = {
    suffix: multiplier
    for letter, multiplier in (
        ("K", _TO_NUMBER_MULTIPLIER_K),
        ("M", _TO_NUMBER_MULTIPLIER_M),
        ("G", _TO_NUMBER_MULTIPLIER_G),
        ("T", _TO_NUMBER_MULTIPLIER_T),
    )
    for suffix in (letter, letter + "B")
}


def to_number(value: str,
as_float: bool = False,
allow_commas: bool = False,
allow_multiplier_suffix: bool = False,
allow_float: bool = False,
fallback: Optional[Union[int, float]] = None) -> Optional[Union[int, float]]:

"""
Converts the given string value to an int, or possibly float if allow_float is True.
If allow_commas is True (default: False) then allow commas (only) every three digits.
If allow_multiplier_suffix is True (default: False) allow any of K, Kb, KB; or M, Mb, MB;
or G, Gb, or GB; or T, Tb, TB, to mean multiply value by one thousand; one million;
one billion; or one trillion; respectively. If allow_float is True (default: False)
allow the value to be floating point (i.e. with a decimal point and a fractional part),
in which case the returned value will be of type float, if it needs to be, and not int.
If the string is not well formatted then returns None.
Converts the given string value to an int, or float if as_float is True, or None if malformed.
If allow_commas is True then allow commas (only) every three digits. If allow_multiplier_suffix
is True allow any of K, KB; or M, MB; or G, GB; or T, TB (case-insensitive), to mean multiply
value by one thousand; one million; one billion; or one trillion; respectively. If as_float is True
then value is parsed and returned as a float rather than int. Note that even if as_float is False,
values that might look like a float, can be an int, for example, "1.5K", returns 1500 as an int;
but "1.5002K" returns None, i.e. since 1.5002K is 1500.2 which is not an int.
"""

if not (isinstance(value, str) and (value := value.strip())):
return value if isinstance(value, (int, float)) else fallback
if as_float is True:
return float(value) if isinstance(value, (float, int)) else fallback
else:
return value if isinstance(value, int) else fallback

value_multiplier = 1
value_negative = False
Expand All @@ -1048,21 +1056,28 @@ def to_number(value: str,

if allow_multiplier_suffix is True:
value_upper = value.upper()
for suffix in _MULTIPLIER_SUFFIXES:
for suffix in _TO_NUMBER_MULTIPLIER_SUFFIXES:
if value_upper.endswith(suffix):
value_multiplier *= _MULTIPLIER_SUFFIXES[suffix]
value_multiplier *= _TO_NUMBER_MULTIPLIER_SUFFIXES[suffix]
if not (value := value[:-len(suffix)].strip()):
return fallback
break

if allow_float is True:
if (allow_multiplier_suffix is True) or (as_float is True):
# Allow for example "1.5K" to mean 1500 (integer).
if (dot_index := value.rfind(".")) >= 0:
if value_fraction := value[dot_index + 1:].strip():
try:
value_fraction = float(f"0.{value_fraction}")
except Exception:
return fallback
value = value[:dot_index].strip()
if not (value := value[:dot_index].strip()):
if not value_fraction:
return fallback
value = "0"
elif (as_float is not True) and (value_dot_zero_suffix := re.search(r"\.0*$", value)):
# Allow for example "123.00" to mean 123 (integer).
value = value[:value_dot_zero_suffix.start()]

if (allow_commas is True) and ("," in value):
if not re.fullmatch(r"(-?\d{1,3}(,\d{3})*)", value):
Expand All @@ -1072,28 +1087,23 @@ def to_number(value: str,
if not value.isdigit():
return fallback

result = int(value)
value = float(value) if as_float is True else int(value)

if value_fraction:
result += value_fraction

result *= value_multiplier
value_float = (float(value) + value_fraction) * float(value_multiplier)
if as_float is True:
value = value_float
else:
value = int(value_float)
if value_float != value:
return fallback
else:
value *= value_multiplier

if value_negative:
result = -result

if allow_float is True:
if result == int(result):
result = int(result)

return result
value = -value


def to_float(value: str, fallback: Optional[Any] = None) -> Optional[Any]:
try:
return float(value)
except Exception:
return fallback
return value


def to_boolean(value: str, fallback: Optional[Any]) -> Optional[Any]:
Expand Down
14 changes: 7 additions & 7 deletions dcicutils/structured_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from dcicutils.datetime_utils import normalize_date_string, normalize_datetime_string
from dcicutils.misc_utils import (create_dict, create_readonly_object, is_uuid, load_json_if,
merge_objects, remove_empty_properties, right_trim, split_string,
to_boolean, to_enum, to_integer, to_number, VirtualApp)
to_boolean, to_enum, to_float, to_integer, VirtualApp)
from dcicutils.portal_object_utils import PortalObject
from dcicutils.portal_utils import Portal as PortalBase
from dcicutils.submitr.progress_constants import PROGRESS_PARSE as PROGRESS
Expand Down Expand Up @@ -727,19 +727,19 @@ def _map_function_integer(self, typeinfo: dict) -> Callable:
allow_multiplier_suffix = typeinfo.get("allow_multiplier_suffix") is True
def map_integer(value: str, src: Optional[str]) -> Any: # noqa
nonlocal allow_commas, allow_multiplier_suffix
return to_number(value, fallback=value, allow_float=False,
allow_commas=allow_commas,
allow_multiplier_suffix=allow_multiplier_suffix)
return to_integer(value, fallback=value,
allow_commas=allow_commas,
allow_multiplier_suffix=allow_multiplier_suffix)
return map_integer

def _map_function_number(self, typeinfo: dict) -> Callable:
    """
    Returns a function which maps a string value to a float via to_float. The "allow_commas"
    and "allow_multiplier_suffix" options are taken from the given typeinfo, and each is enabled
    only if its value there is exactly True. The value itself is passed to to_float as the
    fallback, i.e. so that a value which cannot be converted is handed back as it was given.
    """
    allow_commas = typeinfo.get("allow_commas") is True
    allow_multiplier_suffix = typeinfo.get("allow_multiplier_suffix") is True
    def map_number(value: str, src: Optional[str]) -> Any:  # noqa
        nonlocal allow_commas, allow_multiplier_suffix
        return to_float(value, fallback=value,
                        allow_commas=allow_commas,
                        allow_multiplier_suffix=allow_multiplier_suffix)
    return map_number

def _map_function_string(self, typeinfo: dict) -> Callable:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "8.13.3.1b25" # TODO: To become 8.14.0
version = "8.13.3.1b26" # TODO: To become 8.14.0
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
73 changes: 37 additions & 36 deletions test/test_misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn,
deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime,
MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents,
pad_to, JsonLinesReader, split_string, merge_objects, to_integer, to_number,
pad_to, JsonLinesReader, split_string, merge_objects, to_float, to_integer,
load_json_from_file_expanding_environment_variables, create_readonly_object
)
from dcicutils.qa_utils import (
Expand Down Expand Up @@ -3703,12 +3703,45 @@ def test_merge_objects_9():
def test_to_integer():
    # Plain integers, and values with an all-zero fractional part, convert to int; a value
    # with a non-zero fractional part is not an integer, so the fallback (default None) results.
    assert to_integer("17") == 17
    assert to_integer("17.0") == 17
    assert to_integer("17.1") is None
    assert to_integer("17.9", fallback="123") == "123"
    assert to_integer("0") == 0
    assert to_integer("0.0") == 0
    assert to_integer("asdf") is None
    assert to_integer("asdf", fallback="myfallback") == "myfallback"

    # Commas and multiplier suffixes are only accepted when explicitly allowed.
    assert to_integer("1234") == 1234
    assert to_integer("1,234,567") is None
    assert to_integer("27500") == 27500
    assert to_integer("27500", allow_commas=True) == 27500
    assert to_integer("1,234,567", allow_commas=True) == 1234567
    assert to_integer("1K", allow_multiplier_suffix=True) == 1000
    assert to_integer("1Kb", allow_multiplier_suffix=True) == 1000
    assert to_integer("1kB", allow_multiplier_suffix=True) == 1000
    assert to_integer("2M", allow_multiplier_suffix=True) == 2000000
    assert to_integer("2Mb", allow_multiplier_suffix=True) == 2000000
    assert to_integer("2MB", allow_multiplier_suffix=True) == 2000000
    assert to_integer("3G", allow_multiplier_suffix=True) == 3000000000
    assert to_integer("3Gb", allow_multiplier_suffix=True) == 3000000000
    assert to_integer("3GB", allow_multiplier_suffix=True) == 3000000000
    assert to_integer("4T", allow_multiplier_suffix=True) == 4000000000000
    assert to_integer("4Tb", allow_multiplier_suffix=True) == 4000000000000
    assert to_integer("4TB", allow_multiplier_suffix=True) == 4000000000000
    assert to_integer("1,234,567K", allow_commas=True) is None
    assert to_integer("1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == 1234567000
    assert to_integer("-1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == -1234567000

    # A value which is already an int is returned as-is.
    assert to_integer(4321) == 4321
    # TODO: More ...


def test_to_float():
    # The result is always a float, even when the given value looks like an integer.
    assert to_float("789") == 789.0
    assert isinstance(to_float("789"), float)
    assert to_float("1234.0567") == 1234.0567
    assert to_float("1.5K", allow_multiplier_suffix=True) == 1500
    assert isinstance(to_float("1.5K", allow_multiplier_suffix=True), float)
    # A value which is already a float is returned as-is.
    assert to_float(4321.1234) == 4321.1234


def test_load_json_from_file_expanding_environment_variables():
Expand All @@ -3727,35 +3760,3 @@ def test_create_readonly_object():
assert a.ghi == 456
assert a.jk == "xyzzy"
assert a.lmnop == {"greeting": "Hello, world!"}


def test_to_number():
    # Commas, multiplier suffixes, and floating point are only accepted when explicitly allowed.
    assert to_number("1234") == 1234
    assert to_number("1,234,567") is None
    assert to_number("27500") == 27500
    assert to_number("789", allow_float=True) == 789.0
    assert isinstance(to_number("789", allow_float=True), int)
    assert to_number("27500", allow_commas=True) == 27500
    assert to_number("1,234,567", allow_commas=True) == 1234567
    assert to_number("1234.0567", allow_float=True) == 1234.0567
    assert to_number("1K", allow_multiplier_suffix=True) == 1000
    assert to_number("1Kb", allow_multiplier_suffix=True) == 1000
    assert to_number("1kB", allow_multiplier_suffix=True) == 1000
    assert to_number("2M", allow_multiplier_suffix=True) == 2000000
    assert to_number("2Mb", allow_multiplier_suffix=True) == 2000000
    assert to_number("2MB", allow_multiplier_suffix=True) == 2000000
    assert to_number("3G", allow_multiplier_suffix=True) == 3000000000
    assert to_number("3Gb", allow_multiplier_suffix=True) == 3000000000
    assert to_number("3GB", allow_multiplier_suffix=True) == 3000000000
    assert to_number("4T", allow_multiplier_suffix=True) == 4000000000000
    assert to_number("4Tb", allow_multiplier_suffix=True) == 4000000000000
    assert to_number("4TB", allow_multiplier_suffix=True) == 4000000000000
    assert to_number("1,234,567K", allow_commas=True) is None
    assert to_number("1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == 1234567000
    assert to_number("-1,234,567K", allow_commas=True, allow_multiplier_suffix=True) == -1234567000
    # Values which are already numbers are returned as-is.
    assert to_number(4321) == 4321
    assert to_number(4321.1234) == 4321.1234
    assert to_number("1.5K", allow_multiplier_suffix=True, allow_float=True) == 1500
    assert isinstance(to_number("1.5K", allow_multiplier_suffix=True, allow_float=True), int)
    # TODO: More ...

0 comments on commit 887b0cc

Please sign in to comment.