diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e1fb7d186..1054c4c08 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,21 @@ Change Log ---------- +8.8.4 +===== +* Minor fix in structured_data to not try to resolve empty refs in norefs mode; + and added StructuredDataSet.unchecked_refs; not functionally substantive as + used (only) with smaht-submitr/submit-metadata-bundle --info --refs. +* Added nrows and nsheets to data_reader; convenience for smaht-submitr/submit-metadata-bundle --info. +* Added test_progress_bar module for progress_bar testing; would like to add more tests. +* Fixed up captured_output module to handle UTF-8 encoding to help unit testing progress_bar. +* Added hooks to progress_bar to help unit testing. +* Added a find_nth_from_end and set_nth to misc_utils to help progress_bar unit testing. +* Added format_size and format_duration misc_utils; refactor from smaht-submitr. +* Added format_datetime, parse_datetime to datetime_utils; refactor from smaht-submitr; and some tests. +* Added check_only flag to portal_utils.Portal.{post,patch}_metadata (came up in ad hoc troubleshooting). + + 8.8.3 ===== * Minor fix in structured_data related to smaht-submitr progress monitoring. diff --git a/dcicutils/captured_output.py b/dcicutils/captured_output.py index be22cd16a..1b96161d5 100644 --- a/dcicutils/captured_output.py +++ b/dcicutils/captured_output.py @@ -9,7 +9,7 @@ @contextmanager -def captured_output(capture: bool = True): +def captured_output(capture: bool = True, encoding: Optional[str] = None): """ Context manager to capture any/all output to stdout or stderr, and not actually output it to stdout or stderr. 
Yields and object with a get_captured_output() method to get the output captured thus far, @@ -24,7 +24,9 @@ def captured_output(capture: bool = True): original_stdout = _real_stdout original_stderr = _real_stderr - captured_output = io.StringIO() + # FYI: This encoding business with _EncodedStringIO was introduced (circa April 2024) + # when ran into issues unit testing progress_bar which outputs those funny block characters. + captured_output = io.StringIO() if not encoding else _EncodedStringIO(encoding) def set_original_output() -> None: sys.stdout = original_stdout @@ -68,3 +70,19 @@ def uncaptured_output(): finally: sys.stdout = original_stdout sys.stderr = original_stderr + + +class _EncodedStringIO: + def __init__(self, encoding: str = "utf-8"): + self.encoding = encoding + self.buffer = io.BytesIO() + def write(self, s): # noqa + self.buffer.write(s.encode(self.encoding)) + def flush(self): # noqa + self.buffer.flush() + def getvalue(self): # noqa + return self.buffer.getvalue().decode(self.encoding) + def __str__(self): # noqa + return self.getvalue() + def __repr__(self): # noqa + return repr(self.getvalue()) diff --git a/dcicutils/data_readers.py b/dcicutils/data_readers.py index 6fc3bfc0d..42436c2ea 100644 --- a/dcicutils/data_readers.py +++ b/dcicutils/data_readers.py @@ -66,6 +66,13 @@ def cell_value(self, value: Optional[Any]) -> str: else: return value + @property + def nrows(self) -> int: + nrows = 0 + for row in self: + nrows += 1 + return nrows + def open(self) -> None: pass @@ -192,6 +199,10 @@ def is_hidden_sheet(self, sheet: openpyxl.worksheet.worksheet.Worksheet) -> bool return True return False + @property + def nsheets(self) -> int: + return len(self.sheet_names) + def __del__(self) -> None: if (workbook := self._workbook) is not None: self._workbook = None diff --git a/dcicutils/datetime_utils.py b/dcicutils/datetime_utils.py index bd8d599d6..c28e7553f 100644 --- a/dcicutils/datetime_utils.py +++ b/dcicutils/datetime_utils.py @@ -1,6 
+1,23 @@ from dcicutils.misc_utils import normalize_spaces from datetime import datetime, timedelta, timezone -from typing import Optional, Tuple +from dateutil import parser as datetime_parser +from typing import Optional, Tuple, Union + +TIMEZONE_LOCAL = datetime.now().astimezone().tzinfo # type: datetime.timezone +TIMEZONE_LOCAL_NAME = TIMEZONE_LOCAL.tzname(None) # type: str +TIMEZONE_LOCAL_OFFSET = TIMEZONE_LOCAL.utcoffset(None) # type: datetime.timedelta +TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES = int(TIMEZONE_LOCAL_OFFSET.total_seconds()) // 60 # type: int +TIMEZONE_LOCAL_OFFSET_HOURS = TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES // 60 # type: int +TIMEZONE_LOCAL_OFFSET_MINUTES = TIMEZONE_LOCAL_OFFSET_TOTAL_MINUTES % 60 # type: int +TIMEZONE_LOCAL_SUFFIX = f"{TIMEZONE_LOCAL_OFFSET_HOURS:+03d}:{TIMEZONE_LOCAL_OFFSET_MINUTES:02d}" # type: str + +TIMEZONE_UTC = timezone.utc # type: datetime.timezone +TIMEZONE_UTC_NAME = TIMEZONE_UTC.tzname(None) # type: str +TIMEZONE_UTC_OFFSET = timedelta(0) # type: datetime.timedelta +TIMEZONE_UTC_OFFSET_TOTAL_MINUTES = 0 # type: int +TIMEZONE_UTC_OFFSET_HOURS = 0 # type: int +TIMEZONE_UTC_OFFSET_MINUTES = 0 # type: int +TIMEZONE_UTC_SUFFIX = "Z" # type: str def parse_datetime_string(value: str) -> Optional[datetime]: @@ -82,17 +99,203 @@ def normalize_date_string(value: str) -> Optional[str]: return d.strftime("%Y-%m-%d") if d else None +def get_timezone(hours_or_timedelta: Union[int, timedelta], minutes: Optional[int] = None) -> timezone: + try: + if isinstance(hours_or_timedelta, timedelta): + return timezone(hours_or_timedelta) + return timezone(timedelta(hours=hours_or_timedelta, minutes=minutes or 0)) + except Exception: + return TIMEZONE_LOCAL + + +def get_timezone_offset(tz: timezone) -> timedelta: + try: + return tz.utcoffset(None) + except Exception: + return TIMEZONE_LOCAL_OFFSET + + +def get_timezone_hours_minutes(tz: timezone) -> Tuple[int, int]: + """ + Returns a tuple with the integer hours and minutes offset for the given 
timezone. + If negative then only the hours is negative; the mintutes is always positive; + this is okay because there are no timezones less than one hour from UTC. + """ + tz_offset = get_timezone_offset(tz) + tz_offset_total_minutes = int(tz_offset.total_seconds()) // 60 + tz_offset_hours = tz_offset_total_minutes // 60 + tz_offset_minutes = abs(tz_offset_total_minutes % 60) + return tz_offset_hours, tz_offset_minutes + + +def get_utc_timezone() -> timezone: + return TIMEZONE_UTC + + +def get_local_timezone() -> timezone: + """ + Returns current/local timezone as a datetime.timezone object. + """ + return TIMEZONE_LOCAL + + def get_local_timezone_string() -> str: """ Returns current/local timezone in format like: "-05:00". """ - tz_hours, tz_minutes = get_local_timezone_hours_minutes() - return f"{tz_hours:+03d}:{tz_minutes:02d}" + return TIMEZONE_LOCAL_SUFFIX def get_local_timezone_hours_minutes() -> Tuple[int, int]: """ Returns a tuple with the integer hours and minutes offset for the current/local timezone. + If negative then only the hours is negative; the mintutes is always positive; + this is okay because there are no timezones less than one hour from UTC. """ - tz_minutes = datetime.now(timezone.utc).astimezone().utcoffset().total_seconds() / 60 - return int(tz_minutes // 60), int(abs(tz_minutes % 60)) + return TIMEZONE_LOCAL_OFFSET_HOURS, TIMEZONE_LOCAL_OFFSET_MINUTES + + +def parse_datetime(value: str, utc: bool = False, tz: Optional[timezone] = None) -> Optional[datetime]: + """ + Parses the given string into a datetime, if possible, and returns that value, + or None if not able to parse. The timezone of the returned datetime will be the + local timezone; or if the given utc argument is True then it will be UTC; or if the + given tz argument is a datetime.timezone then return datetime will be in that timezone. 
+ """ + if isinstance(value, datetime): + return value + elif not isinstance(value, str): + return None + try: + # This dateutil.parser handles quite a wide variety of formats and suits our needs. + value = datetime_parser.parse(value) + if utc is True: + # If the given utc argument is True then it trumps any tz argument if given. + tz = timezone.utc + if value.tzinfo is not None: + # The given value had an explicit timezone specified. + if isinstance(tz, timezone): + return value.astimezone(tz) + return value + return value.replace(tzinfo=tz if isinstance(tz, timezone) else get_local_timezone()) + except Exception: + return None + + +def format_datetime(value: datetime, + utc: bool = False, + tz: Optional[Union[timezone, bool]] = None, + iso: bool = False, + notz: bool = False, + noseconds: bool = False, + ms: bool = False, + verbose: bool = False, + noseparator: bool = False, + noday: bool = False, + nodate: bool = False, + notime: bool = False) -> str: + """ + Returns the given datetime as a string in "YYYY:MM:DD hh:mm:ss tz" format, for + example "2024-04-17 15:42:26 EDT". If the given notz argument is True then omits + the timezone; if the noseconds argument is given the omits the seconds. If the given + verbose argument is True then returns a really verbose version of the datetime, for + example "Wednesday, April 17, 2024 | 15:42:26 EDT"; if the noseparator argument is + True then omits the "|" separator; if the noday argument is True then omits the day + of week part. The timezone of the returned datetime string will default to the local + one; if the given utc argument is True then it will be UTC; or if the given tz + argument is a datetime.timezone it will be in that timezone. 
+ """ + if nodate is True and notime is True: + return "" + if not isinstance(value, datetime): + if not isinstance(value, str) or not (value := parse_datetime(value)): + return "" + try: + if utc is True: + tz = timezone.utc + elif not isinstance(tz, timezone): + tz = get_local_timezone() + if tz is True: + notz = False + elif tz is False: + notz = True + if noseconds is True: + ms = False + value = value.astimezone(tz) + if iso: + if notz is True: + value = value.replace(tzinfo=None) + if not (ms is True): + value = value.replace(microsecond=0) + if noseconds is True: + if notz is True: + if nodate is True: + return value.strftime(f"%H:%M") + elif notime is True: + return value.strftime(f"%Y-%m-%d") + else: + return value.strftime(f"%Y-%m-%dT%H:%M") + if len(tz := value.strftime("%z")) > 3: + tz = tz[:3] + ":" + tz[3:] + if nodate is True: + return value.strftime(f"%H:%M") + tz + elif notime is True: + return value.strftime(f"%Y-%m-%d") + tz + else: + return value.strftime(f"%Y-%m-%dT%H:%M") + tz + if nodate is True: + if (not (notz is True)) and len(tz := value.strftime("%z")) > 3: + tz = tz[:3] + ":" + tz[3:] + else: + tz = "" + return value.strftime(f"%H:%M:%S{f'.%f' if ms is True else ''}") + tz + elif notime is True: + return value.strftime(f"%Y-%m-%d") + else: + return value.isoformat() + if verbose: + if nodate is True: + return value.strftime( + f"%-I:%M{'' if noseconds is True else ':%S'}" + f"{f'.%f' if ms is True else ''} %p{'' if notz is True else ' %Z'}") + elif notime is True: + return value.strftime(f"{'' if noday is True else '%A, '}%B %-d, %Y") + else: + return value.strftime( + f"{'' if noday is True else '%A, '}%B %-d, %Y{'' if noseparator is True else ' |'}" + f" %-I:%M{'' if noseconds is True else ':%S'}" + f"{f'.%f' if ms is True else ''} %p{'' if notz is True else ' %Z'}") + else: + if nodate is True: + return value.strftime( + f"%H:%M{'' if noseconds is True else ':%S'}" + f"{f'.%f' if ms is True else ''}{'' if notz is True else ' %Z'}") + 
elif notime is True: + return value.strftime(f"%Y-%m-%d") + else: + return value.strftime( + f"%Y-%m-%d %H:%M{'' if noseconds is True else ':%S'}" + f"{f'.%f' if ms is True else ''}{'' if notz is True else ' %Z'}") + except Exception: + return None + + +def format_date(value: datetime, + utc: bool = False, + tz: Optional[Union[timezone, bool]] = None, + verbose: bool = False, + noday: bool = False) -> str: + return format_datetime(value, utc=utc, tz=tz, verbose=verbose, noday=noday, notime=True) + + +def format_time(value: datetime, + utc: bool = False, + iso: bool = False, + tz: Optional[Union[timezone, bool]] = None, + ms: bool = False, + notz: bool = False, + noseconds: bool = False, + verbose: bool = False, + noday: bool = False) -> str: + return format_datetime(value, utc=utc, tz=tz, iso=iso, ms=ms, notz=notz, + noseconds=noseconds, verbose=verbose, nodate=True) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 219b3997f..a5979fb36 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -2548,6 +2548,71 @@ def normalize_spaces(value: str) -> str: return re.sub(r"\s+", " ", value).strip() +def find_nth_from_end(string: str, substring: str, nth: int) -> int: + """ + Returns the index of the nth occurrence of the given substring within + the given string from the END of the given string; or -1 if not found. + """ + index = -1 + string = string[::-1] + for i in range(0, nth): + index = string.find(substring, index + 1) + return len(string) - index - 1 if index >= 0 else -1 + + +def set_nth(string: str, nth: int, replacement: str) -> str: + """ + Sets the nth character of the given string to the given replacement string. 
+ """ + if not isinstance(string, str) or not isinstance(nth, int) or not isinstance(replacement, str): + return string + if nth < 0: + nth += len(string) + return string[:nth] + replacement + string[nth + 1:] if 0 <= nth < len(string) else string + + +def format_size(nbytes: Union[int, float], precision: int = 2, nospace: bool = False, terse: bool = False) -> str: + if isinstance(nbytes, str) and nbytes.isdigit(): + nbytes = int(nbytes) + elif not isinstance(nbytes, (int, float)): + return "" + UNITS = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'] + UNITS_TERSE = ['b', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] + MAX_UNITS_INDEX = len(UNITS) - 1 + ONE_K = 1024 + index = 0 + if (precision := max(precision, 0)) and (nbytes <= ONE_K): + precision -= 1 + while abs(nbytes) >= ONE_K and index < MAX_UNITS_INDEX: + nbytes /= ONE_K + index += 1 + if index == 0: + nbytes = int(nbytes) + return f"{nbytes} byte{'s' if nbytes != 1 else ''}" + unit = (UNITS_TERSE if terse else UNITS)[index] + return f"{nbytes:.{precision}f}{'' if nospace else ' '}{unit}" + + +def format_duration(seconds: Union[int, float]) -> str: + seconds_actual = seconds + seconds = round(max(seconds, 0)) + durations = [("year", 31536000), ("day", 86400), ("hour", 3600), ("minute", 60), ("second", 1)] + parts = [] + for name, duration in durations: + if seconds >= duration: + count = seconds // duration + seconds %= duration + if count != 1: + name += "s" + parts.append(f"{count} {name}") + if len(parts) == 0: + return f"{seconds_actual:.1f} seconds" + elif len(parts) == 1: + return f"{seconds_actual:.1f} seconds" + else: + return " ".join(parts[:-1]) + " " + parts[-1] + + class JsonLinesReader: def __init__(self, fp, padded=False, padding=None): diff --git a/dcicutils/portal_utils.py b/dcicutils/portal_utils.py index 6a0bd3928..b6bc16684 100644 --- a/dcicutils/portal_utils.py +++ b/dcicutils/portal_utils.py @@ -282,15 +282,17 @@ def get_metadata(self, object_id: str, raw: bool = False, except 
Exception: return None - def patch_metadata(self, object_id: str, data: dict) -> Optional[dict]: + def patch_metadata(self, object_id: str, data: dict, check_only: bool = False) -> Optional[dict]: if self.key: - return patch_metadata(obj_id=object_id, patch_item=data, key=self.key) - return self.patch(f"/{object_id}", data).json() + return patch_metadata(obj_id=object_id, patch_item=data, key=self.key, + add_on="check_only=True" if check_only else "") + return self.patch(f"/{object_id}{'?check_only=True' if check_only else ''}", data).json() - def post_metadata(self, object_type: str, data: dict) -> Optional[dict]: + def post_metadata(self, object_type: str, data: dict, check_only: bool = False) -> Optional[dict]: if self.key: - return post_metadata(schema_name=object_type, post_item=data, key=self.key) - return self.post(f"/{object_type}", data).json() + return post_metadata(schema_name=object_type, post_item=data, key=self.key, + add_on="check_only=True" if check_only else "") + return self.post(f"/{object_type}{'?check_only=True' if check_only else ''}", data).json() def get_health(self) -> OptionalResponse: return self.get("/health") diff --git a/dcicutils/progress_bar.py b/dcicutils/progress_bar.py index 4ff85b9c4..ac7cfebb3 100644 --- a/dcicutils/progress_bar.py +++ b/dcicutils/progress_bar.py @@ -1,13 +1,15 @@ from collections import namedtuple +import re from signal import signal, SIGINT import sys import threading import time from tqdm import tqdm from types import FrameType as frame -from typing import Callable, Optional, Union +from typing import Callable, List, Optional, Union from contextlib import contextmanager from dcicutils.command_utils import yes_or_no +from dcicutils.misc_utils import find_nth_from_end, format_size, set_nth class TQDM(tqdm): @@ -47,6 +49,8 @@ def define(*args, **kwargs): def __init__(self, total: Optional[int] = None, description: Optional[str] = None, + use_byte_size_for_rate: bool = False, + use_ascii: bool = False, 
catch_interrupt: bool = True, interrupt: Optional[Callable] = None, interrupt_continue: Optional[Callable] = None, @@ -54,15 +58,16 @@ def __init__(self, total: Optional[int] = None, interrupt_exit: bool = False, interrupt_exit_message: Optional[Union[Callable, str]] = None, interrupt_message: Optional[str] = None, - printf: Optional[Callable] = None, - tidy_output_hack: bool = True) -> None: + tidy_output_hack: bool = True, + capture_output_for_testing: bool = False) -> None: self._bar = None + self._started = 0 self._disabled = False self._done = False - self._printf = printf if callable(printf) else print self._tidy_output_hack = (tidy_output_hack is True) - self._started = time.time() self._stop_requested = False + self._use_byte_size_for_rate = (use_byte_size_for_rate is True and self._tidy_output_hack) + self._use_ascii = (use_ascii is True) # Interrupt handling. We do not do the actual (signal) interrupt setup # in self._initialize as that could be called from a (sub) thread; and in # Python we can only set a signal (SIGINT in our case) on the main thread. @@ -90,24 +95,26 @@ def __init__(self, total: Optional[int] = None, self._tidy_output_hack = self._define_tidy_output_hack() self._total = total if isinstance(total, int) and total >= 0 else 0 self._description = self._format_description(description) - # self._initialize() + self._captured_output_for_testing = [] if capture_output_for_testing else None def _initialize(self) -> bool: # Do not actually create the tqdm object unless/until we have a positive total. 
if (self._bar is None) and (self._total > 0): - bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} | {rate_fmt} | {elapsed}{postfix} | ETA: {remaining} " + if self._use_byte_size_for_rate: + bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} | [rate] | {elapsed}{postfix} | ETA: {remaining} " + else: + bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} | {rate_fmt} | {elapsed}{postfix} | ETA: {remaining} " self._bar = TQDM(total=self._total, desc=self._description, - dynamic_ncols=True, bar_format=bar_format, unit="", file=sys.stdout) + dynamic_ncols=True, bar_format=bar_format, unit="", file=sys.stdout, ascii=self._use_ascii) + self._started = time.time() if self._disabled: self._bar.disable = True return True return False - def set_total(self, value: int, reset_eta: bool = False) -> None: + def set_total(self, value: int, _norefresh: bool = False) -> None: if value == self._total: # If the total has not changed since last set then do nothing. - if reset_eta and self._bar is not None: - self._bar.reset() return if isinstance(value, int) and value > 0: self._total = value @@ -116,41 +123,55 @@ def set_total(self, value: int, reset_eta: bool = False) -> None: # the total during the course of a single ProgressBar instance. self._bar.reset() self._bar.total = value - self._bar.refresh() + if not _norefresh: + self._bar.refresh() - def reset_eta(self) -> None: - # Since set_total does nothing if total is the same, provide - # a way to reset the ETA if starting over with the same total. 
- if self._bar is not None: - progress = self._bar.n - self._bar.reset() - self._bar.total = self._total - self._bar.n = progress - self._bar.refresh() - - def set_progress(self, value: int) -> None: + def set_progress(self, value: int, _norefresh: bool = False) -> None: if isinstance(value, int) and value >= 0: if (self._bar is not None) or self._initialize(): self._bar.n = value - self._bar.refresh() + if not _norefresh: + self._bar.refresh() - def increment_progress(self, value: int) -> None: + def increment_progress(self, value: int = 1) -> None: if isinstance(value, int) and value > 0: if (self._bar is not None) or self._initialize(): self._bar.update(value) self._bar.refresh() def set_description(self, value: str) -> None: - self._description = self._format_description(value) + if isinstance(value, str): + self._description = self._format_description(value) + if self._bar is not None: + # FYI: tqdm.set_description seems to imply a refresh. + self._bar.set_description(self._description) + + def reset_eta(self) -> None: + # Since set_total does nothing if total is the same, provide + # a way to reset the ETA if starting over with the same total. + # But NOTE that resetting ETA will ALSO reset the ELAPSED time. 
if self._bar is not None: - self._bar.set_description(self._description) + progress = self._bar.n + self._bar.reset() + self._bar.total = self._total + self._bar.n = progress + self._bar.refresh() + + def reset(self, total: int, progress: int = 0, description: Optional[str] = None) -> None: + self.set_total(total, _norefresh=True) + self.set_progress(progress, _norefresh=True) + self.set_description(description) + self.enable() + self._done = False + self._bar.reset() + self._started = time.time() - def done(self) -> None: + def done(self, description: Optional[str] = None) -> None: if self._done or self._bar is None: return self._ended = time.time() self.set_progress(self.total) - self._bar.set_description(self._description) + self.set_description(description) self._bar.refresh() # FYI: Do NOT do a bar.disable = True before a bar.close() or it messes up output # on multiple calls; found out the hard way; a couple hours will never get back :-/ @@ -190,12 +211,14 @@ def stop_requested(self) -> bool: return self._stop_requested @property - def started(self) -> None: - return self._started + def captured_output_for_testing(self) -> Optional[List[str]]: + return self._captured_output_for_testing - @property - def duration(self) -> None: - return time.time() - self._started + @staticmethod + def format_captured_output_for_testing(description: str, total: int, progress: int) -> str: + percent = round((progress / total) * 100.0) + separator = "✓" if percent == 100 else "|" + return f"{description} {separator} {percent:>3}% ◀|### | {progress}/{total} | 0.0/s | 00:00 | ETA: 00:00" def _format_description(self, value: str) -> str: if not isinstance(value, str): @@ -208,8 +231,7 @@ def _define_interrupt_handler(self) -> None: def handle_interrupt(signum: int, frame: frame) -> None: # noqa nonlocal self def handle_secondary_interrupt(signum: int, frame: frame) -> None: # noqa - nonlocal self - self._printf("\nEnter 'yes' or 'no' or CTRL-\\ to completely abort ...") + 
print("\nEnter 'yes' or 'no' or CTRL-\\ to completely abort ...") self.disable() self._interrupt(self) if self._interrupt else None set_interrupt_handler(handle_secondary_interrupt) @@ -226,7 +248,7 @@ def handle_secondary_interrupt(signum: int, frame: frame) -> None: # noqa restore_interrupt_handler() if self._interrupt_exit_message: if isinstance(interrupt_exit_message := self._interrupt_exit_message(self), str): - self._printf(interrupt_exit_message) + print(interrupt_exit_message) exit(1) elif interrupt_stop is False or ((interrupt_stop is None) and (self._interrupt_exit is False)): set_interrupt_handler(handle_interrupt) @@ -257,30 +279,92 @@ def _define_tidy_output_hack(self) -> None: # string in the display string where the progress bar should actually go, # which we do in _format_description. Other minor things too; see below. sys_stdout_write = sys.stdout.write + last_text = None ; last_captured_output_text = None ; last_spin_change_time = None # noqa def tidy_stdout_write(text: str) -> None: # noqa nonlocal self, sys_stdout_write, sentinel_internal, spina, spini, spinn + nonlocal last_text, last_captured_output_text, last_spin_change_time def replace_first(value: str, match: str, replacement: str) -> str: # noqa return value[:i] + replacement + value[i + len(match):] if (i := value.find(match)) >= 0 else value + def remove_extra_trailing_spaces(text: str) -> str: # noqa + while text.endswith(" "): + text = text[:-1] + return text + if (not text) or (last_text == text): + return + last_text = text + now = time.time() if (self._disabled or self._done) and sentinel_internal in text: # Another hack to really disable output on interrupt; in this case we set # tqdm.disable to True, but output can still dribble out, so if the output # looks like it is from tqdm and we are disabled/done then do no output. 
return if sentinel_internal in text: - spinc = spina[spini % spinn] if not ("100%|" in text) else "| ✓" ; spini += 1 # noqa + spinc = spina[spini % spinn] if not ("100%|" in text) else "✓" + if last_spin_change_time is None or ((now - last_spin_change_time) >= 0.06): + spini += 1 + last_spin_change_time = now text = replace_first(text, sentinel_internal, f" {spinc}") text = replace_first(text, "%|", "% ◀|") # Another oddity: for the rate sometimes tqdm intermittently prints # something like "1.54s/" rather than "1.54/s"; something to do with # the unit we gave, which is empty; idunno; just replace it here. text = replace_first(text, "s/ ", "/s ") + if self._use_byte_size_for_rate and self._bar: + rate = self._bar.n / (now - self._started) + text = text.replace("[rate]", f"{format_size(rate)}/s") sys_stdout_write(text) sys.stdout.flush() + if self._captured_output_for_testing is not None: + # For testing only we replace vacilliting values in the out like rate, + # time elapsed, and ETA with static values; so that something like this: + # > Working ⣾ 20% ◀|█████████▌ | 1/5 | 536.00/s | 00:01 | ETA: 00:02 ⣾ + # becomes something more static like this after calling this function: + # > Working | 20% ◀|### | 1/5 | 0.0/s | 00:00 | ETA: 00:00 + # This function obviously has intimate knowledge of the output; better here than in tests. + def replace_time_dependent_values_with_static(text: str) -> str: + blocks = "\u2587|\u2588|\u2589|\u258a|\u258b|\u258c|\u258d|\u258e|\u258f" + if (n := find_nth_from_end(text, "|", 5)) >= 8: + pattern = re.compile( + rf"(\s*)(\d*%? 
◀\|)(?:\s*{blocks}|#)*\s*(\|\s*\d+/\d+)?(\s*\|\s*)" + rf"(?:\d+\.?\d*|\?)(\/s\s*\|\s*)(?:\d+:\d+)?(\s*\|\s*ETA:\s*)(?:\d+:\d+|\?)?") + if match := pattern.match(text[n - 6:]): + if text[n - 8:n - 7] != "✓": text = set_nth(text, n - 8, "|") # noqa + return (text[0:n - 6].replace("\r", "") + + match.expand(rf"\g<1>\g<2>### \g<3>\g<4>0.0\g<5>00:00\g<6>00:00")) + return text + if text != "\n": + captured_output_text = replace_time_dependent_values_with_static(text) + if captured_output_text != last_captured_output_text: + self._captured_output_for_testing.append(captured_output_text) + last_captured_output_text = captured_output_text def restore_stdout_write() -> None: # noqa nonlocal sys_stdout_write if sys_stdout_write is not None: sys.stdout.write = sys_stdout_write + def ascii_spinners() -> list: # noqa + # Fun with ASCII spinner characters. + # Dots borrowed from "rich" python package (others: ⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏). + # Others: "◴◷◶◵" "◰◳◲◱" "◡⊙◠" "⠁⠂⠄⡀⢀⠠⠐⠈" "▁▃▄▅▆▇█▇▆▅▄▃" "◢◣◤◥" "◐◓◑◒" "✶✸✹✺✹✷" "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏" + spinner_chars_a = list("⣾⣽⣻⢿⡿⣟⣯⣷"[::-1]) * 8 + spinner_chars_b = list("⠋⠙⠹⠸⢰⣰⣠⣄⣆⡆⡖⠖⠚⠙⠋⠏⠇⡆⣆⣄⣠⣰⢰⢲⠲") + spinner_chars_c = list("⣀⣤⣶⣿⣶⣤") + spinner_chars_d = list("⡀⡄⡆⠇⠋⠙⠸⢰⢠⢀⢠⢰⠸⠙⠋⠇⡆⡄") + spinner_chars_e = list("⠀⡀⠄⠂⠁⠈⠐⠠⢀⣀⢄⢂⢁⢈⢐⢠⣠⢤⢢⢡⢨⢰⣰⢴⢲⢱⢸⣸⢼⢺⢹⣹⢽⢻⣻⢿⣿⣶⣤⣀") + spinner_chars_f = list("⠉⠒⠤⣀⠤⠒") + spinner_chars_g = list("⠋⠙⠹⠸⢰⣰⣠⣄⣆⡆⠇⠏") + spinner_chars_h = list("⠁⠉⠙⠚⠒⠂⠂⠒⠲⠴⠤⠄⠄⠤⢤⣠⣀⢀⢀⣀⣠⢤⠤⠄⠄⠤⠴⠲⠒⠂⠂⠒⠚⠙⠉⠁") + spinner_chars_i = list("◐◓◑◒") + spinner_chars_j = list("|/—*—\\") + return (spinner_chars_a + (spinner_chars_b * 2) + + spinner_chars_a + (spinner_chars_c * 4) + + spinner_chars_a + (spinner_chars_d * 2) + + spinner_chars_a + (spinner_chars_e * 2) + + spinner_chars_a + (spinner_chars_f * 4) + + spinner_chars_a + (spinner_chars_g * 5) + + spinner_chars_a + (spinner_chars_h * 2) + + spinner_chars_a + (spinner_chars_i * 5) + + spinner_chars_a + (spinner_chars_j * 4)) sys.stdout.write = tidy_stdout_write - spina = ["|", "/", "—", "◦", "\\"] ; spini = 0 ; spinn = len(spina) # noqa + 
spina = ascii_spinners() ; spini = 0 ; spinn = len(spina) # noqa sentinel = "[progress]" ; sentinel_internal = f"{sentinel}:" # noqa return namedtuple("tidy_output_hack", ["restore", "sentinel"])(restore_stdout_write, sentinel) diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py index 977703295..36ee036f8 100644 --- a/dcicutils/structured_data.py +++ b/dcicutils/structured_data.py @@ -154,7 +154,44 @@ def resolved_refs(self) -> List[str]: @property def resolved_refs_with_uuids(self) -> List[str]: - return list([{"path": resolved_ref[0], "uuid": resolved_ref[1]} for resolved_ref in self._resolved_refs]) + return list([{"path": resolved_ref[0], + "uuid": resolved_ref[1] if len(resolved_ref) >= 2 else None} + for resolved_ref in self._resolved_refs]) + + @property + def unchecked_refs(self) -> List[str]: + """ + Returns list of unchecked (for existence) references, grouped by reference path; + each object in the list has a path property and a srcs property which is a list of + src objects containing the type, column and row of the reference to the reference. + Note that this is only populated if the norefs option is specified. + """ + def load_json(value: str) -> Optional[dict]: + try: + return json.loads(value) + except Exception: + return None + result = [] + if self._norefs: + for ref in self._resolved_refs: + # The structure of this self._resolved_refs is setup in Schema._map_function_ref, + # which is called whenever a reference (linkTo) is encountered. It is a set of + # tuples containing three items: [0] the ref path, [1] its uuid (if applicable), + # and [2] its src. The src identifies the place where this ref occurred and is a + # dictionary containing file, type, column, and row properties. For this case, of + # norefs (i.e. unchecked refs), the uuid ([1]) is None because we are skipping + # ref resolution. 
But the src is actually a *string* dump of the dictionary, only + # because dictionaries cannot be put in a set (which is what _resolved_refs is); + # this dump is also done in Schema._map_function_ref (should probably change this + # to be a list to avoid this - TODO); we only even store this src info for this + # norefs case, as not really needed otherwise. This is just to support the + # useful-for-troublehsooting options --info --refs for smaht-submitr. + if len(ref) >= 3 and (ref_path := ref[0]) and (ref_src := load_json(ref[2])): + if existing_ref := [item for item in result if item.get("path") == ref_path]: + existing_ref[0]["srcs"].append(ref_src) + else: + result.append({"path": ref_path, "srcs": [ref_src]}) + return result @property def upload_files(self) -> List[str]: @@ -653,7 +690,16 @@ def _map_function_ref(self, typeinfo: dict) -> Callable: def map_ref(value: str, link_to: str, portal: Optional[Portal], src: Optional[str]) -> Any: nonlocal self, typeinfo if self._norefs: - self._resolved_refs.add((f"/{link_to}/{value}", None)) + # Here the caller has specified the (StructuredDataSet) norefs option + # which means we do not check for the existence of references at all. + if value: + # Dump the src as a JSON string because a dictionary cannot be added to a set; + # this is ONLY used for smaht-submitr/submit-metadata-bundle --info --refs. + # This info exposed via StructureDataSet.unchecked_refs. TODO: Should probably + # make this not a set type so we dont' have to do this dump (and corresponding + # load, in StructureDataSet.unchecked_refs). 
+ self._resolved_refs.add((f"/{link_to}/{value}", None, + json.dumps(src) if isinstance(src, dict) else None)) return value if not value: if (column := typeinfo.get("column")) and column in self.data.get("required", []): diff --git a/pyproject.toml b/pyproject.toml index f59cd3018..e34d933d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.8.3" +version = "8.8.4" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_datetime_utils.py b/test/test_datetime_utils.py index d969beb60..116f39d61 100644 --- a/test/test_datetime_utils.py +++ b/test/test_datetime_utils.py @@ -1,4 +1,12 @@ -from dcicutils.datetime_utils import get_local_timezone_string, normalize_date_string, normalize_datetime_string +from datetime import datetime, timezone, timedelta +from typing import Optional +from dcicutils.datetime_utils import ( + get_local_timezone_string, normalize_date_string, normalize_datetime_string, +) +from dcicutils.datetime_utils import ( + format_date, format_datetime, format_time, get_local_timezone, get_timezone, + get_timezone_hours_minutes, get_utc_timezone, parse_datetime +) def test_normalize_datetime_string(): @@ -28,3 +36,167 @@ def test_normalize_date_string(): value = " 2024-01-28 17:15:32 + 03:34" assert normalize_date_string(value) == "2024-01-28" + + +TZLOCAL = None +TZLOCAL_OFFSET_HOURS = None +TZLOCAL_OFFSET_MINUTES = None +TZLOCAL_SUFFIX = None +TZLOCAL_NAME = None +TZUTC = None +TZUTC_SUFFIX = None + + +def _setup_global_timezone_constants(tzlocal: Optional[timezone] = None) -> None: + + global TZLOCAL, TZLOCAL_OFFSET_HOURS, TZLOCAL_OFFSET_MINUTES, TZLOCAL_SUFFIX, TZLOCAL_NAME, TZUTC, TZUTC_SUFFIX + + TZLOCAL = tzlocal if isinstance(tzlocal, timezone) else get_local_timezone() + + if TZLOCAL != get_local_timezone(): + import dcicutils.datetime_utils + 
dcicutils.datetime_utils.get_local_timezone = lambda: TZLOCAL + + TZLOCAL_OFFSET_HOURS, TZLOCAL_OFFSET_MINUTES = get_timezone_hours_minutes(TZLOCAL) + TZLOCAL_SUFFIX = (f"{'-' if TZLOCAL_OFFSET_HOURS < 0 else '+'}" + f"{abs(TZLOCAL_OFFSET_HOURS):02}:{TZLOCAL_OFFSET_MINUTES:02}") + TZLOCAL_NAME = TZLOCAL.tzname(None) + + TZUTC = get_utc_timezone() + TZUTC_SUFFIX = f"+00:00" + TZUTC_SUFFIX = "Z" + + +def _assert_datetime_equals(value: datetime, year: int, month: int, day: int, + hour: int, minute: int, second: int, microsecond: Optional[int], + tz: timezone = TZLOCAL, + shift_hours: Optional[int] = None, + shift_minutes: Optional[int] = None): + + if not isinstance(tz, timezone): + tz = TZLOCAL + + expected_value = datetime(year=year, month=month, day=day, hour=hour, + minute=minute, second=second, microsecond=microsecond or 0, tzinfo=tz) + if isinstance(shift_hours, int): + expected_value = expected_value + timedelta(hours=shift_hours) + if isinstance(shift_minutes, int): + expected_value = expected_value + timedelta(minutes=shift_minutes) + assert value == expected_value + + +def _test_parse_datetime_a(ms: Optional[int] = None): + + ms_suffix = f".{ms}" if isinstance(ms, int) else "" + ms = ms if isinstance(ms, int) else None + + # -------------------------------------------------------------------------------------------------- + value = f"2024-04-17T15:04:16{ms_suffix}" + parsed = parse_datetime(value) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZLOCAL) + assert format_datetime(parsed, notz=True) == f"2024-04-17 15:04:16" + assert format_datetime(parsed) == f"2024-04-17 15:04:16 {TZLOCAL_NAME}" + assert format_datetime(parsed, ms=ms is not None) == f"2024-04-17 15:04:16{ms_suffix} {TZLOCAL_NAME}" + assert format_datetime(parsed, noseconds=True) == f"2024-04-17 15:04 {TZLOCAL_NAME}" + assert format_datetime(parsed, iso=True) == f"2024-04-17T15:04:16{TZLOCAL_SUFFIX}" + assert format_datetime(parsed, iso=True, notz=True) == f"2024-04-17T15:04:16" +
assert format_datetime(parsed, iso=True, ms=ms is not None) == f"2024-04-17T15:04:16{ms_suffix}{TZLOCAL_SUFFIX}" + assert (format_datetime(parsed, verbose=True, ms=ms is not None) == + f"Wednesday, April 17, 2024 | 3:04:16{ms_suffix} PM {TZLOCAL_NAME}") + assert (format_datetime(parsed, ms=ms is not None, verbose=True, noseparator=True) == + f"Wednesday, April 17, 2024 3:04:16{ms_suffix} PM {TZLOCAL_NAME}") + assert (format_datetime(parsed, ms=ms is not None, verbose=True, noseparator=True, noday=True) == + f"April 17, 2024 3:04:16{ms_suffix} PM {TZLOCAL_NAME}") + assert format_datetime(parsed, nodate=True, notz=True) == f"15:04:16" + assert format_datetime(parsed, nodate=True) == f"15:04:16 {TZLOCAL_NAME}" + assert format_datetime(parsed, notime=True) == f"2024-04-17" + assert format_time(parsed, notz=True) == f"15:04:16" + assert format_time(parsed) == f"15:04:16 {TZLOCAL_NAME}" + assert format_date(parsed) == f"2024-04-17" + assert format_date(parsed, verbose=True) == f"Wednesday, April 17, 2024" + assert format_date(parsed, verbose=True, noday=True) == f"April 17, 2024" + assert format_date(parsed) == f"2024-04-17" + assert format_date(parsed) == f"2024-04-17" + assert format_time(parsed, iso=True) == f"15:04:16{TZLOCAL_SUFFIX}" + assert format_time(parsed, iso=True, ms=ms is not None) == f"15:04:16{ms_suffix}{TZLOCAL_SUFFIX}" + assert format_time(parsed, iso=True, notz=True, ms=ms is not None) == f"15:04:16{ms_suffix}" + assert format_time(parsed, iso=True, notz=True) == f"15:04:16" + + value = f"2024-04-17T15:04:16{ms_suffix}" + parsed = parse_datetime(value, utc=True) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZUTC) + + value = f"2024-04-17T15:04:16{ms_suffix}" + parsed = parse_datetime(value, tz=TZUTC) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZUTC) + + value = f"2024-04-17T15:04:16{ms_suffix}" + parsed = parse_datetime(value, tz=TZLOCAL) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZLOCAL) + + # 
-------------------------------------------------------------------------------------------------- + value = f"2024-04-17T15:04:16{ms_suffix}{TZUTC_SUFFIX}" + parsed = parse_datetime(value) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZUTC) + + value = f"2024-04-17T15:04:16{ms_suffix}{TZUTC_SUFFIX}" + parsed = parse_datetime(value, utc=True) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZUTC) + + value = f"2024-04-17T15:04:16{ms_suffix}{TZUTC_SUFFIX}" + parsed = parse_datetime(value, tz=TZUTC) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZUTC) + + value = f"2024-04-17T15:04:16{ms_suffix}{TZUTC_SUFFIX}" + parsed = parse_datetime(value, tz=TZLOCAL) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZLOCAL, + shift_hours=TZLOCAL_OFFSET_HOURS, shift_minutes=TZLOCAL_OFFSET_MINUTES) + + # -------------------------------------------------------------------------------------------------- + value = f"2024-04-17T15:04:16{ms_suffix}{TZLOCAL_SUFFIX}" + parsed = parse_datetime(value) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZLOCAL) + + value = f"2024-04-17T15:04:16{ms_suffix}{TZLOCAL_SUFFIX}" + parsed = parse_datetime(value, utc=True) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZUTC, + shift_hours=-TZLOCAL_OFFSET_HOURS, shift_minutes=TZLOCAL_OFFSET_MINUTES) + + value = f"2024-04-17T15:04:16{ms_suffix}{TZLOCAL_SUFFIX}" + parsed = parse_datetime(value, tz=TZUTC) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZUTC, + shift_hours=-TZLOCAL_OFFSET_HOURS, shift_minutes=TZLOCAL_OFFSET_MINUTES) + + value = f"2024-04-17T15:04:16{ms_suffix}{TZLOCAL_SUFFIX}" + parsed = parse_datetime(value, tz=TZLOCAL) + _assert_datetime_equals(parsed, 2024, 4, 17, 15, 4, 16, ms, TZLOCAL) + + +def test_parse_datetime_a(ms: Optional[int] = None): + _setup_global_timezone_constants() + _test_parse_datetime_a() + _test_parse_datetime_a(ms=434698) + + 
_setup_global_timezone_constants(tzlocal=get_timezone(4)) + _test_parse_datetime_a() + _test_parse_datetime_a(ms=434698) + + _setup_global_timezone_constants(tzlocal=get_timezone(5)) + _test_parse_datetime_a() + _test_parse_datetime_a(ms=434698) + + _setup_global_timezone_constants(tzlocal=get_timezone(-4)) + _test_parse_datetime_a() + _test_parse_datetime_a(ms=434698) + + _setup_global_timezone_constants(tzlocal=get_timezone(7)) + _test_parse_datetime_a() + _test_parse_datetime_a(ms=434698) + + _setup_global_timezone_constants(tzlocal=get_timezone(9)) + _test_parse_datetime_a() + + for zone in range(-24, 24 + 1): + _setup_global_timezone_constants(tzlocal=get_timezone(zone)) + _test_parse_datetime_a() + _test_parse_datetime_a(ms=434698) diff --git a/test/test_progress_bar.py b/test/test_progress_bar.py new file mode 100644 index 000000000..d28067695 --- /dev/null +++ b/test/test_progress_bar.py @@ -0,0 +1,62 @@ +import time +from dcicutils.progress_bar import ProgressBar + +sleep_seconds = 0 +sleep = lambda: time.sleep(sleep_seconds) if sleep_seconds > 0 else None # noqa + + +def test_progress_bar_a(): + global sleep + total = 50 + description = "Working" + bar = ProgressBar(total=total, description=description, capture_output_for_testing=True) + + for i in range(total): + bar.increment_progress(1) ; sleep() # noqa + bar.done("Done") + + bar_output = bar.captured_output_for_testing + # Bar output count is total plus-one for 0/total and another plus-one for "Done" after total/total (100%).
+ assert len(bar_output) == total + 2 + + i = 0 + for line in bar_output: + if i <= total: + expected_line = bar.format_captured_output_for_testing(description, total, i) + elif i == total + 1: + expected_line = bar.format_captured_output_for_testing("Done", total, total) + assert line == expected_line + i += 1 + + +def test_progress_bar_b(): + + def run_single_task(bar: ProgressBar, total: int, task_number: int) -> None: + global sleep + bar.reset(total=total, progress=0, description=f"Task-{task_number}") + for i in range(total): + bar.increment_progress(1) ; sleep() # noqa + + ntasks = 10 + total = 50 + description = "Working" + bar = ProgressBar(total=total, description=description, capture_output_for_testing=True) + + for i in range(ntasks): + run_single_task(bar, total, i + 1) + bar.done("Done") + + # i = 0 + # for line in bar.captured_output_for_testing: + # print(f"{i}: {line}") + # i += 1 + + bar_output = bar.captured_output_for_testing + assert len(bar_output) == 1 + (ntasks * (total + 1)) + 1 + assert bar_output[0] == bar.format_captured_output_for_testing("Working", total, 0) + assert bar_output[len(bar_output) - 1] == bar.format_captured_output_for_testing("Done", total, total) + + bar_output = bar_output[1:] + for n in range(ntasks): + for i in range(total + 1): + assert bar_output[n * (total + 1) + i] == bar.format_captured_output_for_testing(f"Task-{n + 1}", total, i)