From 660df9c0000ab4d7b8a41da1a8289052dc1e6525 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 25 Aug 2023 11:42:56 -0400 Subject: [PATCH] Small cosmetic changes and some additional support for upcoming work. --- CHANGELOG.rst | 8 ++- dcicutils/misc_utils.py | 41 ++++++++++++++ dcicutils/sheet_utils.py | 119 +++++++++++++++++++++++++++------------ pyproject.toml | 2 +- test/test_misc_utils.py | 97 ++++++++++++++++++++++++++++--- test/test_sheet_utils.py | 26 ++++----- 6 files changed, 233 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b88893ac8..79f60120f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -19,7 +19,7 @@ Change Log * Function ``load_items`` that does the same as ``ItemManager.load``. - * Various low-level implementation classes such as: + * Various lower-level implementation classes such as: * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. @@ -27,7 +27,11 @@ Change Log * Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. -* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). 
+* New functionality in ``misc_utils``: + + * New function ``is_uuid`` (migrated from Fourfront) + * New function ``pad_to`` + * New class ``JsonLinesReader`` 7.9.0 diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index aeca4a326..de188b872 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -9,6 +9,7 @@ import inspect import math import io +import json import os import logging import pytz @@ -2329,3 +2330,43 @@ def parse_in_radix(text: str, *, radix: int): except Exception: pass raise ValueError(f"Unable to parse: {text!r}") + + +def pad_to(target_size: int, data: list, *, padding=None): + actual_size = len(data) + if actual_size < target_size: + data = data + [padding] * (target_size - actual_size) + return data + + +class JsonLinesReader: + + def __init__(self, fp, padded=False, padding=None): + self.fp = fp + self.padded: bool = padded + self.padding = padding + self.headers = None # Might change after we see first line + + def __iter__(self): + first_line = True + n_headers = 0 + for raw_line in self.fp: + line = json.loads(raw_line) + if first_line: + first_line = False + if isinstance(line, list): + self.headers = line + n_headers = len(line) + continue + # If length of line is more than we expect, ignore it. 
Let user put comments beyond our table + # But if length of line is less than we expect, extend the line with None + if self.headers: + if not isinstance(line, list): + raise Exception("If the first line is a list, all lines must be.") + if self.padded and len(line) < n_headers: + line = pad_to(n_headers, line, padding=self.padding) + yield dict(zip(self.headers, line)) + elif isinstance(line, dict): + yield line + else: + raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}") diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 9a83bf6f9..df3e16e43 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -3,18 +3,19 @@ import csv import io import openpyxl +import os import uuid from dcicutils.common import AnyJsonData from dcicutils.env_utils import public_env_name, EnvUtils from dcicutils.ff_utils import get_schema from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize -from dcicutils.misc_utils import ignored, PRINT +from dcicutils.misc_utils import ignored, PRINT, pad_to from dcicutils.task_utils import pmap from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Type, Union Header = str @@ -334,7 +335,7 @@ def __init__(self, **kwargs): # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) @classmethod - def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: + def load(cls, filename: str, **kwargs) -> Dict[str, List[AnyJsonData]]: """ Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. For more information, see documentation of AbstractTableSetManager. 
@@ -354,6 +355,8 @@ class BasicTableSetManager(AbstractTableSetManager): of this where there's only one set of headers and only one block of content. """ + ALLOWED_FILE_EXTENSIONS: List[str] = [] + def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename @@ -387,17 +390,26 @@ def load_content(self) -> Any: class TableSetManager(BasicTableSetManager): - - ALLOWED_FILE_EXTENSIONS = None + """ + This is the base class for all things that read tablesets. Those may be: + * Excel workbook readers (.xlsx) + * Comma-separated file readers (.csv) + * Tab-separated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright + refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt) + Unimplemented formats that could easily be made to do the same thing: + * JSON files + * JSON lines files + * YAML files + """ @classmethod - def load(cls, filename: str) -> AnyJsonData: + def load(cls, filename: str, **kwargs) -> AnyJsonData: if cls.ALLOWED_FILE_EXTENSIONS: if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS): raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") - table_set_manager: TableSetManager = cls(filename) + table_set_manager: TableSetManager = cls(filename, **kwargs) return table_set_manager.load_content() def __init__(self, filename: str, **kwargs): @@ -432,6 +444,33 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: return prefer_number(value) +class TableSetManagerRegistry: + + ALL_TABLE_SET_MANAGERS: Dict[str, Type[TableSetManager]] = {} + + @classmethod + def register(cls, class_to_register: Type[TableSetManager]): + for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: + existing = cls.ALL_TABLE_SET_MANAGERS.get(ext) + if existing: + raise Exception(f"Tried to define {class_to_register} to extension 
{ext}," + f" but {existing} already claimed that.") + cls.ALL_TABLE_SET_MANAGERS[ext] = class_to_register + return class_to_register + + @classmethod + def manager_for_filename(cls, filename: str) -> Type[TableSetManager]: + base = os.path.basename(filename) + dotparts = base.split('.') + while dotparts: + suffix = f".{'.'.join(dotparts)}" + found = cls.ALL_TABLE_SET_MANAGERS.get(suffix) + if found: + return found + dotparts = dotparts[1:] + raise LoadArgumentsError(f"Unknown file type: {filename}") + + class XlsxManager(TableSetManager): """ This implements the mechanism to get a series of rows out of the sheets in an XLSX file. @@ -484,7 +523,7 @@ class SchemaAutoloadMixin(AbstractTableSetManager): SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. CACHE_SCHEMAS = True # Controls whether we're doing caching at all - AUTOLOAD_SCHEMAS_DEFAULT = False + AUTOLOAD_SCHEMAS_DEFAULT = True def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, **kwargs): if portal_env is None: @@ -592,6 +631,7 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: return ItemTools.parse_item_value(value, context=self._instaguid_context_table) +@TableSetManagerRegistry.register class XlsxItemManager(ItemManagerMixin, XlsxManager): """ This layers item-style row processing functionality on an XLSX file. @@ -599,29 +639,35 @@ class XlsxItemManager(ItemManagerMixin, XlsxManager): pass -class CsvManager(TableSetManager): - """ - This implements the mechanism to get a series of rows out of the sheet in a csv file, - returning a result that still looks like there could have been multiple tabs. 
- """ - - ALLOWED_FILE_EXTENSIONS = ['.csv'] +class SingleTableMixin(AbstractTableSetManager): DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): - super().__init__(filename=filename, **kwargs) + def __init__(self, tab_name: Optional[str] = None, **kwargs): + super().__init__(**kwargs) self.tab_name = tab_name or self.DEFAULT_TAB_NAME @property def tabnames(self) -> List[str]: return [self.tab_name] + +class CsvManager(SingleTableMixin, TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheet in a csv file, + returning a result that still looks like there could have been multiple tabs. + """ + + ALLOWED_FILE_EXTENSIONS = ['.csv'] + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + def _get_reader_agent(self) -> CsvReader: - return self._get_csv_reader(self.filename) + return self._get_reader_agent_for_filename(self.filename) @classmethod - def _get_csv_reader(cls, filename) -> CsvReader: + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) PAD_TRAILING_TABS = True @@ -630,9 +676,8 @@ def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: headers = self.tab_headers(tabname) n_headers = len(headers) for row_data in self.reader_agent: - n_cols = len(row_data) - if self.PAD_TRAILING_TABS and n_cols < n_headers: - row_data = row_data + [''] * (n_headers - n_cols) + if self.PAD_TRAILING_TABS: + row_data = pad_to(n_headers, row_data, padding='') yield row_data def _create_tab_processor_state(self, tabname: str) -> Headers: @@ -647,6 +692,7 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An for i, row_datum in enumerate(row_data)} +@TableSetManagerRegistry.register class CsvItemManager(ItemManagerMixin, CsvManager): """ This layers item-style row processing functionality on a CSV file. 
@@ -666,7 +712,7 @@ def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs): self.escaping: bool = escaping or False @classmethod - def _get_csv_reader(cls, filename) -> CsvReader: + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: @@ -699,6 +745,7 @@ def expand_escape_sequences(cls, text: str) -> str: return s.getvalue() +@TableSetManagerRegistry.register class TsvItemManager(ItemManagerMixin, TsvManager): """ This layers item-style row processing functionality on a TSV file. @@ -714,24 +761,22 @@ class ItemManager(AbstractTableSetManager): @classmethod def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: - if filename.endswith(".xlsx"): - reader_agent = XlsxItemManager(filename, **kwargs) - elif filename.endswith(".csv"): - tab_name = kwargs.pop('tab_name', None) - reader_agent = CsvItemManager(filename, tab_name=tab_name, **kwargs) - elif filename.endswith(".tsv"): - escaping = kwargs.pop('escaping', None) - tab_name = kwargs.pop('tab_name', None) - reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) - else: - raise LoadArgumentsError(f"Unknown file type: {filename}") + reader_agent_class = TableSetManagerRegistry.manager_for_filename(filename) + reader_agent = reader_agent_class(filename, **kwargs) return reader_agent @classmethod - def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None) -> AnyJsonData: + def load(cls, filename: str, + tab_name: Optional[str] = None, + escaping: Optional[bool] = None, + schemas: Optional[Dict] = None, + autoload_schemas: Optional[bool] = None, + **kwargs) -> Dict[str, List[AnyJsonData]]: + """ + Given a filename and various options + """ manager = 
cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas, - autoload_schemas=autoload_schemas) + autoload_schemas=autoload_schemas, **kwargs) return manager.load_content() diff --git a/pyproject.toml b/pyproject.toml index aaa4371f7..b3e907b9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.9.0.1b2" # to become "7.10.0" +version = "7.9.0.1b3" # to become "7.10.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index b940877f5..0017bd16e 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -30,12 +30,13 @@ classproperty, classproperty_cached, classproperty_cached_each_subclass, Singleton, NamedObject, obsolete, ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, - MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, + MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, pad_to, JsonLinesReader, ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, raises_regexp, MockId, MockLog, input_series, ) +from typing import Any, Dict, List from unittest import mock @@ -1094,7 +1095,7 @@ def test_lockout_manager(): protected_action = "simulated action" - # The function now() will get us the time. This assure us that binding datetime.datetime + # The function now() will get us the time. This assures us that binding datetime.datetime # will not be affecting us. now = datetime_module.datetime.now @@ -1197,7 +1198,7 @@ def test_rate_manager(): # PyCharm thinks this is not used. 
-kmp 26-Jul-2020 # r = RateManager(interval_seconds=60, safety_seconds=1, allowed_attempts=4) - # The function now() will get us the time. This assure us that binding datetime.datetime + # The function now() will get us the time. This assures us that binding datetime.datetime # will not be affecting us. now = datetime_module.datetime.now @@ -1885,7 +1886,7 @@ def test_cached_field_mocked(self): assert field.get() == val5 assert field.get() == val5 - dt.sleep(self.DEFAULT_TIMEOUT) # Fast forward to where we're going to refill again + dt.sleep(self.DEFAULT_TIMEOUT) # Fast-forward to where we're going to refill again val6 = field.get() assert val6 != val5 @@ -2077,7 +2078,7 @@ def test_copy_json(obj): def test_copy_json_side_effects(): - obj = {'foo': [1, 2, 3], 'bar': [{'x': 4, 'y': 5}, {'x': 2, 'y': 7}]} + obj: Dict[str, Any] = {'foo': [1, 2, 3], 'bar': [{'x': 4, 'y': 5}, {'x': 2, 'y': 7}]} obj_copy = copy_json(obj) obj['foo'][1] = 20 obj['bar'][0]['y'] = 500 # NoQA - PyCharm wrongly fears there are type errors in this line, that it will fail. @@ -2931,7 +2932,7 @@ class SubClock(Clock): assert str(exc.value) == ("The subclasses= argument to classproperty_cached.reset must not be False" " because classproperty_cached does not use per-subclass caches.") - # This will clear SubClock cache, bu that's shared with the Clock cache, so both will clear. + # This will clear SubClock cache, but that's shared with the Clock cache, so both will clear. 
assert classproperty_cached.reset(instance_class=SubClock, attribute_name='sample') is True c_t5 = Clock.sample # This should recompute Clock.sample cache, which is shared by SubCLock @@ -3285,7 +3286,7 @@ def test_deduplicate_list(): xlen = len(x) assert sorted(deduplicate_list(x)) == ['a', 'b', 'c'] - assert len(x) == xlen # make sure there was no side-effect to the original list + assert len(x) == xlen # make sure there was no side effect to the original list y = ['a'] y0 = deduplicate_list(y) @@ -3495,3 +3496,85 @@ def test_map_chunked(): res = map_chunked(lambda x: ''.join(x), "abcdefghij", chunk_size=4, reduce=lambda x: '.'.join(x)) assert res == 'abcd.efgh.ij' + + +def test_pad_to(): + + assert pad_to(5, []) == [None, None, None, None, None] + assert pad_to(5, [], padding='foo') == ['foo', 'foo', 'foo', 'foo', 'foo'] + + assert pad_to(5, ['x']) == ['x', None, None, None, None] + assert pad_to(5, ['x'], padding='foo') == ['x', 'foo', 'foo', 'foo', 'foo'] + + six_elements = ['a', 'b', 'c', 'd', 'e', 'f'] + + assert pad_to(5, six_elements) == six_elements + assert pad_to(5, six_elements, padding='foo') + + +def test_json_lines_reader_dicts(): + + print() # start on a fresh line + + mfs = MockFileSystem() + + with mfs.mock_exists_open_remove(): + + item1 = {"foo": 1, "bar": 2} + item2 = {"foo": 3, "bar": 4} + + item1_str = json.dumps(item1) + item2_str = json.dumps(item2) + + sample_lines = [item1_str, item2_str] + + sample_filename = "somefile.jsonl" + + with io.open(sample_filename, 'w') as fp: + for line in sample_lines: + print(line, file=fp) + + for file, content in mfs.files.items(): + print("=" * 20, file, "=" * 20) + print(content.decode('utf-8')) + print("=" * 80) + + with io.open(sample_filename) as fp: + assert [line for line in JsonLinesReader(fp)] == [item1, item2] + + +def test_json_lines_reader_lists(): + + print() # start on a fresh line + + mfs = MockFileSystem() + + with mfs.mock_exists_open_remove(): + + item1 = {"foo": 1, "bar": 2} + item2 = 
{"foo": 3, "bar": 4} + + headers: List[str] = list(item1.keys()) + + item1_str = json.dumps([item1[header] for header in headers]) + item2_str = json.dumps([item2[header] for header in headers]) + + sample_lines = [item1_str, item2_str] + + sample_filename = "somefile.jsonl" + + with io.open(sample_filename, 'w') as fp: + + print(json.dumps(headers), file=fp) + for line in sample_lines: + print(line, file=fp) + + for file, content in mfs.files.items(): + print("=" * 20, file, "=" * 20) + print(content.decode('utf-8')) + print("=" * 80) + + with io.open(sample_filename) as fp: + parsed = [line for line in JsonLinesReader(fp)] + expected = [item1, item2] + assert parsed == expected diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 8557e1278..ae3096632 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -376,13 +376,13 @@ def test_xlsx_manager_load_csv(): def test_xlsx_item_manager_load_content(): - it = XlsxItemManager(SAMPLE_XLSX_FILE) + it = XlsxItemManager(SAMPLE_XLSX_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_item_manager_load(): - assert XlsxItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert XlsxItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_item_manager_load_csv(): @@ -414,19 +414,19 @@ def test_csv_manager_load_csv(): def test_csv_item_manager_load_content(): - it = CsvItemManager(SAMPLE_CSV_FILE) + it = CsvItemManager(SAMPLE_CSV_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT def test_csv_item_manager_load(): - assert CsvItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert CsvItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT def test_csv_item_manager_load_csv(): with pytest.raises(LoadArgumentsError) as exc: - CsvItemManager.load(SAMPLE_XLSX_FILE) + 
CsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' ' expects only .csv filenames:') @@ -460,30 +460,30 @@ def test_tsv_manager_load_csv(): def test_tsv_item_manager_load_content(): - it = TsvItemManager(SAMPLE_TSV_FILE) + it = TsvItemManager(SAMPLE_TSV_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT def test_tsv_item_manager_load(): - assert TsvItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + assert TsvItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT def test_tsv_item_manager_load_csv(): with pytest.raises(LoadArgumentsError) as exc: - TsvItemManager.load(SAMPLE_XLSX_FILE) + TsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' ' expects only .tsv or .tsv.txt filenames:') def test_item_manager_load(): - assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT with pytest.raises(LoadArgumentsError) as exc: ItemManager.load("something.else") @@ -492,9 +492,9 @@ def test_item_manager_load(): def test_load_items(): - assert load_items(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert load_items(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert load_items(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert load_items(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT 
with pytest.raises(LoadArgumentsError) as exc: load_items("something.else")