diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index d5b6262e4..ba2282aa5 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -6,7 +6,7 @@ import uuid from dcicutils.common import AnyJsonData -from dcicutils.lang_utils import conjoined_list, maybe_pluralize +from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook @@ -23,6 +23,30 @@ CsvReader = type(csv.reader(TemporaryFile())) +class LoadFailure(Exception): + """ + In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail, + but some errors are so confusing or so problematic that we need to just fail the load right away. + """ + pass + + +class LoadArgumentsError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with the given arguments. + """ + pass + + +class LoadTableError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with some table's syntax, for example headers that don't make sense. + """ + pass + + def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): if kwargs: unwanted = [f"{argname}={value!r}" if detailed else argname @@ -30,8 +54,8 @@ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): if value is not None] if unwanted: does_not = "don't" if context_plural else "doesn't" - raise ValueError(f"{context} {does_not} use" - f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + raise LoadArgumentsError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") def prefer_number(value: SheetCellValue): @@ -165,7 +189,7 @@ def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): for parsed_header in parsed_headers: parsed_header0 = parsed_header[0] if isinstance(parsed_header0, int): - raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}") + raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) return prototype @@ -184,7 +208,7 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed if key0 == n: parent.append(placeholder) elif key0 > n: - raise Exception("Numeric items must occur sequentially.") + raise LoadTableError("Numeric items must occur sequentially.") elif isinstance(key0, str): if key0 not in parent: parent[key0] = placeholder @@ -311,7 +335,7 @@ def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. For more information, see documentation of AbstractTableSetManager. """ - raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") + raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA class BasicTableSetManager(AbstractTableSetManager): @@ -335,7 +359,8 @@ def tab_headers(self, tabname: str) -> Headers: def tab_content(self, tabname: str) -> List[AnyJsonData]: return self.content_by_tabname[tabname] - def _create_tab_processor_state(self, tabname: str) -> Any: + @classmethod + def _create_tab_processor_state(cls, tabname: str) -> Any: """ This method provides for the possibility that some parsers will want auxiliary state, (such as parsed headers or a line count or a table of temporary names for objects to cross-link @@ -347,16 +372,23 @@ def _create_tab_processor_state(self, tabname: str) -> Any: def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" - raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA def load_content(self) -> Any: - raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA class TableSetManager(BasicTableSetManager): + ALLOWED_FILE_EXTENSIONS = None + @classmethod def load(cls, filename: str) -> AnyJsonData: + if cls.ALLOWED_FILE_EXTENSIONS: + if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS): + raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" + f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") + table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() @@ -365,13 +397,13 @@ def __init__(self, filename: str, **kwargs): @property def tabnames(self) -> List[str]: - raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") + raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") # noQA def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: """ Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. """ - raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") # noQA def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: """ @@ -379,7 +411,7 @@ def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> A must return a list of objects representing column values. What constitutes a processed up to the class, but other than that the result must be a JSON dictionary. """ - raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") # noQA def load_content(self) -> AnyJsonData: for tabname in self.tabnames: @@ -401,6 +433,8 @@ class XlsxManager(TableSetManager): This implements the mechanism to get a series of rows out of the sheets in an XLSX file. """ + ALLOWED_FILE_EXTENSIONS = ['.xlsx'] + @classmethod def _all_rows(cls, sheet: Worksheet): row_max = sheet.max_row @@ -510,7 +544,7 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: return ItemTools.parse_item_value(value, context=self._instaguid_context_table) -class ItemXlsxManager(ItemManagerMixin, XlsxManager): +class XlsxItemManager(ItemManagerMixin, XlsxManager): """ This layers item-style row processing functionality on an XLSX file. """ @@ -523,6 +557,8 @@ class CsvManager(TableSetManager): returning a result that still looks like there could have been multiple tabs. """ + ALLOWED_FILE_EXTENSIONS = ['.csv'] + DEFAULT_TAB_NAME = 'Sheet1' def __init__(self, filename: str, tab_name: Optional[str] = None, escaping: bool = False, **kwargs): @@ -564,7 +600,7 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An for i, row_datum in enumerate(row_data)} -class ItemCsvManager(ItemManagerMixin, CsvManager): +class CsvItemManager(ItemManagerMixin, CsvManager): """ This layers item-style row processing functionality on a CSV file. """ @@ -576,12 +612,14 @@ class TsvManager(CsvManager): TSV files are just CSV files with tabs instead of commas as separators. (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.) """ + ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt'] + @classmethod def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') -class ItemTsvManager(ItemManagerMixin, TsvManager): +class TsvItemManager(ItemManagerMixin, TsvManager): """ This layers item-style row processing functionality on a TSV file. """ @@ -598,17 +636,17 @@ class ItemManager(AbstractTableSetManager): def create_implementation_manager(cls, filename: str, escaping=None, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): # unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs) - reader_agent = ItemXlsxManager(filename, escaping=escaping, **kwargs) + reader_agent = XlsxItemManager(filename, escaping=escaping, **kwargs) elif filename.endswith(".csv"): tab_name = kwargs.pop('tab_name', None) # unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs) - reader_agent = ItemCsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) + reader_agent = CsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) elif filename.endswith(".tsv"): tab_name = kwargs.pop('tab_name', None) # unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs) - reader_agent = ItemTsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) + reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) else: - raise ValueError(f"Unknown file type: {filename}") + raise LoadArgumentsError(f"Unknown file type: {filename}") return reader_agent @classmethod diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index cfd20207b..4834c9fdd 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -9,10 +9,13 @@ # High-level interfaces ItemManager, load_items, # Low-level implementation - ItemTools, XlsxManager, ItemXlsxManager, - CsvManager, ItemCsvManager, TsvManager, ItemTsvManager, + BasicTableSetManager, + ItemTools, XlsxManager, XlsxItemManager, + CsvManager, CsvItemManager, TsvManager, TsvItemManager, # TypeHint, EnumHint, BoolHint, + # Error handling + LoadFailure, LoadArgumentsError, LoadTableError, # Utilities prefer_number, unwanted_kwargs, ) @@ -20,6 +23,34 @@ from .conftest_settings import TEST_DIR +def test_load_failure(): + + sample_message = "This is a test." + + load_failure_object = LoadFailure(sample_message) + assert isinstance(load_failure_object, LoadFailure) + assert str(load_failure_object) == sample_message + + +def test_load_argument_error(): + + sample_message = "This is a test." + + load_failure_object = LoadArgumentsError(sample_message) + assert isinstance(load_failure_object, LoadArgumentsError) + assert str(load_failure_object) == sample_message + + +def test_load_table_error(): + + sample_message = "This is a test." + + load_failure_object = LoadTableError(sample_message) + assert isinstance(load_failure_object, LoadTableError) + assert str(load_failure_object) == sample_message + + + def test_prefer_number(): assert prefer_number('') is None @@ -61,11 +92,16 @@ def test_unwanted_kwargs_without_error(): ]) def test_unwanted_kwargs_with_error(context, context_plural, detailed, kwargs, message): - with pytest.raises(ValueError) as exc: + with pytest.raises(LoadArgumentsError) as exc: unwanted_kwargs(context=context, kwargs=kwargs, context_plural=context_plural, detailed=detailed) assert str(exc.value) == message +def test_back_table_set_create_state(): + + assert BasicTableSetManager._create_tab_processor_state('some-tab') is None + + def test_item_tools_parse_sheet_header(): assert ItemTools.parse_sheet_header('.a') == ['a'] assert ItemTools.parse_sheet_header('a') == ['a'] @@ -108,7 +144,7 @@ def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype): def test_item_tools_compute_patch_prototype_errors(headers): parsed_headers = ItemTools.parse_sheet_headers(headers) - with pytest.raises(ValueError) as exc: + with pytest.raises(LoadTableError) as exc: ItemTools.compute_patch_prototype(parsed_headers) assert str(exc.value) == "A header cannot begin with a numeric ref: 0" @@ -205,6 +241,8 @@ def test_item_tools_set_path_value(): def test_item_tools_find_type_hint(): + assert ItemTools.find_type_hint(None, 'anything') is None + assert ItemTools.find_type_hint(['foo', 'bar'], None) is None assert ItemTools.find_type_hint(['foo', 'bar'], "something") is None assert ItemTools.find_type_hint(['foo', 'bar'], {}) is None @@ -304,13 +342,13 @@ def test_item_tools_find_type_hint(): SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_ITEM_CONTENT = {CsvItemManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') SAMPLE_TSV_FILE_RAW_CONTENT = {TsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_TSV_FILE_ITEM_CONTENT = {ItemTsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_TSV_FILE_ITEM_CONTENT = {TsvItemManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} def test_xlsx_manager_load_content(): @@ -326,26 +364,29 @@ def test_xlsx_manager_load(): def test_xlsx_manager_load_csv(): - with pytest.raises(Exception): + with pytest.raises(LoadArgumentsError) as exc: XlsxManager.load(SAMPLE_CSV_FILE) + assert str(exc.value).startswith('The TableSetManager subclass XlsxManager' + ' expects only .xlsx filenames:') -def test_item_xlsx_manager_load_content(): +def test_xlsx_item_manager_load_content(): - it = ItemXlsxManager(SAMPLE_XLSX_FILE) + it = XlsxItemManager(SAMPLE_XLSX_FILE) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_xlsx_manager_load(): - - assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT +def test_xlsx_item_manager_load(): + assert XlsxItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_xlsx_manager_load_csv(): - with pytest.raises(Exception): - ItemXlsxManager.load(SAMPLE_CSV_FILE) +def test_xlsx_item_manager_load_csv(): + with pytest.raises(LoadArgumentsError) as exc: + XlsxItemManager.load(SAMPLE_CSV_FILE) + assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' + ' expects only .xlsx filenames:') def test_csv_manager_load_content(): @@ -360,26 +401,29 @@ def test_csv_manager_load(): def test_csv_manager_load_csv(): - with pytest.raises(Exception): + with pytest.raises(LoadArgumentsError) as exc: CsvManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass CsvManager' + ' expects only .csv filenames:') -def test_item_csv_manager_load_content(): +def test_csv_item_manager_load_content(): - it = ItemCsvManager(SAMPLE_CSV_FILE) + it = CsvItemManager(SAMPLE_CSV_FILE) assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT -def test_item_csv_manager_load(): +def test_csv_item_manager_load(): - assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert CsvItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT -def test_item_csv_manager_load_csv(): - - with pytest.raises(Exception): - ItemCsvManager.load(SAMPLE_XLSX_FILE) +def test_csv_item_manager_load_csv(): + with pytest.raises(LoadArgumentsError) as exc: + CsvItemManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' + ' expects only .csv filenames:') def test_tsv_manager_load_content(): @@ -394,25 +438,29 @@ def test_tsv_manager_load(): def test_tsv_manager_load_csv(): - with pytest.raises(Exception): + with pytest.raises(LoadArgumentsError) as exc: TsvManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass TsvManager' + ' expects only .tsv or .tsv.txt filenames:') -def test_item_tsv_manager_load_content(): +def test_tsv_item_manager_load_content(): - it = ItemTsvManager(SAMPLE_TSV_FILE) + it = TsvItemManager(SAMPLE_TSV_FILE) assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT -def test_item_tsv_manager_load(): +def test_tsv_item_manager_load(): - assert ItemTsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + assert TsvItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT -def test_item_tsv_manager_load_csv(): +def test_tsv_item_manager_load_csv(): - with pytest.raises(Exception): - ItemTsvManager.load(SAMPLE_XLSX_FILE) + with pytest.raises(LoadArgumentsError) as exc: + TsvItemManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' + ' expects only .tsv or .tsv.txt filenames:') def test_item_manager_load(): @@ -421,8 +469,11 @@ def test_item_manager_load(): assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT - with pytest.raises(ValueError): + assert ItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + + with pytest.raises(LoadArgumentsError) as exc: ItemManager.load("something.else") + assert str(exc.value) == "Unknown file type: something.else" def test_load_items(): @@ -431,8 +482,9 @@ def test_load_items(): assert load_items(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT - with pytest.raises(ValueError): + with pytest.raises(LoadArgumentsError) as exc: load_items("something.else") + assert str(exc.value) == "Unknown file type: something.else" SAMPLE_CSV_FILE2_SCHEMAS = { @@ -456,7 +508,7 @@ def test_load_items(): } SAMPLE_CSV_FILE2_ITEM_CONTENT = { - ItemCsvManager.DEFAULT_TAB_NAME: [ + CsvItemManager.DEFAULT_TAB_NAME: [ {"name": "john", "sex": "M", "member": False}, {"name": "juan", "sex": "male", "member": True}, {"name": "igor", "sex": "unknown", "member": None},