Skip to content

Commit

Permalink
Fix error handling to be clearer.
Browse files Browse the repository at this point in the history
  • Loading branch information
netsettler committed Aug 24, 2023
1 parent 477c7a2 commit 09b4c43
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 54 deletions.
76 changes: 57 additions & 19 deletions dcicutils/sheet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import uuid

from dcicutils.common import AnyJsonData
from dcicutils.lang_utils import conjoined_list, maybe_pluralize
from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize
from dcicutils.misc_utils import ignored
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.workbook.workbook import Workbook
Expand All @@ -23,15 +23,39 @@
CsvReader = type(csv.reader(TemporaryFile()))


class LoadFailure(Exception):
"""
In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail,
but some errors are so confusing or so problematic that we need to just fail the load right away.
"""
pass


class LoadArgumentsError(LoadFailure):
"""
Errors of this class represent situations where we can't get started because
there's a problem with the given arguments.
"""
pass


class LoadTableError(LoadFailure):
"""
Errors of this class represent situations where we can't get started because
there's a problem with some table's syntax, for example headers that don't make sense.
"""
pass


def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False):
if kwargs:
unwanted = [f"{argname}={value!r}" if detailed else argname
for argname, value in kwargs.items()
if value is not None]
if unwanted:
does_not = "don't" if context_plural else "doesn't"
raise ValueError(f"{context} {does_not} use"
f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.")
raise LoadArgumentsError(f"{context} {does_not} use"
f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.")


def prefer_number(value: SheetCellValue):
Expand Down Expand Up @@ -165,7 +189,7 @@ def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
for parsed_header in parsed_headers:
parsed_header0 = parsed_header[0]
if isinstance(parsed_header0, int):
raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}")
raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}")
cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
return prototype

Expand All @@ -184,7 +208,7 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed
if key0 == n:
parent.append(placeholder)
elif key0 > n:
raise Exception("Numeric items must occur sequentially.")
raise LoadTableError("Numeric items must occur sequentially.")
elif isinstance(key0, str):
if key0 not in parent:
parent[key0] = placeholder
Expand Down Expand Up @@ -311,7 +335,7 @@ def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]:
Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data.
For more information, see documentation of AbstractTableSetManager.
"""
raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.")
raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA


class BasicTableSetManager(AbstractTableSetManager):
Expand All @@ -335,7 +359,8 @@ def tab_headers(self, tabname: str) -> Headers:
def tab_content(self, tabname: str) -> List[AnyJsonData]:
return self.content_by_tabname[tabname]

def _create_tab_processor_state(self, tabname: str) -> Any:
@classmethod
def _create_tab_processor_state(cls, tabname: str) -> Any:
"""
This method provides for the possibility that some parsers will want auxiliary state,
(such as parsed headers or a line count or a table of temporary names for objects to cross-link
Expand All @@ -347,16 +372,23 @@ def _create_tab_processor_state(self, tabname: str) -> Any:

def _get_reader_agent(self) -> Any:
"""This function is responsible for opening the workbook and returning a workbook object."""
raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.")
raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA

def load_content(self) -> Any:
raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.")
raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA


class TableSetManager(BasicTableSetManager):

ALLOWED_FILE_EXTENSIONS = None

@classmethod
def load(cls, filename: str) -> AnyJsonData:
if cls.ALLOWED_FILE_EXTENSIONS:
if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS):
raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only"
f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}")

table_set_manager: TableSetManager = cls(filename)
return table_set_manager.load_content()

Expand All @@ -365,21 +397,21 @@ def __init__(self, filename: str, **kwargs):

@property
def tabnames(self) -> List[str]:
raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..")
raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") # noQA

def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
"""
Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values.
"""
raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.")
raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") # noQA

def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData:
"""
This needs to take a state and whatever represents a row and
must return a list of objects representing column values.
What constitutes a processed up to the class, but other than that the result must be a JSON dictionary.
"""
raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.")
raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") # noQA

def load_content(self) -> AnyJsonData:
for tabname in self.tabnames:
Expand All @@ -401,6 +433,8 @@ class XlsxManager(TableSetManager):
This implements the mechanism to get a series of rows out of the sheets in an XLSX file.
"""

ALLOWED_FILE_EXTENSIONS = ['.xlsx']

@classmethod
def _all_rows(cls, sheet: Worksheet):
row_max = sheet.max_row
Expand Down Expand Up @@ -510,7 +544,7 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
return ItemTools.parse_item_value(value, context=self._instaguid_context_table)


class ItemXlsxManager(ItemManagerMixin, XlsxManager):
class XlsxItemManager(ItemManagerMixin, XlsxManager):
"""
This layers item-style row processing functionality on an XLSX file.
"""
Expand All @@ -523,6 +557,8 @@ class CsvManager(TableSetManager):
returning a result that still looks like there could have been multiple tabs.
"""

ALLOWED_FILE_EXTENSIONS = ['.csv']

DEFAULT_TAB_NAME = 'Sheet1'

def __init__(self, filename: str, tab_name: Optional[str] = None, escaping: bool = False, **kwargs):
Expand Down Expand Up @@ -564,7 +600,7 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An
for i, row_datum in enumerate(row_data)}


class ItemCsvManager(ItemManagerMixin, CsvManager):
class CsvItemManager(ItemManagerMixin, CsvManager):
"""
This layers item-style row processing functionality on a CSV file.
"""
Expand All @@ -576,12 +612,14 @@ class TsvManager(CsvManager):
TSV files are just CSV files with tabs instead of commas as separators.
(We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.)
"""
ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt']

@classmethod
def _get_csv_reader(cls, filename) -> CsvReader:
return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t')


class ItemTsvManager(ItemManagerMixin, TsvManager):
class TsvItemManager(ItemManagerMixin, TsvManager):
"""
This layers item-style row processing functionality on a TSV file.
"""
Expand All @@ -598,17 +636,17 @@ class ItemManager(AbstractTableSetManager):
def create_implementation_manager(cls, filename: str, escaping=None, **kwargs) -> BasicTableSetManager:
if filename.endswith(".xlsx"):
# unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs)
reader_agent = ItemXlsxManager(filename, escaping=escaping, **kwargs)
reader_agent = XlsxItemManager(filename, escaping=escaping, **kwargs)
elif filename.endswith(".csv"):
tab_name = kwargs.pop('tab_name', None)
# unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs)
reader_agent = ItemCsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs)
reader_agent = CsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs)
elif filename.endswith(".tsv"):
tab_name = kwargs.pop('tab_name', None)
# unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs)
reader_agent = ItemTsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs)
reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs)
else:
raise ValueError(f"Unknown file type: {filename}")
raise LoadArgumentsError(f"Unknown file type: {filename}")
return reader_agent

@classmethod
Expand Down
Loading

0 comments on commit 09b4c43

Please sign in to comment.