Skip to content

Commit

Permalink
Small cosmetic changes and some additional support for upcoming work.
Browse files Browse the repository at this point in the history
  • Loading branch information
netsettler committed Aug 25, 2023
1 parent 3852e56 commit 660df9c
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 60 deletions.
8 changes: 6 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,19 @@ Change Log

* Function ``load_items`` that does the same as ``ItemManager.load``.

* Various low-level implementation classes such as:
* Various lower-level implementation classes such as:

* Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data
from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.

* Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data
from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.

* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 <https://hms-dbmi.atlassian.net/browse/C4-1086>`_).
* New functionality in ``misc_utils``:

* New function ``is_uuid`` (migrated from Fourfront)
* New function ``pad_to``
* New class ``JsonLinesReader``


7.9.0
Expand Down
41 changes: 41 additions & 0 deletions dcicutils/misc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import inspect
import math
import io
import json
import os
import logging
import pytz
Expand Down Expand Up @@ -2329,3 +2330,43 @@ def parse_in_radix(text: str, *, radix: int):
except Exception:
pass
raise ValueError(f"Unable to parse: {text!r}")


def pad_to(target_size: int, data: list, *, padding=None):
    """
    Returns the given list, extended on the right with copies of the padding value
    if it has fewer than target_size elements.

    A list that is already long enough is returned unchanged (the very same object, never truncated).
    Otherwise a new list is returned, and the list passed in is left unmodified.

    :param target_size: the minimum length the result should have
    :param data: the list to pad
    :param padding: the value to use for each added element (default None)
    """
    shortfall = target_size - len(data)
    if shortfall <= 0:
        return data
    return data + [padding] * shortfall


class JsonLinesReader:
    """
    Iterates over a stream of JSON lines (one JSON value per line), yielding one dictionary per data line.

    Two layouts are supported:

    * If the first line is a JSON list, it is taken as the headers and is not itself yielded.
      Every later line must then also be a list, and is yielded as a dictionary that pairs each
      header with the value in the corresponding position.
    * Otherwise, every line (including the first) must be a JSON object, and is yielded as-is.

    Blank lines are skipped.
    """

    def __init__(self, fp, padded=False, padding=None):
        """
        :param fp: an iterable of lines (such as an open text file), each holding one JSON value
        :param padded: if true, rows shorter than the header line are extended to the header length
        :param padding: the value used for the added cells when padded is true
        """
        self.fp = fp
        self.padded: bool = padded
        self.padding = padding
        self.headers = None  # Might change after we see first line

    def __iter__(self):
        # Forget any headers seen on an earlier pass. Otherwise a stale header list (paired with
        # n_headers restarting at 0) would be applied to a stream whose first line is not a list.
        self.headers = None
        first_line = True
        n_headers = 0
        for raw_line in self.fp:
            if not raw_line.strip():
                # json.loads cannot parse an empty string, so a blank line is skipped rather than
                # being allowed to abort the whole read.
                continue
            line = json.loads(raw_line)
            if first_line:
                first_line = False
                if isinstance(line, list):
                    self.headers = line
                    n_headers = len(line)
                    continue
            # If a row is longer than the headers, zip() drops the extra cells, which lets the user
            # put comments beyond our table. If a row is shorter than the headers, it is extended
            # with self.padding when padding was requested; otherwise the trailing headers are
            # just absent from the resulting dictionary.
            if self.headers:
                if not isinstance(line, list):
                    raise Exception("If the first line is a list, all lines must be.")
                if self.padded and len(line) < n_headers:
                    line = pad_to(n_headers, line, padding=self.padding)
                yield dict(zip(self.headers, line))
            elif isinstance(line, dict):
                yield line
            else:
                raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}")
119 changes: 82 additions & 37 deletions dcicutils/sheet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
import csv
import io
import openpyxl
import os
import uuid

from dcicutils.common import AnyJsonData
from dcicutils.env_utils import public_env_name, EnvUtils
from dcicutils.ff_utils import get_schema
from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize
from dcicutils.misc_utils import ignored, PRINT
from dcicutils.misc_utils import ignored, PRINT, pad_to
from dcicutils.task_utils import pmap
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.workbook.workbook import Workbook
from tempfile import TemporaryFile
from typing import Any, Dict, Iterable, List, Optional, Union
from typing import Any, Dict, Iterable, List, Optional, Type, Union


Header = str
Expand Down Expand Up @@ -334,7 +335,7 @@ def __init__(self, **kwargs):

# TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
@classmethod
def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]:
def load(cls, filename: str, **kwargs) -> Dict[str, List[AnyJsonData]]:
"""
Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data.
For more information, see documentation of AbstractTableSetManager.
Expand All @@ -354,6 +355,8 @@ class BasicTableSetManager(AbstractTableSetManager):
of this where there's only one set of headers and only one block of content.
"""

ALLOWED_FILE_EXTENSIONS: List[str] = []

def __init__(self, filename: str, **kwargs):
super().__init__(**kwargs)
self.filename: str = filename
Expand Down Expand Up @@ -387,17 +390,26 @@ def load_content(self) -> Any:


class TableSetManager(BasicTableSetManager):

ALLOWED_FILE_EXTENSIONS = None
"""
This is the base class for all things that read tablesets. Those may be:
* Excel workbook readers (.xlsx)
* Comma-separated file readers (.csv)
* Tab-separated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright
refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt)
Unimplemented formats that could easily be made to do the same thing:
* JSON files
* JSON lines files
* YAML files
"""

@classmethod
def load(cls, filename: str) -> AnyJsonData:
def load(cls, filename: str, **kwargs) -> AnyJsonData:
if cls.ALLOWED_FILE_EXTENSIONS:
if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS):
raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only"
f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}")

table_set_manager: TableSetManager = cls(filename)
table_set_manager: TableSetManager = cls(filename, **kwargs)
return table_set_manager.load_content()

def __init__(self, filename: str, **kwargs):
Expand Down Expand Up @@ -432,6 +444,33 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
return prefer_number(value)


class TableSetManagerRegistry:
    """
    A registry that maps filename suffixes (such as '.csv') to the TableSetManager class registered for them.
    """

    # Maps each claimed suffix (including its leading dot) to the class that claimed it.
    ALL_TABLE_SET_MANAGERS: Dict[str, Type["TableSetManager"]] = {}

    @classmethod
    def register(cls, class_to_register: Type["TableSetManager"]):
        """
        Registers the given class for each suffix listed in its ALLOWED_FILE_EXTENSIONS.
        The class is returned, so this can be used as a class decorator.

        :param class_to_register: the class to associate with its declared suffixes
        :raises Exception: if one of those suffixes was already claimed
        """
        # TableSetManager itself declares ALLOWED_FILE_EXTENSIONS = None, so a class that does not
        # override it is treated as declaring no extensions rather than failing on iteration.
        for ext in class_to_register.ALLOWED_FILE_EXTENSIONS or []:
            existing = cls.ALL_TABLE_SET_MANAGERS.get(ext)
            if existing:
                raise Exception(f"Tried to define {class_to_register} to extension {ext},"
                                f" but {existing} already claimed that.")
            cls.ALL_TABLE_SET_MANAGERS[ext] = class_to_register
        return class_to_register

    @classmethod
    def manager_for_filename(cls, filename: str) -> Type["TableSetManager"]:
        """
        Returns the class registered for the longest matching suffix of the given filename,
        so that 'x.tsv.txt' is tried as '.tsv.txt' before '.txt'.

        The match is done on the lowercased filename, as TableSetManager.load does for its own
        extension check, so suffixes are expected to be registered in lowercase.

        :param filename: the name (or path) of the file to find a manager class for
        :raises LoadArgumentsError: if no registered suffix matches
        """
        base = os.path.basename(filename).lower()
        # The text before the first dot is the file's stem, not an extension, so it is never a
        # candidate. (Otherwise an extensionless file named 'csv' would match '.csv'.)
        suffix_parts = base.split('.')[1:]
        while suffix_parts:
            suffix = f".{'.'.join(suffix_parts)}"
            found = cls.ALL_TABLE_SET_MANAGERS.get(suffix)
            if found:
                return found
            suffix_parts = suffix_parts[1:]
        raise LoadArgumentsError(f"Unknown file type: {filename}")


class XlsxManager(TableSetManager):
"""
This implements the mechanism to get a series of rows out of the sheets in an XLSX file.
Expand Down Expand Up @@ -484,7 +523,7 @@ class SchemaAutoloadMixin(AbstractTableSetManager):

SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
CACHE_SCHEMAS = True # Controls whether we're doing caching at all
AUTOLOAD_SCHEMAS_DEFAULT = False
AUTOLOAD_SCHEMAS_DEFAULT = True

def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, **kwargs):
if portal_env is None:
Expand Down Expand Up @@ -592,36 +631,43 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
return ItemTools.parse_item_value(value, context=self._instaguid_context_table)


@TableSetManagerRegistry.register
class XlsxItemManager(ItemManagerMixin, XlsxManager):
"""
This layers item-style row processing functionality on an XLSX file.
"""
pass


class CsvManager(TableSetManager):
"""
This implements the mechanism to get a series of rows out of the sheet in a csv file,
returning a result that still looks like there could have been multiple tabs.
"""

ALLOWED_FILE_EXTENSIONS = ['.csv']
class SingleTableMixin(AbstractTableSetManager):

DEFAULT_TAB_NAME = 'Sheet1'

def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs):
super().__init__(filename=filename, **kwargs)
def __init__(self, tab_name: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
self.tab_name = tab_name or self.DEFAULT_TAB_NAME

@property
def tabnames(self) -> List[str]:
return [self.tab_name]


class CsvManager(SingleTableMixin, TableSetManager):
"""
This implements the mechanism to get a series of rows out of the sheet in a csv file,
returning a result that still looks like there could have been multiple tabs.
"""

ALLOWED_FILE_EXTENSIONS = ['.csv']

def __init__(self, filename: str, **kwargs):
super().__init__(filename=filename, **kwargs)

def _get_reader_agent(self) -> CsvReader:
return self._get_csv_reader(self.filename)
return self._get_reader_agent_for_filename(self.filename)

@classmethod
def _get_csv_reader(cls, filename) -> CsvReader:
def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
return csv.reader(open_text_input_file_respecting_byte_order_mark(filename))

PAD_TRAILING_TABS = True
Expand All @@ -630,9 +676,8 @@ def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
headers = self.tab_headers(tabname)
n_headers = len(headers)
for row_data in self.reader_agent:
n_cols = len(row_data)
if self.PAD_TRAILING_TABS and n_cols < n_headers:
row_data = row_data + [''] * (n_headers - n_cols)
if self.PAD_TRAILING_TABS:
row_data = pad_to(n_headers, row_data, padding='')
yield row_data

def _create_tab_processor_state(self, tabname: str) -> Headers:
Expand All @@ -647,6 +692,7 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An
for i, row_datum in enumerate(row_data)}


@TableSetManagerRegistry.register
class CsvItemManager(ItemManagerMixin, CsvManager):
"""
This layers item-style row processing functionality on a CSV file.
Expand All @@ -666,7 +712,7 @@ def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs):
self.escaping: bool = escaping or False

@classmethod
def _get_csv_reader(cls, filename) -> CsvReader:
def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t')

def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
Expand Down Expand Up @@ -699,6 +745,7 @@ def expand_escape_sequences(cls, text: str) -> str:
return s.getvalue()


@TableSetManagerRegistry.register
class TsvItemManager(ItemManagerMixin, TsvManager):
"""
This layers item-style row processing functionality on a TSV file.
Expand All @@ -714,24 +761,22 @@ class ItemManager(AbstractTableSetManager):

@classmethod
def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager:
if filename.endswith(".xlsx"):
reader_agent = XlsxItemManager(filename, **kwargs)
elif filename.endswith(".csv"):
tab_name = kwargs.pop('tab_name', None)
reader_agent = CsvItemManager(filename, tab_name=tab_name, **kwargs)
elif filename.endswith(".tsv"):
escaping = kwargs.pop('escaping', None)
tab_name = kwargs.pop('tab_name', None)
reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs)
else:
raise LoadArgumentsError(f"Unknown file type: {filename}")
reader_agent_class = TableSetManagerRegistry.manager_for_filename(filename)
reader_agent = reader_agent_class(filename, **kwargs)
return reader_agent

@classmethod
def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None) -> AnyJsonData:
def load(cls, filename: str,
tab_name: Optional[str] = None,
escaping: Optional[bool] = None,
schemas: Optional[Dict] = None,
autoload_schemas: Optional[bool] = None,
**kwargs) -> Dict[str, List[AnyJsonData]]:
"""
Given a filename and various options
"""
manager = cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas,
autoload_schemas=autoload_schemas)
autoload_schemas=autoload_schemas, **kwargs)
return manager.load_content()


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "7.9.0.1b2" # to become "7.10.0"
version = "7.9.0.1b3" # to become "7.10.0"
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
Loading

0 comments on commit 660df9c

Please sign in to comment.