Small cosmetic changes and some additional support for upcoming work.

4dn-dcic · Aug 25, 2023 · 660df9c · 660df9c
1 parent 3852e56
commit 660df9c
Show file tree

Hide file tree

Showing 6 changed files with 233 additions and 60 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -19,15 +19,19 @@ Change Log
 
     * Function ``load_items`` that does the same as ``ItemManager.load``.
 
-  * Various low-level implementation classes such as:
+  * Various lower-level implementation classes such as:
 
     * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data
       from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.
 
     * Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data
       from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.
 
-* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 <https://hms-dbmi.atlassian.net/browse/C4-1086>`_).
+* New functionality in ``misc_utils``:
+
+  * New function ``is_uuid`` (migrated from Fourfront)
+  * New function ``pad_to``
+  * New class ``JsonLinesReader``
 
 
 7.9.0

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
@@ -9,6 +9,7 @@
 import inspect
 import math
 import io
+import json
 import os
 import logging
 import pytz
@@ -2329,3 +2330,43 @@ def parse_in_radix(text: str, *, radix: int):
     except Exception:
         pass
     raise ValueError(f"Unable to parse: {text!r}")
+
+
+def pad_to(target_size: int, data: list, *, padding=None):
+    actual_size = len(data)
+    if actual_size < target_size:
+        data = data + [padding] * (target_size - actual_size)
+    return data
+
+
+class JsonLinesReader:
+
+    def __init__(self, fp, padded=False, padding=None):
+        self.fp = fp
+        self.padded: bool = padded
+        self.padding = padding
+        self.headers = None  # Might change after we see first line
+
+    def __iter__(self):
+        first_line = True
+        n_headers = 0
+        for raw_line in self.fp:
+            line = json.loads(raw_line)
+            if first_line:
+                first_line = False
+                if isinstance(line, list):
+                    self.headers = line
+                    n_headers = len(line)
+                    continue
+            # If a line is longer than we expect, ignore the extra elements. Let user put comments beyond our table.
+            # But if a line is shorter than we expect (and padding was requested), extend it with the padding value.
+            if self.headers:
+                if not isinstance(line, list):
+                    raise Exception("If the first line is a list, all lines must be.")
+                if self.padded and len(line) < n_headers:
+                    line = pad_to(n_headers, line, padding=self.padding)
+                yield dict(zip(self.headers, line))
+            elif isinstance(line, dict):
+                yield line
+            else:
+                raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}")
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
@@ -3,18 +3,19 @@
 import csv
 import io
 import openpyxl
+import os
 import uuid
 
 from dcicutils.common import AnyJsonData
 from dcicutils.env_utils import public_env_name, EnvUtils
 from dcicutils.ff_utils import get_schema
 from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize
-from dcicutils.misc_utils import ignored, PRINT
+from dcicutils.misc_utils import ignored, PRINT, pad_to
 from dcicutils.task_utils import pmap
 from openpyxl.worksheet.worksheet import Worksheet
 from openpyxl.workbook.workbook import Workbook
 from tempfile import TemporaryFile
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
 
 
 Header = str
@@ -334,7 +335,7 @@ def __init__(self, **kwargs):
 
     # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.)
     @classmethod
-    def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]:
+    def load(cls, filename: str, **kwargs) -> Dict[str, List[AnyJsonData]]:
         """
         Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data.
         For more information, see documentation of AbstractTableSetManager.
@@ -354,6 +355,8 @@ class BasicTableSetManager(AbstractTableSetManager):
     of this where there's only one set of headers and only one block of content.
     """
 
+    ALLOWED_FILE_EXTENSIONS: List[str] = []
+
     def __init__(self, filename: str, **kwargs):
         super().__init__(**kwargs)
         self.filename: str = filename
@@ -387,17 +390,26 @@ def load_content(self) -> Any:
 
 
 class TableSetManager(BasicTableSetManager):
-
-    ALLOWED_FILE_EXTENSIONS = None
+    """
+    This is the base class for all things that read tablesets. Those may be:
+    * Excel workbook readers (.xlsx)
+    * Comma-separated file readers (.csv)
+    * Tab-separated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright
+      refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt)
+    Unimplemented formats that could easily be made to do the same thing:
+    * JSON files
+    * JSON lines files
+    * YAML files
+    """
 
     @classmethod
-    def load(cls, filename: str) -> AnyJsonData:
+    def load(cls, filename: str, **kwargs) -> AnyJsonData:
         if cls.ALLOWED_FILE_EXTENSIONS:
             if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS):
                 raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only"
                                          f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}")
 
-        table_set_manager: TableSetManager = cls(filename)
+        table_set_manager: TableSetManager = cls(filename, **kwargs)
         return table_set_manager.load_content()
 
     def __init__(self, filename: str, **kwargs):
@@ -432,6 +444,33 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData:
         return prefer_number(value)
 
 
+class TableSetManagerRegistry:
+
+    ALL_TABLE_SET_MANAGERS: Dict[str, Type[TableSetManager]] = {}
+
+    @classmethod
+    def register(cls, class_to_register: Type[TableSetManager]):
+        for ext in class_to_register.ALLOWED_FILE_EXTENSIONS:
+            existing = cls.ALL_TABLE_SET_MANAGERS.get(ext)
+            if existing:
+                raise Exception(f"Tried to define {class_to_register} to extension {ext},"
+                                f" but {existing} already claimed that.")
+            cls.ALL_TABLE_SET_MANAGERS[ext] = class_to_register
+        return class_to_register
+
+    @classmethod
+    def manager_for_filename(cls, filename: str) -> Type[TableSetManager]:
+        base = os.path.basename(filename)
+        dotparts = base.split('.')
+        while dotparts:
+            suffix = f".{'.'.join(dotparts)}"
+            found = cls.ALL_TABLE_SET_MANAGERS.get(suffix)
+            if found:
+                return found
+            dotparts = dotparts[1:]
+        raise LoadArgumentsError(f"Unknown file type: {filename}")
+
+
 class XlsxManager(TableSetManager):
     """
     This implements the mechanism to get a series of rows out of the sheets in an XLSX file.
@@ -484,7 +523,7 @@ class SchemaAutoloadMixin(AbstractTableSetManager):
 
     SCHEMA_CACHE = {}  # Shared cache. Do not override. Use .clear_schema_cache() to clear it.
     CACHE_SCHEMAS = True  # Controls whether we're doing caching at all
-    AUTOLOAD_SCHEMAS_DEFAULT = False
+    AUTOLOAD_SCHEMAS_DEFAULT = True
 
     def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, **kwargs):
         if portal_env is None:
@@ -592,36 +631,43 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
         return ItemTools.parse_item_value(value, context=self._instaguid_context_table)
 
 
+@TableSetManagerRegistry.register
 class XlsxItemManager(ItemManagerMixin, XlsxManager):
     """
     This layers item-style row processing functionality on an XLSX file.
     """
     pass
 
 
-class CsvManager(TableSetManager):
-    """
-    This implements the mechanism to get a series of rows out of the sheet in a csv file,
-    returning a result that still looks like there could have been multiple tabs.
-    """
-
-    ALLOWED_FILE_EXTENSIONS = ['.csv']
+class SingleTableMixin(AbstractTableSetManager):
 
     DEFAULT_TAB_NAME = 'Sheet1'
 
-    def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs):
-        super().__init__(filename=filename, **kwargs)
+    def __init__(self, tab_name: Optional[str] = None, **kwargs):
+        super().__init__(**kwargs)
         self.tab_name = tab_name or self.DEFAULT_TAB_NAME
 
     @property
     def tabnames(self) -> List[str]:
         return [self.tab_name]
 
+
+class CsvManager(SingleTableMixin, TableSetManager):
+    """
+    This implements the mechanism to get a series of rows out of the sheet in a csv file,
+    returning a result that still looks like there could have been multiple tabs.
+    """
+
+    ALLOWED_FILE_EXTENSIONS = ['.csv']
+
+    def __init__(self, filename: str, **kwargs):
+        super().__init__(filename=filename, **kwargs)
+
     def _get_reader_agent(self) -> CsvReader:
-        return self._get_csv_reader(self.filename)
+        return self._get_reader_agent_for_filename(self.filename)
 
     @classmethod
-    def _get_csv_reader(cls, filename) -> CsvReader:
+    def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
         return csv.reader(open_text_input_file_respecting_byte_order_mark(filename))
 
     PAD_TRAILING_TABS = True
@@ -630,9 +676,8 @@ def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]:
         headers = self.tab_headers(tabname)
         n_headers = len(headers)
         for row_data in self.reader_agent:
-            n_cols = len(row_data)
-            if self.PAD_TRAILING_TABS and n_cols < n_headers:
-                row_data = row_data + [''] * (n_headers - n_cols)
+            if self.PAD_TRAILING_TABS:
+                row_data = pad_to(n_headers, row_data, padding='')
             yield row_data
 
     def _create_tab_processor_state(self, tabname: str) -> Headers:
@@ -647,6 +692,7 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An
                 for i, row_datum in enumerate(row_data)}
 
 
+@TableSetManagerRegistry.register
 class CsvItemManager(ItemManagerMixin, CsvManager):
     """
     This layers item-style row processing functionality on a CSV file.
@@ -666,7 +712,7 @@ def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs):
         self.escaping: bool = escaping or False
 
     @classmethod
-    def _get_csv_reader(cls, filename) -> CsvReader:
+    def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
         return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t')
 
     def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
@@ -699,6 +745,7 @@ def expand_escape_sequences(cls, text: str) -> str:
         return s.getvalue()
 
 
+@TableSetManagerRegistry.register
 class TsvItemManager(ItemManagerMixin, TsvManager):
     """
     This layers item-style row processing functionality on a TSV file.
@@ -714,24 +761,22 @@ class ItemManager(AbstractTableSetManager):
 
     @classmethod
     def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager:
-        if filename.endswith(".xlsx"):
-            reader_agent = XlsxItemManager(filename, **kwargs)
-        elif filename.endswith(".csv"):
-            tab_name = kwargs.pop('tab_name', None)
-            reader_agent = CsvItemManager(filename, tab_name=tab_name, **kwargs)
-        elif filename.endswith(".tsv"):
-            escaping = kwargs.pop('escaping', None)
-            tab_name = kwargs.pop('tab_name', None)
-            reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs)
-        else:
-            raise LoadArgumentsError(f"Unknown file type: {filename}")
+        reader_agent_class = TableSetManagerRegistry.manager_for_filename(filename)
+        reader_agent = reader_agent_class(filename, **kwargs)
         return reader_agent
 
     @classmethod
-    def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
-             schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None) -> AnyJsonData:
+    def load(cls, filename: str,
+             tab_name: Optional[str] = None,
+             escaping: Optional[bool] = None,
+             schemas: Optional[Dict] = None,
+             autoload_schemas: Optional[bool] = None,
+             **kwargs) -> Dict[str, List[AnyJsonData]]:
+        """
+        Given a filename and various options, returns the content loaded by the manager for that file type.
+        """
         manager = cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas,
-                                                    autoload_schemas=autoload_schemas)
+                                                    autoload_schemas=autoload_schemas, **kwargs)
         return manager.load_content()
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "7.9.0.1b2"  # to become "7.10.0"
+version = "7.9.0.1b3"  # to become "7.10.0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team <[email protected]>"]
 license = "MIT"