justin13601 · mmcdermott · Aug 18, 2024 · Aug 18, 2024 · Aug 18, 2024 · Aug 18, 2024
diff --git a/.github/workflows/python-build.yaml b/.github/workflows/python-build.yaml
@@ -0,0 +1,95 @@
+name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
+# NOTE(review): the name mentions TestPyPI, but no job in this workflow publishes there.
+on: push # fires on every push; the publish-to-pypi job additionally requires a tag ref
+
+jobs:
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
+    # Builds the wheel and sdist once and stores them as an artifact named python-package-distributions.
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5 # v4 targets the deprecated Node 16 action runtime
+        with:
+          python-version: "3.12"
+      - name: Install pypa/build
+        run: >-
+          python3 -m
+          pip install
+          build
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: python3 -m build
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+  publish-to-pypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to PyPI
+    if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
+    needs:
+      - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/es-aces # must match the `name` under [project] in pyproject.toml
+    permissions:
+      id-token: write # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+
+  github-release:
+    name: >-
+      Sign the Python 🐍 distribution 📦 with Sigstore
+      and upload them to GitHub Release
+    needs:
+      - publish-to-pypi
+    runs-on: ubuntu-latest
+    # Depends on publish-to-pypi, which itself only runs for tag refs.
+    permissions:
+      contents: write # IMPORTANT: mandatory for making GitHub Releases
+      id-token: write # IMPORTANT: mandatory for sigstore
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+      - name: Sign the dists with Sigstore
+        uses: sigstore/gh-action-sigstore-python@v3.0.0 # NOTE(review): tag was garbled here; confirm pin
+        with:
+          inputs: >-
+            ./dist/*.tar.gz
+            ./dist/*.whl
+      - name: Create GitHub Release
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: >-
+          gh release create
+          '${{ github.ref_name }}'
+          --repo '${{ github.repository }}'
+          --notes ""
+      - name: Upload artifact signatures to GitHub Release
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        # Upload to GitHub Release using the `gh` CLI.
+        # `dist/` contains the built packages, and the
+        # sigstore-produced signatures and certificates.
+        run: >-
+          gh release upload
+          '${{ github.ref_name }}' dist/**
+          --repo '${{ github.repository }}'
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -38,6 +38,7 @@ repos:
     rev: v2.2.0
     hooks:
       - id: autoflake
+        args: [--in-place, --remove-all-unused-imports]
 
   # python upgrading syntax to newer version
   - repo: https://github.com/asottile/pyupgrade

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,10 @@
+[build-system]
+requires = ["setuptools>=64", "setuptools-scm>=8.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
 [project]
 name = "es-aces"
-version = "0.3.0"
+dynamic = ["version"]
 authors = [
   { name="Justin Xu", email="[email protected]" },
   { name="Matthew McDermott", email="[email protected]" },
@@ -29,10 +33,6 @@ dependencies = [
 aces-cli = "aces.__main__:main"
 expand_shards = "aces.expand_shards:main"
 
-[build-system]
-requires = ["setuptools>=64", "setuptools-scm>=8.0", "wheel"]
-build-backend = "setuptools.build_meta"
-
 [project.optional-dependencies]
 dev = [
   "pre-commit", "pytest", "pytest-cov", "pytest-subtests", "rootutils", "hypothesis"

diff --git a/src/aces/__main__.py b/src/aces/__main__.py
@@ -153,16 +153,27 @@
 
     # query results
     result = query.query(task_cfg, predicates_df)
+    result_is_empty = len(result) == 0
 
     # save results to parquet
     os.makedirs(os.path.dirname(cfg.output_filepath), exist_ok=True)
 
     if cfg.data.standard.lower() == "meds":
-        result = result.rename({"subject_id": "patient_id"})
-        if "index_timestamp" in result.columns:
-            result = result.rename({"index_timestamp": "prediction_time"})
-        if "label" in result.columns:
-            result = result.rename({"label": "boolean_value"})
+        for in_col, out_col in [
+            ("subject_id", "patient_id"),
+            ("index_timestamp", "prediction_time"),
+            ("label", "boolean_value"),
+        ]:
+            if in_col in result.columns:
+                result = result.rename({in_col: out_col})
+        if "patient_id" not in result.columns:
+            if not result_is_empty:
+                raise ValueError("Output dataframe is missing a 'patient_id' column.")
+            else:
+                logger.warning("Output dataframe is empty; adding an empty patient ID column.")
+                result = result.with_columns(pl.lit(None, dtype=pl.Int64).alias("patient_id"))
+                result = result.head(0)
+
         result = get_and_validate_label_schema(result)
         pq.write_table(result, cfg.output_filepath)
     else:

diff --git a/src/aces/config.py b/src/aces/config.py
@@ -1110,6 +1110,65 @@
         Raises:
             FileNotFoundError: If the file does not exist.
             ValueError: If the file is not a ".yaml" file.
+
+        Examples:
+            >>> import tempfile
+            >>> yaml = ruamel.yaml.YAML(typ="safe", pure=True)
+            >>> config_dict = {
+            ...     "metadata": {'description': 'A test configuration file'},
+            ...     "description": 'this is a test',
+            ...     "predicates": {"admission": {"code": "admission"}},
+            ...     "trigger": "admission",
+            ...     "windows": {
+            ...         "start": {
+            ...             "start": None, "end": "trigger + 24h", "start_inclusive": True,
+            ...             "end_inclusive": True,
+            ...         }
+            ...     },
+            ... }
+            >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as f:
+            ...     config_path = Path(f.name)
+            ...     yaml.dump(config_dict, f)
+            ...     cfg = TaskExtractorConfig.load(config_path)
+            >>> cfg # doctest: +NORMALIZE_WHITESPACE
+            TaskExtractorConfig(predicates={'admission': PlainPredicateConfig(code='admission',
+                                                           value_min=None, value_max=None,
+                                                           value_min_inclusive=None, value_max_inclusive=None,
+                                                           static=False, other_cols={})},
+                                trigger=EventConfig(predicate='admission'),
+                                windows={'start': WindowConfig(start=None, end='trigger + 24h',
+                                                    start_inclusive=True, end_inclusive=True, has={},
+                                                    label=None, index_timestamp=None)},
+                                label_window=None, index_timestamp_window=None)
+            >>> predicates_dict = {
+            ...     "metadata": {'description': 'A test predicates file'},
+            ...     "description": 'this is a test',
+            ...     "patient_demographics": {"brown_eyes": {"code": "eye_color//BR"}},
+            ...     "predicates": {"admission": {"code": "admission"}},
+            ... }
+            >>> no_predicates_config = {k: v for k, v in config_dict.items() if k != "predicates"}
+            >>> with (tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as config_fp,
+            ...      tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as pred_fp):
+            ...     config_path = Path(config_fp.name)
+            ...     pred_path = Path(pred_fp.name)
+            ...     yaml.dump(no_predicates_config, config_fp)
+            ...     yaml.dump(predicates_dict, pred_fp)
+            ...     cfg = TaskExtractorConfig.load(config_path, pred_path)
+            >>> cfg # doctest: +NORMALIZE_WHITESPACE
+            TaskExtractorConfig(predicates={'admission': PlainPredicateConfig(code='admission',
+                                                           value_min=None, value_max=None,
+                                                           value_min_inclusive=None, value_max_inclusive=None,
+                                                           static=False, other_cols={}),
+                                            'brown_eyes': PlainPredicateConfig(code='eye_color//BR',
+                                                            value_min=None, value_max=None,
+                                                            value_min_inclusive=None,
+                                                            value_max_inclusive=None, static=True,
+                                                            other_cols={})},
+                                trigger=EventConfig(predicate='admission'),
+                                windows={'start': WindowConfig(start=None, end='trigger + 24h',
+                                                    start_inclusive=True, end_inclusive=True, has={},
+                                                    label=None, index_timestamp=None)},
+                                label_window=None, index_timestamp_window=None)
         """
         if isinstance(config_path, str):
             config_path = Path(config_path)
@@ -1146,8 +1205,10 @@
             predicates = predicates_dict.pop("predicates")
             patient_demographics = predicates_dict.pop("patient_demographics", None)
 
-            # Remove the description if it exists - currently unused except for readability in the YAML
+            # Remove the description or metadata keys if they exist - currently unused except for readability
+            # in the YAML
             _ = predicates_dict.pop("description", None)
+            _ = predicates_dict.pop("metadata", None)
 
             if predicates_dict:
                 raise ValueError(
@@ -1160,8 +1221,10 @@
         trigger = loaded_dict.pop("trigger")
         windows = loaded_dict.pop("windows", None)
 
-        # Remove the description if it exists - currently unused except for readability in the YAML
+        # Remove the description or metadata keys if they exist - currently unused except for readability
+        # in the YAML
         _ = loaded_dict.pop("description", None)
+        _ = loaded_dict.pop("metadata", None)
 
         if loaded_dict:
             raise ValueError(f"Unrecognized keys in configuration file: '{', '.join(loaded_dict.keys())}'")
@@ -1180,6 +1243,15 @@
 
         referenced_predicates = {pred for w in windows.values() for pred in w.referenced_predicates}
         referenced_predicates.add(trigger.predicate)
+        current_predicates = set(referenced_predicates)
+        special_predicates = {ANY_EVENT_COLUMN, START_OF_RECORD_KEY, END_OF_RECORD_KEY}
+        for pred in current_predicates - special_predicates:
+            if pred not in predicates:
+                raise KeyError(
+                    f"Something referenced predicate {pred} that wasn't defined in the configuration."
+                )
+            if "expr" in predicates[pred]:
+                referenced_predicates.update(DerivedPredicateConfig(**predicates[pred]).input_predicates)
 
         logger.info("Parsing predicates...")
         predicates_to_parse = {k: v for k, v in predicates.items() if k in referenced_predicates}

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
@@ -4,13 +4,13 @@
 
 root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)
 
-import subprocess
 import tempfile
 from pathlib import Path
 
 import polars as pl
 from loguru import logger
-from polars.testing import assert_frame_equal
+
+from .utils import assert_df_equal, run_command
 
 pl.enable_string_cache()
 
@@ -186,44 +186,7 @@
 }
 
 
-def run_command(script: str, hydra_kwargs: dict[str, str], test_name: str, expected_returncode: int = 0):
-    command_parts = [script] + [f"{k}={v}" for k, v in hydra_kwargs.items()]
-    command_out = subprocess.run(" ".join(command_parts), shell=True, capture_output=True)
-    stderr = command_out.stderr.decode()
-    stdout = command_out.stdout.decode()
-    if command_out.returncode != expected_returncode:
-        raise AssertionError(
-            f"{test_name} returned {command_out.returncode} (expected {expected_returncode})!\n"
-            f"stdout:\n{stdout}\nstderr:\n{stderr}"
-        )
-    return stderr, stdout
-
-
-def assert_df_equal(want: pl.DataFrame, got: pl.DataFrame, msg: str = None, **kwargs):
-    try:
-        assert_frame_equal(want, got, **kwargs)
-    except AssertionError as e:
-        pl.Config.set_tbl_rows(-1)
-        print(f"DFs are not equal: {msg}\nWant:")
-        print(want)
-        print("Got:")
-        print(got)
-        raise AssertionError(f"{msg}\n{e}") from e
-
-
 def test_e2e():
-    # Testing expand_shards
-    es_stderr, es_stdout = run_command("expand_shards train/3 tuning/1", {}, "expand_shards")
-    assert (
-        es_stdout == "train/0,train/1,train/2,tuning/0\n"
-    ), f"Expected 'train/0,train/1,train/2,tuning/0' but got '{es_stdout}'"
-
-    # Running with the empty directory
-    help_stderr, help_stdout = run_command("aces-cli", {}, "help", expected_returncode=1)
-    assert (
-        "Usage: aces-cli [OPTIONS]" in help_stdout
-    ), f"Expected help message not found in stdout. Got {help_stdout}"
-
     with tempfile.TemporaryDirectory() as d:
         data_dir = Path(d) / "sample_data"
         configs_dir = Path(d) / "sample_configs"

diff --git a/tests/test_expand_shards.py b/tests/test_expand_shards.py
@@ -0,0 +1,30 @@
+"""Tests the expand_shards CLI process."""
+
+import rootutils
+
+root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)
+
+import tempfile
+from pathlib import Path
+
+from .utils import run_command
+
+
+def test_e2e():
+    """Checks expand_shards on explicit "name/count" arguments and on a directory of shard files."""
+    _, es_stdout = run_command("expand_shards train/3 tuning/1", {}, "expand_shards")
+    assert (
+        es_stdout == "train/0,train/1,train/2,tuning/0\n"
+    ), f"Expected 'train/0,train/1,train/2,tuning/0' but got '{es_stdout}'"
+
+    with tempfile.TemporaryDirectory() as d:
+        data_dir = Path(d) / "sample_data"
+        want_shards = ["train/0", "train/1", "train_2", "tuning/0/1"]
+        for shard in want_shards:
+            shard_fp = data_dir / f"{shard}.parquet"
+            shard_fp.parent.mkdir(parents=True, exist_ok=True)  # the shard itself must stay a file
+            shard_fp.touch()
+
+        _, es_stdout = run_command(f"expand_shards {data_dir}", {}, "expand_shards")
+        got_shards = es_stdout.strip().split(",")
+        assert sorted(got_shards) == sorted(want_shards), f"Expected {want_shards} but got {got_shards}"
diff --git a/tests/test_help_message.py b/tests/test_help_message.py
@@ -0,0 +1,20 @@
+"""Tests the help message."""
+
+import rootutils
+
+root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)
+
+
+from .utils import run_command
+
+
+def test_e2e():
+    """Checks the aces-cli output both when run with no arguments and when run with -h."""
+    # (command, expected return code, text that must appear in stdout)
+    cases = [
+        ("aces-cli", 1, "Usage: aces-cli [OPTIONS]"),
+        ("aces-cli -h", 0, "== aces-cli =="),
+    ]
+    for command, returncode, banner in cases:
+        _, stdout = run_command(command, {}, "help", expected_returncode=returncode)
+        assert banner in stdout, f"Expected help message not found in stdout. Got {stdout}"