Use the Pandas expr tree for preflighting.

This requires `extract_nest_names` to become a method on `NestedFrame`, so that the evaluation context is available at parse time: the Pandas `Expr` parser eagerly evaluates parts of the expression while it builds the tree. Resolves #174.
lincc-frameworks · Nov 14, 2024 · a86a532 · a86a532
1 parent 402ab66
commit a86a532
Show file tree

Hide file tree

Showing 4 changed files with 151 additions and 76 deletions.
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -10,9 +10,11 @@
 from pandas._libs import lib
 from pandas._typing import Any, AnyAll, Axis, IndexLabel
 from pandas.api.extensions import no_default
+from pandas.core.computation import ops
+from pandas.core.computation.eval import Expr, ensure_scope
 from pandas.core.computation.expr import PARSERS, PandasExprVisitor
+from pandas.core.computation.parsing import clean_column_name
 
-from nested_pandas.nestedframe.utils import extract_nest_names
 from nested_pandas.series.dtype import NestedDtype
 from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct
 
@@ -79,6 +81,22 @@ class _NestResolver(dict):
     def __init__(self, outer: NestedFrame):
+        """Bind the resolver to ``outer`` and register a field resolver per nested column."""
         self._outer = outer
         super().__init__()
+        # Pre-load the field resolvers for all columns which are known at present.
+        # A nested column that is not registered yet is added on first lookup
+        # by __getitem__.
+        for column in outer.nested_columns:
+            self._initialize_field_resolver(column, outer)
+
+    def _initialize_field_resolver(self, column: str, outer: NestedFrame):
+        """
+        Register a field resolver for the nested column ``column`` of ``outer``.
+
+        The resolver is stored under the column's own name.  When the name is
+        not identifier-like (for instance, it contains spaces), a second
+        resolver for the same column is stored under the cleaned name as well.
+        """
+        # Go through the dict implementation directly rather than this
+        # class's own __setitem__.
+        parent = super()
+        parent.__setitem__(column, _NestedFieldResolver(column, outer))
+        alias = clean_column_name(column)
+        if alias == column:
+            # Already identifier-like: no alias is needed.
+            return
+        # The alias captures references made through the cleaned name by
+        # the Pandas evaluator.
+        parent.__setitem__(alias, _NestedFieldResolver(column, outer))
 
     def __contains__(self, item):
         top_nest = item if "." not in item else item.split(".")[0].strip()
@@ -89,7 +107,7 @@ def __getitem__(self, item):
         if not super().__contains__(top_nest):
             if top_nest not in self._outer.nested_columns:
                 raise KeyError(f"Unknown nest {top_nest}")
-            super().__setitem__(top_nest, _NestedFieldResolver(top_nest, self._outer))
+            self._initialize_field_resolver(top_nest, self._outer)
         return super().__getitem__(top_nest)
 
     def __setitem__(self, item, _):
@@ -133,6 +151,48 @@ def __getattr__(self, item_name: str):
         raise AttributeError(f"No attribute {item_name}")
 
 
+def _subexprs_by_nest(parents: list, node) -> dict[str, list]:
+    """
+    Given an expression which contains references to both base and nested
+    columns, return a dictionary of the sub-expressions that should be
+    evaluated independently, keyed by nesting context.
+
+    The key of the dictionary is the name of the nested column, and will
+    be a blank string in the case of base columns.  The value is a list
+    of the parent nodes that lead to sub-expressions that can be evaluated
+    successfully.
+
+    While this is not in use today for automatically splitting expressions,
+    it can be used to detect whether an expression is suitably structured
+    for evaluation: the returned dictionary should have a single key.
+
+    Parameters
+    ----------
+    parents : list
+        Value recorded for each leaf term found beneath ``node``.  It is
+        passed unchanged to every recursive call.
+    node
+        A node of a parsed Pandas expression tree (an ``ops.Term`` or an
+        ``ops.Op``).  Any other object contributes nothing.
+
+    Returns
+    -------
+    dict[str, list]
+        One entry per nesting context referenced beneath ``node``.  Empty
+        when only constants or unrecognized nodes were found.
+    """
+    if isinstance(node, ops.Term) and not isinstance(node, ops.Constant):
+        if isinstance(node.value, _SeriesFromNest):
+            return {node.value.nest_name: parents}
+        return {getattr(node, "upper_name", ""): parents}
+    if not isinstance(node, ops.Op):
+        return {}
+    # Walk every operand, not just ``lhs``/``rhs``.  Only binary operators
+    # carry those two attributes; unary operators (``~``, ``not``, ``-``) and
+    # function calls are also ``ops.Op`` nodes, and skipping their operands
+    # would hide the columns they reference from the preflight check.
+    result: dict[str, list] = {}
+    for source in node.operands:
+        child = _subexprs_by_nest(parents, source)
+        for k, v in child.items():
+            # NOTE(review): the child's list is appended as a single element,
+            # so for mixed expressions the values gain one level of list
+            # nesting per tree level.  Confirm whether ``extend`` was intended
+            # before relying on the values rather than the keys.
+            result.setdefault(k, []).append(v)
+    # After a complete traversal across sources, check for any necessary splits.
+    # If it's homogeneous, move the split-node up the tree.
+    if len(result) == 1:
+        # Let the record of each parent node drift up the tree,
+        # and merge the subtrees into a single node, since by definition,
+        # this node is homogeneous over all of its children, and can
+        # be evaluated in a single step.
+        result = {k: [node] for k in result}
+    # If the result is either empty or has more than one key, leave the result
+    # alone.  Each key represents a different nest (with a blank string for the base),
+    # and the value is the highest point in the expression tree where the expression
+    # was still within a single nest.
+    return result
+
+
 class NestedFrame(pd.DataFrame):
     """A Pandas Dataframe extension with support for nested structure.
 
@@ -457,6 +517,39 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
         kwargs["parser"] = "nested-pandas"
         return super().eval(expr, **kwargs)
 
+    def extract_nest_names(
+        self,
+        expr: str,
+        local_dict=None,
+        global_dict=None,
+        resolvers=(),
+        level: int = 0,
+        target=None,
+        **kwargs,
+    ) -> set[str]:
+        """
+        Given a string expression, parse it and visit the resulting expression tree,
+        surfacing the nesting types.  The purpose is to identify expressions that attempt
+        to mix base and nested columns, or columns from two different nests.
+
+        Parameters
+        ----------
+        expr : str
+            The expression to inspect, in the syntax accepted by ``query``/``eval``.
+        local_dict, global_dict, target
+            Forwarded unchanged to the Pandas scope construction.
+        resolvers : iterable, default ()
+            Extra name resolvers, consulted ahead of the nest, column and index
+            resolvers of this frame.  Any iterable is accepted, not only a tuple.
+        level : int, default 0
+            Forwarded to the Pandas scope construction as ``level + 1``.
+            NOTE(review): presumably this is the number of extra stack frames
+            between this method and the frame whose variables ``@name``
+            references should see; confirm that callers which wrap this method
+            add their own frame.
+        **kwargs
+            Accepted and ignored, so that callers can forward the keyword
+            arguments they pass on to ``eval``.
+
+        Returns
+        -------
+        set[str]
+            The names of the nests referenced by ``expr``; base columns are
+            represented by the empty string.
+        """
+        index_resolvers = self._get_index_resolvers()
+        column_resolvers = self._get_cleaned_column_resolvers()
+        # Normalize to a tuple first: a list of resolvers is valid input for
+        # DataFrame.eval, but ``list + tuple`` would raise a TypeError here.
+        resolvers = tuple(resolvers) + (_NestResolver(self), column_resolvers, index_resolvers)
+        # Parser needs to be the "nested-pandas" parser.
+        # We also need the same variable context that eval() will have, so that
+        # backtick-quoted names are substituted as expected.
+        env = ensure_scope(
+            level + 1,
+            global_dict=global_dict,
+            local_dict=local_dict,
+            resolvers=resolvers,
+            target=target,
+        )
+        parsed_expr = Expr(expr, parser="nested-pandas", env=env)
+        expr_tree = parsed_expr.terms
+        # Only the nesting contexts matter here, not the sub-expressions found.
+        separable = _subexprs_by_nest([], expr_tree)
+        return set(separable.keys())
+
     def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None:
         """
         Query the columns of a NestedFrame with a boolean expression. Specified
@@ -514,7 +607,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
         # At present, the query expression must be either entirely within a
         # single nest, or have nothing but base columns.  Mixed structures are not
         # supported, so preflight the expression.
-        nest_names = extract_nest_names(expr)
+        nest_names = self.extract_nest_names(expr, **kwargs)
         if len(nest_names) > 1:
             raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
         result = self.eval(expr, **kwargs)

diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -594,6 +594,25 @@ def test_query():
     assert base["nested.d"].shape == (2,)
 
 
+def test_query_on_non_identifier_columns():
+    """
+    Column names are not required to be valid Python identifiers.  Check that
+    query() copes with such names, for a base column and for a nested column.
+    """
+    # Scenario taken from GH#174
+    base = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
+    flat = pd.DataFrame(
+        data={"a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    frame = base.add_nested(flat, "bad dog")
+
+    # Backtick-quoted base column; the queried frame itself must be unchanged.
+    by_base = frame.query("`good dog` > 3")
+    assert frame.shape == (3, 3)
+    assert by_base.shape == (2, 3)
+
+    # Backtick-quoted nested column with a sub-column reference.
+    by_nest = frame.query("`bad dog`.a > 2")
+    assert by_nest["bad dog"].nest["a"].size == 4
+
+
 def test_dropna():
     """Test that dropna works on all layers"""
 

diff --git a/tests/nested_pandas/utils/test_utils.py b/tests/nested_pandas/utils/test_utils.py
@@ -2,7 +2,6 @@
 import pandas as pd
 import pytest
 from nested_pandas import NestedFrame
-from nested_pandas.nestedframe.utils import extract_nest_names
 from nested_pandas.utils import count_nested
 
 
@@ -52,16 +51,41 @@ def test_check_expr_nesting():
     used to ensure that an expression-based query does not try to combine base and nested
     sub-expressions.
     """
-    assert extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"}
-    assert extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"}
-    assert extract_nest_names("-1.52e-5 < abc < 35.2e2") == {""}
-    assert extract_nest_names("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""}
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])
+    nested = pd.DataFrame(
+        data={
+            "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1],
+            "d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
+            "label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"],
+        },
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    b1 = base.add_nested(nested, "nested")
+    assert b1.extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"}
+    assert b1.extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"}
+    assert b1.extract_nest_names("-1.52e-5 < b < 35.2e2") == {""}
+
+    b2 = base.add_nested(nested.copy(), "n")
+    assert b2.extract_nest_names("(n.c > 1) and ((b + a) > (b - 1e-8)) or n.d > a") == {"n", ""}
+
+    abc = pd.DataFrame(
+        data={
+            "c": [3, 1, 4, 1, 5, 9, 2, 6, 5],
+            "d": [1, 4, 1, 2, 1, 3, 5, 6, 2],
+            "g": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
+        },
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    b3 = base.add_nested(abc, "abc").add_nested(abc, "c")
+    assert b3.extract_nest_names("abc.c > 2 & c.d < 5") == {"abc", "c"}
+
+    assert b3.extract_nest_names("(abc.d > 3) & (abc.c == [2, 5])") == {"abc"}
+    assert b3.extract_nest_names("(abc.d > 3)&(abc.g == 'f')") == {"abc"}
+    assert b3.extract_nest_names("(abc.d > 3) & (abc.g == 'f')") == {"abc"}
 
-    assert extract_nest_names("a.b > 2 & c.d < 5") == {"a", "c"}
+    assert b1.extract_nest_names("a>3") == {""}
+    assert b1.extract_nest_names("a > 3") == {""}
 
-    assert extract_nest_names("a>3") == {""}
-    assert extract_nest_names("a > 3") == {""}
-    assert extract_nest_names("test.a>5&b==2") == {"test", ""}
-    assert extract_nest_names("test.a > 5 & b == 2") == {"test", ""}
-    assert extract_nest_names("(a.b > 3)&(a.c == 'f')") == {"a"}
-    assert extract_nest_names("(a.b > 3) & (a.c == 'f')") == {"a"}
+    b4 = base.add_nested(nested, "test")
+    assert b4.extract_nest_names("test.c>5&b==2") == {"test", ""}
+    assert b4.extract_nest_names("test.c > 5 & b == 2") == {"test", ""}