From a86a532e6caa3b258da762c3b031af0b52296d12 Mon Sep 17 00:00:00 2001 From: "Derek T. Jones" Date: Thu, 7 Nov 2024 02:05:09 +0000 Subject: [PATCH] Use the Pandas expr tree for preflighting. Requires `extract_nest_names` to be a method on `NestedFrame` so that the evaluation context is available at parsing time, since the Pandas Expr parsing does some eager evaluation. Resolves #174 . --- src/nested_pandas/nestedframe/core.py | 99 ++++++++++++++++++- src/nested_pandas/nestedframe/utils.py | 61 ------------ .../nestedframe/test_nestedframe.py | 19 ++++ tests/nested_pandas/utils/test_utils.py | 48 ++++++--- 4 files changed, 151 insertions(+), 76 deletions(-) delete mode 100644 src/nested_pandas/nestedframe/utils.py diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 192a421..26e0467 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -10,9 +10,11 @@ from pandas._libs import lib from pandas._typing import Any, AnyAll, Axis, IndexLabel from pandas.api.extensions import no_default +from pandas.core.computation import ops +from pandas.core.computation.eval import Expr, ensure_scope from pandas.core.computation.expr import PARSERS, PandasExprVisitor +from pandas.core.computation.parsing import clean_column_name -from nested_pandas.nestedframe.utils import extract_nest_names from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct @@ -79,6 +81,22 @@ class _NestResolver(dict): def __init__(self, outer: NestedFrame): self._outer = outer super().__init__() + # Pre-load the field resolvers for all columns which are known at present. 
+ for column in outer.nested_columns: + self._initialize_field_resolver(column, outer) + + def _initialize_field_resolver(self, column: str, outer: NestedFrame): + """ + Initialize a resolver for the given nested column, and also an alias + for it, in the case of column names that have spaces or are otherwise + not identifier-like. + """ + super().__setitem__(column, _NestedFieldResolver(column, outer)) + clean_id = clean_column_name(column) + # And once more for the cleaned name, if it's different. + # This allows us to capture references to it from the Pandas evaluator. + if clean_id != column: + super().__setitem__(clean_id, _NestedFieldResolver(column, outer)) def __contains__(self, item): top_nest = item if "." not in item else item.split(".")[0].strip() @@ -89,7 +107,7 @@ def __getitem__(self, item): if not super().__contains__(top_nest): if top_nest not in self._outer.nested_columns: raise KeyError(f"Unknown nest {top_nest}") - super().__setitem__(top_nest, _NestedFieldResolver(top_nest, self._outer)) + self._initialize_field_resolver(top_nest, self._outer) return super().__getitem__(top_nest) def __setitem__(self, item, _): @@ -133,6 +151,48 @@ def __getattr__(self, item_name: str): raise AttributeError(f"No attribute {item_name}") +def _subexprs_by_nest(parents: list, node) -> dict[str, list]: + """ + Given an expression which contains references to both base and nested + columns, return a dictionary of the sub-expressions that should be + evaluated independently, keyed by nesting context. + + The key of the dictionary is the name of the nested column, and will + be a blank string in the case of base columns. The value is a list + of the parent nodes that lead to sub-expressions that can be evaluated + successfully. + + While this is not in use today for automatically splitting expressions, + it can be used to detect whether an expression is suitably structured + for evaluation: the returned dictionary should have a single key. 
+ """ + if isinstance(node, ops.Term) and not isinstance(node, ops.Constant): + if isinstance(node.value, _SeriesFromNest): + return {node.value.nest_name: parents} + return {getattr(node, "upper_name", ""): parents} + if not isinstance(node, ops.Op): + return {} + sources = [getattr(node, "lhs", None), getattr(node, "rhs", None)] + result: dict[str, list] = {} + for source in sources: + child = _subexprs_by_nest(parents, source) + for k, v in child.items(): + result.setdefault(k, []).append(v) + # After a complete traversal across sources, check for any necessary splits. + # If it's homogeneous, move the split-node up the tree. + if len(result) == 1: + # Let the record of each parent node drift up the tree, + # and merge the subtrees into a single node, since by definition, + # this node is homogeneous over all of its children, and can + # be evaluated in a single step. + result = {k: [node] for k in result} + # If the result is either empty or has more than one key, leave the result + # alone. Each key represents a different nest (with a blank string for the base), + # and the value is the highest point in the expression tree where the expression + # was still within a single nest. + return result + + class NestedFrame(pd.DataFrame): """A Pandas Dataframe extension with support for nested structure. @@ -457,6 +517,39 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: kwargs["parser"] = "nested-pandas" return super().eval(expr, **kwargs) + def extract_nest_names( + self, + expr: str, + local_dict=None, + global_dict=None, + resolvers=(), + level: int = 0, + target=None, + **kwargs, + ) -> set[str]: + """ + Given a string expression, parse it and visit the resulting expression tree, + surfacing the nesting types. The purpose is to identify expressions that attempt + to mix base and nested columns, or columns from two different nests. 
+ """ + index_resolvers = self._get_index_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() + resolvers = resolvers + (_NestResolver(self), column_resolvers, index_resolvers) + # Parser needs to be the "nested-pandas" parser. + # We also need the same variable context that eval() will have, so that + # backtick-quoted names are substituted as expected. + env = ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + parsed_expr = Expr(expr, parser="nested-pandas", env=env) + expr_tree = parsed_expr.terms + separable = _subexprs_by_nest([], expr_tree) + return set(separable.keys()) + def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None: """ Query the columns of a NestedFrame with a boolean expression. Specified @@ -514,7 +607,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | # At present, the query expression must be either entirely within a # single nest, or have nothing but base columns. Mixed structures are not # supported, so preflight the expression. 
- nest_names = extract_nest_names(expr) + nest_names = self.extract_nest_names(expr, **kwargs) if len(nest_names) > 1: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") result = self.eval(expr, **kwargs) diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py deleted file mode 100644 index 4e56b5b..0000000 --- a/src/nested_pandas/nestedframe/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -# typing.Self and "|" union syntax don't exist in Python 3.9 -from __future__ import annotations - -import ast - - -def _subexprs_by_nest(parents: list[ast.expr], node: ast.expr | None) -> dict[str, list]: - """ - Given an expression which contains references to both base and nested - columns, return a dictionary of the sub-expressions that should be - evaluated independently, keyed by nesting context. - - The key of the dictionary is the name of the nested column, and will - be a blank string in the case of base columns. The value is a list - of the parent nodes that lead to sub-expressions that can be evaluated - successfully. - - While this is not in use today for automatically splitting expressions, - it can be used to detect whether an expression is suitably structured - for evaluation: the returned dictionary should have a single key. - """ - if not isinstance(node, ast.expr): - return {} - if isinstance(node, ast.Name): - return {"": parents} - if isinstance(node, ast.Attribute) and isinstance(node.value, ast.Name): - return {node.value.id: parents} - sources = ( - [getattr(node, "left", None), getattr(node, "right", None)] - + getattr(node, "values", []) - + getattr(node, "comparators", []) - ) - result: dict[str, list] = {} - for source in sources: - child = _subexprs_by_nest(parents, source) - for k, v in child.items(): - result.setdefault(k, []).append(v) - # After a complete traversal across sources, check for any necessary splits. - # If it's homogenous, move the split-node up the tree. 
- if len(result) == 1: - # Let the record of each parent node drift up the tree, - # and merge the subtrees into a single node, since by definition, - # this node is homogeneous over all of its children, and can - # be evaluated in a single step. - result = {k: [node] for k in result} - # If the result is either empty or has more than one key, leave the result - # alone. Each key represents a different nest (with a blank string for the base), - # and the value is the highest point in the expression tree where the expression - # was still within a single nest. - return result - - -def extract_nest_names(expr: str) -> set[str]: - """ - Given a string expression, parse it and visit the resulting AST, surfacing - the nesting types. The purpose is to identify expressions that attempt - to mix base and nested columns, or columns from two different nests. - """ - expr_tree = ast.parse(expr, mode="eval").body - separable = _subexprs_by_nest([], expr_tree) - return set(separable.keys()) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 938476f..252386e 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -594,6 +594,25 @@ def test_query(): assert base["nested.d"].shape == (2,) +def test_query_on_non_identifier_columns(): + """ + Column names very often follow the same rules as Python identifiers, but + they are not required to. Test that query() can handle such names. 
+ """ + # Taken from GH#174 + nf = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2]) + nested = pd.DataFrame( + data={"a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "b": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + nf = nf.add_nested(nested, "bad dog") + nf2 = nf.query("`good dog` > 3") + assert nf.shape == (3, 3) + assert nf2.shape == (2, 3) + nf3 = nf.query("`bad dog`.a > 2") + assert nf3["bad dog"].nest["a"].size == 4 + + def test_dropna(): """Test that dropna works on all layers""" diff --git a/tests/nested_pandas/utils/test_utils.py b/tests/nested_pandas/utils/test_utils.py index e2ec526..5c1abc9 100644 --- a/tests/nested_pandas/utils/test_utils.py +++ b/tests/nested_pandas/utils/test_utils.py @@ -2,7 +2,6 @@ import pandas as pd import pytest from nested_pandas import NestedFrame -from nested_pandas.nestedframe.utils import extract_nest_names from nested_pandas.utils import count_nested @@ -52,16 +51,41 @@ def test_check_expr_nesting(): used to ensure that an expression-based query does not try to combine base and nested sub-expressions. 
""" - assert extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"} - assert extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"} - assert extract_nest_names("-1.52e-5 < abc < 35.2e2") == {""} - assert extract_nest_names("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""} + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2]) + nested = pd.DataFrame( + data={ + "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], + "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], + "label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"], + }, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + b1 = base.add_nested(nested, "nested") + assert b1.extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"} + assert b1.extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"} + assert b1.extract_nest_names("-1.52e-5 < b < 35.2e2") == {""} + + b2 = base.add_nested(nested.copy(), "n") + assert b2.extract_nest_names("(n.c > 1) and ((b + a) > (b - 1e-8)) or n.d > a") == {"n", ""} + + abc = pd.DataFrame( + data={ + "c": [3, 1, 4, 1, 5, 9, 2, 6, 5], + "d": [1, 4, 1, 2, 1, 3, 5, 6, 2], + "g": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], + }, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + b3 = base.add_nested(abc, "abc").add_nested(abc, "c") + assert b3.extract_nest_names("abc.c > 2 & c.d < 5") == {"abc", "c"} + + assert b3.extract_nest_names("(abc.d > 3) & (abc.c == [2, 5])") == {"abc"} + assert b3.extract_nest_names("(abc.d > 3)&(abc.g == 'f')") == {"abc"} + assert b3.extract_nest_names("(abc.d > 3) & (abc.g == 'f')") == {"abc"} - assert extract_nest_names("a.b > 2 & c.d < 5") == {"a", "c"} + assert b1.extract_nest_names("a>3") == {""} + assert b1.extract_nest_names("a > 3") == {""} - assert extract_nest_names("a>3") == {""} - assert extract_nest_names("a > 3") == {""} - assert extract_nest_names("test.a>5&b==2") == {"test", ""} - assert extract_nest_names("test.a > 5 & b == 2") == {"test", ""} - assert extract_nest_names("(a.b > 3)&(a.c == 'f')") == {"a"} - 
assert extract_nest_names("(a.b > 3) & (a.c == 'f')") == {"a"} + b4 = base.add_nested(nested, "test") + assert b4.extract_nest_names("test.c>5&b==2") == {"test", ""} + assert b4.extract_nest_names("test.c > 5 & b == 2") == {"test", ""}