Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update decision tree to include an option for handling masked points #2035

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,12 @@ accessed with this key contains the essentials that make the node function.
must be exceeded or not exceeded (see threshold_condition) for the node to
progress to the succeed target. Two values required if condition_combination
is being used.
- **threshold_condition** (str): Defines the inequality test to be applied to
- **threshold_condition** (str or list(str)): Defines the inequality test to be applied to
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should have a unit test that demonstrates this new behaviour explicitly, rather than via the if_masked functionality.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've separated the masked data tests into their own test rather than parameterising it which I think is what you were asking for?

the probability threshold(s). Inequalities that can be used are "<=", "<",
">", ">=".
">", ">=". If multiple tests are being applied in a single node, this can be
a list of the same length as the probability_thresholds list with each element
defining the inequality for the corresponding threshold. Alternatively a single
inequality can be applied to all thresholds.
- **condition_combination** (str): If multiple tests are being applied in a
single node, this value determines the logic with which they are combined.
The values can be "AND", "OR".
Expand Down Expand Up @@ -128,6 +131,14 @@ following additional keys:
- **deterministic** (boolean): Determines whether the node is expecting a deterministic
input.

Additionally a node can be set up to handle masked points in the diagnostic_fields. By default if
an input cube has masked points the decision tree will return a masked point.
However an additional key can be added to the node to specify what path to take if the input cube
has masked points. This key is:

- **if_masked** (str,optional): The next node if the input cube has masked points. This can
be the same as the "if_true" or "if_false" keys or can be a different node.

The first leaf node above is encoded as follows::

{
Expand Down
67 changes: 50 additions & 17 deletions improver/categorical/decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,9 +378,17 @@ def create_condition_chain(self, test_conditions: Dict) -> List:
coord = "thresholds"
else:
coord = "probability_thresholds"
if isinstance(test_conditions["threshold_condition"], str):
comparator = [test_conditions["threshold_condition"]] * len(
test_conditions[coord]
)
else:
comparator = test_conditions["threshold_condition"]

for index, (diagnostic, p_threshold) in enumerate(
zip(test_conditions["diagnostic_fields"], test_conditions[coord])
for index, (diagnostic, comp, p_threshold) in enumerate(
zip(
test_conditions["diagnostic_fields"], comparator, test_conditions[coord]
)
):

d_threshold = test_conditions.get("diagnostic_thresholds")
Expand Down Expand Up @@ -416,13 +424,7 @@ def create_condition_chain(self, test_conditions: Dict) -> List:
)
else:
extract_constraint = iris.Constraint(diagnostic)
conditions.append(
[
extract_constraint,
test_conditions["threshold_condition"],
p_threshold,
]
)
conditions.append([extract_constraint, comp, p_threshold])
condition_chain = [conditions, test_conditions["condition_combination"]]
return condition_chain

Expand Down Expand Up @@ -639,28 +641,36 @@ def compare_array_to_threshold(
Args:
arr
comparator:
One of '<', '>', '<=', '>='.
One of '<', '>', '<=', '>=','is_masked'.
threshold

Returns:
Array of booleans.

Raises:
ValueError: If comparator is not one of '<', '>', '<=', '>='.
ValueError: If comparator is not one of '<', '>', '<=', '>=','is_masked'.
"""
if comparator == "<":
return arr < threshold
result = arr < threshold
elif comparator == ">":
return arr > threshold
result = arr > threshold
elif comparator == "<=":
return arr <= threshold
result = arr <= threshold
elif comparator == ">=":
return arr >= threshold
result = arr >= threshold
elif comparator == "is_masked":
if np.ma.is_masked(arr):
result = arr.mask
else:
result = arr != arr
else:
raise ValueError(
f"Invalid comparator: {comparator}. "
"Comparator must be one of '<', '>', '<=', '>='."
"Comparator must be one of '<', '>', '<=', '>=','is_masked'."
)
mspelman07 marked this conversation as resolved.
Show resolved Hide resolved
if np.ma.is_masked(result):
result[result.mask] = False
return result

def evaluate_extract_expression(
self, cubes: CubeList, expression: Union[Constraint, List]
Expand Down Expand Up @@ -818,7 +828,11 @@ def process(self, *cubes: Union[Cube, CubeList]) -> Cube:
graph = {
key: [self.queries[key]["leaf"]]
if "leaf" in self.queries[key].keys()
else [self.queries[key]["if_true"], self.queries[key]["if_false"]]
else [
self.queries[key]["if_true"],
self.queries[key]["if_false"],
self.queries[key].get("if_masked"),
]
for key in self.queries
}
# Search through tree for all leaves (category end points)
Expand Down Expand Up @@ -847,6 +861,25 @@ def process(self, *cubes: Union[Cube, CubeList]) -> Cube:
current["threshold_condition"],
current["condition_combination"],
) = self.invert_condition(current)
if current.get("if_masked") == next_node and current.get(
"if_masked"
) not in [current.get("if_false"), current.get("if_true")]:
# if if_masked is not the same as if_false or if_true, then
# it is a separate branch of the tree and we need to replace
# the condition.
current["diagnostic_fields"] = current["diagnostic_fields"]
current["threshold_condition"] = "is_masked"
current["condition_combination"] = ""
elif current.get("if_masked") == next_node:
# if masked is the same as if_false or if_true, then we need
# to add the masked condition to the existing condition.
current["diagnostic_fields"] = current["diagnostic_fields"] * 2
current["threshold_condition"] = [
current["threshold_condition"],
"is_masked",
]
current["condition_combination"] = "OR"
current["thresholds"] = current["thresholds"] * 2
if "leaf" not in current.keys():
conditions.append(self.create_condition_chain(current))
test_chain = [conditions, "AND"]
Expand Down
9 changes: 7 additions & 2 deletions improver/categorical/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
LEAF_OPTIONAL_KEY_WORDS = ["if_night", "is_unreachable", "group"]

OPTIONAL_KEY_WORDS = [
"if_masked",
"if_diagnostic_missing",
"deterministic",
"diagnostic_thresholds",
Expand Down Expand Up @@ -493,9 +494,13 @@ def check_tree(
f"Node {node} uses invalid threshold condition {threshold}"
)

# Check the succeed and fail destinations are valid; that is a valid
# Check the succeed fail and masked destinations are valid; that is a valid
# category for leaf nodes, and other tree nodes otherwise
for result in "if_true", "if_false":
if items.get("if_masked"):
nodes = ["if_true", "if_false", "if_masked"]
else:
nodes = ["if_true", "if_false"]
for result in nodes:
value = items[result]
if isinstance(value, str):
if value not in decision_tree.keys():
Expand Down
23 changes: 22 additions & 1 deletion improver_tests/categorical/decision_tree/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,14 +569,34 @@ def deterministic_diagnostic_tree() -> Dict[str, Dict[str, Any]]:
queries = {
"meta": {"name": "precipitation_type"},
"precip_rate": {
"if_true": "hail_rate",
"if_true": "cloud_top_temp",
"if_false": "dry",
"thresholds": [0],
"threshold_condition": ">",
"condition_combination": "",
"diagnostic_fields": ["precipitation_rate"],
"deterministic": True,
},
"cloud_top_temp": {
"if_true": "cloud_base_temp",
"if_false": "snow",
"if_masked": "dry",
"thresholds": [258.15],
"threshold_condition": "<=",
"condition_combination": "",
"diagnostic_fields": ["cloud_top_temperature"],
"deterministic": True,
},
"cloud_base_temp": {
"if_true": "hail_rate",
"if_false": "snow",
"if_masked": "hail_rate",
"thresholds": [268.15],
"threshold_condition": ">=",
"condition_combination": "",
"diagnostic_fields": ["cloud_base_temperature"],
"deterministic": True,
},
"hail_rate": {
"if_true": "hail",
"if_false": "rain",
Expand All @@ -589,6 +609,7 @@ def deterministic_diagnostic_tree() -> Dict[str, Dict[str, Any]]:
"dry": {"leaf": 0},
"rain": {"leaf": 1},
"hail": {"leaf": 2},
"snow": {"leaf": 3},
}

return queries
79 changes: 75 additions & 4 deletions improver_tests/categorical/decision_tree/test_ApplyDecisionTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,20 +81,90 @@ def hail_cube() -> Cube:
return cube


@pytest.fixture()
def cloud_top_temp() -> Cube:
"""
Set up a cloud top temperature, deterministic cube.
"""
data = np.full((7, 3), dtype=np.float32, fill_value=250)
cube = set_up_variable_cube(
data,
name="cloud_top_temperature",
units="K",
time=dt(2017, 10, 10, 12, 0),
frt=dt(2017, 10, 10, 11, 0),
)
return cube


@pytest.fixture()
def cloud_base_temp() -> Cube:
"""
Set up a cloud base temperature, deterministic cube.
"""
data = np.full((7, 3), dtype=np.float32, fill_value=270)
cube = set_up_variable_cube(
data,
name="cloud_base_temperature",
units="K",
time=dt(2017, 10, 10, 12, 0),
frt=dt(2017, 10, 10, 11, 0),
)
return cube


@pytest.mark.parametrize(
"masked_data", ("no_mask", "mask_precip", "mask_cloud_top", "mask_cloud_base")
)
@pytest.mark.parametrize(
"precip_fill,hail_fill,expected", ((1, 1, 2), (1, 0, 1), (0, 0, 0), (0, 1, 0))
)
def test_non_probablistic_tree(
precip_cube, hail_cube, precip_fill, hail_fill, expected
precip_cube,
hail_cube,
cloud_top_temp,
cloud_base_temp,
precip_fill,
hail_fill,
expected,
masked_data,
):
"""Test that ApplyDecisionTree correctly manages a decision tree with deterministic
inputs"""
inputs. Also check that if data is masked the output will be masked unless the
node includes an if_masked setting."""

expected_array = np.full_like(precip_cube.data, expected)

if masked_data == "mask_precip":
# Test that if there is no if_masked setting in the decision tree, the output
# will be masked
precip_mask = np.full_like(precip_cube.data, False)
precip_mask[0, 0] = True
precip_cube.data = np.ma.masked_array(precip_cube.data, mask=precip_mask)

expected_mask = np.full_like(precip_cube.data, False)
expected_mask[0, 0] = True
expected = np.ma.masked_array(expected_array, mask=expected_mask)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is suspicious. The test at the end looks at expected_array and yet this test still passes. This problem is hidden by another problem in the test at the end - see later comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated the decision tree slightly so the if masked will go to false rather than true which would stop this test passing.

elif masked_data == "mask_cloud_top":
# Testing if there is an if_masked setting in the decision tree going to a
# different node than if_true or if_false
cloud_mask = np.full_like(cloud_top_temp.data, False)
cloud_mask[0, 0] = True
cloud_top_temp.data = np.ma.masked_array(cloud_top_temp.data, mask=cloud_mask)
expected_array[0, 0] = 0
elif masked_data == "mask_cloud_base":
# Testing if there is an if_masked setting in the decision tree going to
# the same node as if_true
cloud_mask = np.full_like(cloud_base_temp.data, False)
cloud_mask[0, 0] = True
cloud_base_temp.data = np.ma.masked_array(cloud_base_temp.data, mask=cloud_mask)

precip_cube.data.fill(precip_fill)
hail_cube.data.fill(hail_fill)
result = ApplyDecisionTree(decision_tree=deterministic_diagnostic_tree())(
iris.cube.CubeList([precip_cube, hail_cube])
iris.cube.CubeList([precip_cube, hail_cube, cloud_top_temp, cloud_base_temp])
)
assert np.all(result.data == expected)
assert np.all(result.data == expected_array)
MoseleyS marked this conversation as resolved.
Show resolved Hide resolved


def test_non_probablistic_tree_missing_data(hail_cube):
Expand Down Expand Up @@ -554,6 +624,7 @@ def test_basic(self):
self.assertIsInstance(result, list)
self.assertEqual(len(result), 2)
self.assertIsInstance(result[0], list)

for i in range(2):
constraint_exp = expected[0][i][0]
constraint_res = result[0][i][0]
Expand Down
2 changes: 2 additions & 0 deletions improver_tests/categorical/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,8 @@ def test_interrogate_decision_tree_accumulation_3h():
def test_interrogate_decision_tree_deterministic():
"""Test that the function returns the right strings."""
expected = (
"\u26C5 cloud_base_temperature (deterministic)\n"
"\u26C5 cloud_top_temperature (deterministic)\n"
"\u26C5 hail_rate (deterministic)\n"
"\u26C5 precipitation_rate (deterministic)\n"
)
Expand Down
Loading