From 08f4c3c59ecb65408c8a17fed48663363e69883b Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 28 Feb 2024 21:55:28 +0100 Subject: [PATCH 1/5] Added to_feature_generator to ColumnGenerator --- src/sensai/columngen.py | 22 ++++++++++++++++++++++ src/sensai/data_transformation/dft.py | 5 +++-- src/sensai/featuregen/feature_generator.py | 7 ++++--- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/sensai/columngen.py b/src/sensai/columngen.py index 0530c13f..b2b6588f 100644 --- a/src/sensai/columngen.py +++ b/src/sensai/columngen.py @@ -5,6 +5,8 @@ import numpy as np import pandas as pd +from .data_transformation import DFTNormalisation +from .featuregen import FeatureGeneratorFromColumnGenerator from .util.cache import PersistentKeyValueCache @@ -45,6 +47,26 @@ def _generate_column(self, df: pd.DataFrame) -> Union[pd.Series, list, np.ndarra """ pass + def to_feature_generator(self, + take_input_column_if_present: bool = False, + normalisation_rule_template: DFTNormalisation.RuleTemplate = None, + is_categorical: bool = False): + """ + Transforms this column generator into a feature generator that can be used as part of a VectorModel. + + :param take_input_column_if_present: if True, then if a column whose name corresponds to the column to generate exists + in the input data, simply copy it to generate the output (without using the column generator); if False, always + apply the columnGen to generate the output + :param is_categorical: whether the resulting column is categorical + :param normalisation_rule_template: template for a DFTNormalisation for the resulting column. + This should only be provided if is_categorical is False + :return: + """ + return FeatureGeneratorFromColumnGenerator(self, + take_input_column_if_present=take_input_column_if_present, + normalisation_rule_template=normalisation_rule_template, + is_categorical=is_categorical) + class IndexCachedColumnGenerator(ColumnGenerator): """ diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index cfa75f80..13ed1743 100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -10,7 +10,6 @@ from sklearn.preprocessing import OneHotEncoder from .sklearn_transformer import SkLearnTransformerProtocol -from ..columngen import ColumnGenerator from ..util import flatten_arguments, count_not_none from ..util.pandas import DataFrameColumnChangeTracker from ..util.pickle import setstate @@ -22,6 +21,8 @@ if TYPE_CHECKING: from ..featuregen import FeatureGenerator + from ..columngen import ColumnGenerator + log = logging.getLogger(__name__) @@ -749,7 +750,7 @@ class DFTFromColumnGenerators(RuleBasedDataFrameTransformer): """ Extends a data frame with columns generated from ColumnGenerator instances """ - def __init__(self, column_generators: Sequence[ColumnGenerator], inplace=False): + def __init__(self, column_generators: Sequence['ColumnGenerator'], inplace=False): super().__init__() self.columnGenerators = column_generators self.inplace = inplace diff --git a/src/sensai/featuregen/feature_generator.py b/src/sensai/featuregen/feature_generator.py index efa28fb7..d4fadbf0 100644 --- a/src/sensai/featuregen/feature_generator.py +++ b/src/sensai/featuregen/feature_generator.py @@ -8,7 +8,6 @@ import pandas as pd from .. import util, data_transformation -from ..columngen import ColumnGenerator from ..data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer from ..util import flatten_arguments from ..util.string import or_regex_group, ToStringMixin, list_string @@ -16,6 +15,8 @@ if TYPE_CHECKING: from ..vector_model import VectorModel + from ..columngen import ColumnGenerator + log = logging.getLogger(__name__) @@ -532,7 +533,7 @@ class FeatureGeneratorFromColumnGenerator(RuleBasedFeatureGenerator): """ log = log.getChild(__qualname__) - def __init__(self, column_gen: ColumnGenerator, take_input_column_if_present=False, is_categorical=False, + def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=False, is_categorical=False, normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None): """ :param column_gen: the underlying column generator @@ -541,7 +542,7 @@ def __init__(self, column_gen: ColumnGenerator, take_input_column_if_present=Fal apply the columnGen to generate the output :param is_categorical: whether the resulting column is categorical :param normalisation_rule_template: template for a DFTNormalisation for the resulting column. - This should only be provided if isCategorical is False + This should only be provided if is_categorical is False """ if is_categorical and normalisation_rule_template is not None: raise ValueError(f"normalisationRuleTemplate should be None when the generated column is categorical") From 3711c621debb57e34371e713fb7b16db0a9e18cd Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 28 Feb 2024 22:12:05 +0100 Subject: [PATCH 2/5] Added possibility to retrieve default evaluation metrics --- .../eval_stats/eval_stats_classification.py | 18 ++++++++++++------ .../eval_stats/eval_stats_regression.py | 12 +++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/sensai/evaluation/eval_stats/eval_stats_classification.py b/src/sensai/evaluation/eval_stats/eval_stats_classification.py index 149c4971..a37434b9 100644 --- a/src/sensai/evaluation/eval_stats/eval_stats_classification.py +++ b/src/sensai/evaluation/eval_stats/eval_stats_classification.py @@ -326,6 +326,15 @@ def _compute_value(self, y_true, y_predicted, y_predicted_class_probabilities): return f if f is not None else self.zero_value +DEFAULT_MULTICLASS_CLASSIFICATION_METRICS = (ClassificationMetricAccuracy(), ClassificationMetricBalancedAccuracy(), + ClassificationMetricGeometricMeanOfTrueClassProbability()) + + +def create_default_binary_classification_metrics(positive_class_label: Any) -> List[BinaryClassificationMetric]: + return [BinaryClassificationMetricPrecision(positive_class_label), BinaryClassificationMetricRecall(positive_class_label), + BinaryClassificationMetricF1Score(positive_class_label)] + + class ClassificationEvalStats(PredictionEvalStats["ClassificationMetric"]): def __init__(self, y_predicted: PredictionArray = None, y_true: PredictionArray = None, @@ -340,6 +349,7 @@ def __init__(self, y_predicted: PredictionArray = None, :param y_predicted_class_probabilities: a data frame whose columns are the class labels and whose values are probabilities :param labels: the list of class labels :param metrics: the metrics to compute for evaluation; if None, use default metrics + (see DEFAULT_MULTICLASS_CLASSIFICATION_METRICS and :func:`create_default_binary_classification_metrics`) :param additional_metrics: the metrics to additionally compute :param binary_positive_label: the label of the positive class for the case where it is a binary classification, adding further binary metrics by default; @@ -381,13 +391,9 @@ def __init__(self, y_predicted: PredictionArray = None, self.is_binary = binary_positive_label is not None if metrics is None: - metrics = [ClassificationMetricAccuracy(), ClassificationMetricBalancedAccuracy(), - ClassificationMetricGeometricMeanOfTrueClassProbability()] + metrics = list(DEFAULT_MULTICLASS_CLASSIFICATION_METRICS) if self.is_binary: - metrics.extend([ - BinaryClassificationMetricPrecision(self.binary_positive_label), - BinaryClassificationMetricRecall(self.binary_positive_label), - BinaryClassificationMetricF1Score(self.binary_positive_label)]) + metrics.extend(create_default_binary_classification_metrics(self.binary_positive_label)) metrics = list(metrics) if additional_metrics is not None: diff --git a/src/sensai/evaluation/eval_stats/eval_stats_regression.py b/src/sensai/evaluation/eval_stats/eval_stats_regression.py index 91e8b1db..29b2e26e 100644 --- a/src/sensai/evaluation/eval_stats/eval_stats_regression.py +++ b/src/sensai/evaluation/eval_stats/eval_stats_regression.py @@ -112,6 +112,10 @@ def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: Vecto return np.median(cls.compute_abs_errors(y_true, y_predicted)) +DEFAULT_REGRESSION_METRICS = (RegressionMetricRRSE(), RegressionMetricR2(), RegressionMetricMAE(), + RegressionMetricMSE(), RegressionMetricRMSE(), RegressionMetricStdDevAE()) + + class RegressionEvalStats(PredictionEvalStats["RegressionMetric"]): """ Collects data for the evaluation of predicted continuous values and computes corresponding metrics @@ -126,21 +130,19 @@ class RegressionEvalStats(PredictionEvalStats["RegressionMetric"]): SCATTER_PLOT_POINT_COLOR = (0, 0, 1, 0.05) def __init__(self, y_predicted: Optional[PredictionArray] = None, y_true: Optional[PredictionArray] = None, - metrics: Sequence["RegressionMetric"] = None, additional_metrics: Sequence["RegressionMetric"] = None, + metrics: Optional[Sequence["RegressionMetric"]] = None, additional_metrics: Sequence["RegressionMetric"] = None, model: VectorRegressionModel = None, io_data: InputOutputData = None): """ :param y_predicted: the predicted values :param y_true: the true values - :param metrics: the metrics to compute for evaluation; if None, use default metrics + :param metrics: the metrics to compute for evaluation; if None, will use DEFAULT_REGRESSION_METRICS :param additional_metrics: the metrics to additionally compute """ self.model = model self.ioData = io_data if metrics is None: - metrics = [RegressionMetricRRSE(), RegressionMetricR2(), - RegressionMetricMAE(), RegressionMetricMSE(), RegressionMetricRMSE(), - RegressionMetricStdDevAE()] + metrics = DEFAULT_REGRESSION_METRICS metrics = list(metrics) super().__init__(y_predicted, y_true, metrics, additional_metrics=additional_metrics) From 35e8540ca9780f5d6a15d0e2300637812b13527e Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 28 Feb 2024 22:30:18 +0100 Subject: [PATCH 3/5] Added InMemoryKeyValueCache --- src/sensai/util/cache.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/sensai/util/cache.py b/src/sensai/util/cache.py index 16bbd512..cdfc9dbe 100644 --- a/src/sensai/util/cache.py +++ b/src/sensai/util/cache.py @@ -32,7 +32,7 @@ def __init__(self, value: TValue): self.value = value -class PersistentKeyValueCache(Generic[TKey, TValue], ABC): +class KeyValueCache(Generic[TKey, TValue], ABC): @abstractmethod def set(self, key: TKey, value: TValue): """ @@ -55,6 +55,40 @@ def get(self, key: TKey) -> Optional[TValue]: pass +class InMemoryKeyValueCache(KeyValueCache[TKey, TValue], Generic[TKey, TValue]): + """A simple in-memory cache (which uses a dictionary internally). + + This class can be instantiated directly, but for better typing support, one can instead + inherit from it and provide the types of the key and value as type arguments. For example for + a cache with string keys and integer values: + + .. code-block:: python + + class MyCache(InMemoryKeyValueCache[str, int]): + pass + """ + def __init__(self): + self.cache = {} + + def set(self, key: TKey, value: TValue): + self.cache[key] = value + + def get(self, key: TKey) -> Optional[TValue]: + return self.cache.get(key) + + def empty(self): + self.cache = {} + + def __len__(self): + return len(self.cache) + + + +# mainly kept as a marker and for backwards compatibility, but may be extended in the future +class PersistentKeyValueCache(KeyValueCache[TKey, TValue], Generic[TKey, TValue], ABC): + pass + + class PersistentList(Generic[TValue], ABC): @abstractmethod def append(self, item: TValue): From d1844b0f1a8facdb9eba783430c26dab2fef6073 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 28 Feb 2024 22:30:18 +0100 Subject: [PATCH 4/5] Fix merge conflicts at cherry-pick, minor typing improvements --- src/sensai/columngen.py | 6 +++--- src/sensai/distance_metric.py | 15 ++++++++------- src/sensai/featuregen/feature_generator.py | 2 +- src/sensai/nearest_neighbors.py | 2 +- src/sensai/util/cache.py | 4 ++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/sensai/columngen.py b/src/sensai/columngen.py index b2b6588f..dabc6907 100644 --- a/src/sensai/columngen.py +++ b/src/sensai/columngen.py @@ -7,7 +7,7 @@ from .data_transformation import DFTNormalisation from .featuregen import FeatureGeneratorFromColumnGenerator -from .util.cache import PersistentKeyValueCache +from .util.cache import KeyValueCache log = logging.getLogger(__name__) @@ -79,7 +79,7 @@ class IndexCachedColumnGenerator(ColumnGenerator): log = log.getChild(__qualname__) - def __init__(self, column_generator: ColumnGenerator, cache: PersistentKeyValueCache): + def __init__(self, column_generator: ColumnGenerator, cache: KeyValueCache): """ :param column_generator: the column generator with which to generate values for keys not found in the cache :param cache: the cache in which to store key-value pairs @@ -114,7 +114,7 @@ class ColumnGeneratorCachedByIndex(ColumnGenerator, ABC): log = log.getChild(__qualname__) - def __init__(self, generated_column_name: str, cache: Optional[PersistentKeyValueCache], persist_cache=False): + def __init__(self, generated_column_name: str, cache: Optional[KeyValueCache], persist_cache=False): """ :param generated_column_name: the name of the column being generated :param cache: the cache in which to store key-value pairs. If None, caching will be disabled diff --git a/src/sensai/distance_metric.py b/src/sensai/distance_metric.py index 97f0cb6b..fa13bc89 100644 --- a/src/sensai/distance_metric.py +++ b/src/sensai/distance_metric.py @@ -2,13 +2,13 @@ import math import os from abc import abstractmethod, ABC -from typing import Sequence, Tuple, List, Union +from typing import Generic, Sequence, Tuple, List, Union import numpy as np import pandas as pd from .util import cache -from .util.cache import DelayedUpdateHook +from .util.cache import DelayedUpdateHook, TValue from .util.string import object_repr from .util.typing import PandasNamedTuple @@ -42,8 +42,9 @@ def distance(self, named_tuple_a: PandasNamedTuple, named_tuple_b: PandasNamedTu return self._distance(value_a, value_b) -class DistanceMatrixDFCache(cache.PersistentKeyValueCache): - def __init__(self, pickle_path, save_on_update=True, deferred_save_delay_secs=1.0): +class DistanceMatrixDFCache(cache.PersistentKeyValueCache[Tuple[Union[str, int], Union[str, int]], TValue], Generic[TValue]): + """A cache for distance matrices, which are stored as dataframes with identifiers as both index and columns""" + def __init__(self, pickle_path: str, save_on_update: bool = True, deferred_save_delay_secs: float = 1.0): self.deferred_save_delay_secs = deferred_save_delay_secs self.save_on_update = save_on_update self.pickle_path = pickle_path @@ -65,7 +66,7 @@ def shape(self): def _assert_tuple(key): assert isinstance(key, tuple) and len(key) == 2, f"Expected a tuple of two identifiers, instead got {key}" - def set(self, key: Tuple[Union[str, int], Union[str, int]], value): + def set(self, key: Tuple[Union[str, int], Union[str, int]], value: TValue): self._assert_tuple(key) for identifier in key: if identifier not in self.distance_df.columns: @@ -83,7 +84,7 @@ def save(self): os.makedirs(os.path.dirname(self.pickle_path), exist_ok=True) self.distance_df.to_pickle(self.pickle_path) - def get(self, key: Tuple[Union[str, int], Union[str, int]]): + def get(self, key: Tuple[Union[str, int], Union[str, int]]) -> TValue: self._assert_tuple(key) i1, i2 = key try: @@ -108,7 +109,7 @@ class CachedDistanceMetric(DistanceMetric, cache.CachedValueProviderMixin): value for the given pair of identifiers is not found within the persistent cache """ - def __init__(self, distance_metric: DistanceMetric, key_value_cache: cache.PersistentKeyValueCache, persist_cache=False): + def __init__(self, distance_metric: DistanceMetric, key_value_cache: cache.KeyValueCache, persist_cache=False): cache.CachedValueProviderMixin.__init__(self, key_value_cache, persist_cache=persist_cache) self.metric = distance_metric diff --git a/src/sensai/featuregen/feature_generator.py b/src/sensai/featuregen/feature_generator.py index d4fadbf0..9ef89a0a 100644 --- a/src/sensai/featuregen/feature_generator.py +++ b/src/sensai/featuregen/feature_generator.py @@ -393,7 +393,7 @@ class FeatureGeneratorFromNamedTuples(FeatureGenerator, ABC): Generates feature values for one data point at a time, creating a dictionary with feature values from each named tuple """ - def __init__(self, cache: util.cache.PersistentKeyValueCache = None, categorical_feature_names: Sequence[str] = (), + def __init__(self, cache: util.cache.KeyValueCache = None, categorical_feature_names: Sequence[str] = (), normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (), normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None): super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules, diff --git a/src/sensai/nearest_neighbors.py b/src/sensai/nearest_neighbors.py index 62c1794c..af71c570 100644 --- a/src/sensai/nearest_neighbors.py +++ b/src/sensai/nearest_neighbors.py @@ -343,7 +343,7 @@ def __init__(self, num_neighbors: int, neighbor_attributes: typing.List[str], distance_metric: DistanceMetric, neighbor_provider_factory: typing.Callable[[pd.DataFrame], NeighborProvider] = AllNeighborsProvider, - cache: util.cache.PersistentKeyValueCache = None, + cache: util.cache.KeyValueCache = None, categorical_feature_names: typing.Sequence[str] = (), normalisation_rules: typing.Sequence[data_transformation.DFTNormalisation.Rule] = ()): """ diff --git a/src/sensai/util/cache.py b/src/sensai/util/cache.py index cdfc9dbe..38812441 100644 --- a/src/sensai/util/cache.py +++ b/src/sensai/util/cache.py @@ -570,8 +570,8 @@ class CachedValueProviderMixin(Generic[TKey, TValue, TData], ABC): Represents a value provider that can provide values associated with (hashable) keys via a cache or, if cached values are not yet present, by computing them. """ - def __init__(self, cache: Optional[PersistentKeyValueCache[TKey, TValue]] = None, - cache_factory: Optional[Callable[[], PersistentKeyValueCache[TKey, TValue]]] = None, persist_cache=False, box_values=False): + def __init__(self, cache: Optional[KeyValueCache[TKey, TValue]] = None, + cache_factory: Optional[Callable[[], KeyValueCache[TKey, TValue]]] = None, persist_cache=False, box_values=False): """ :param cache: the cache to use or None. If None, caching will be disabled :param cache_factory: a factory with which to create the cache (or recreate it after unpickling if `persistCache` is False, in which From 1776f6255c7bd645491121205f06e929335f62c6 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 28 Feb 2024 22:44:45 +0100 Subject: [PATCH 5/5] Minor simplification in DistanceMatrixDFCache --- src/sensai/distance_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sensai/distance_metric.py b/src/sensai/distance_metric.py index fa13bc89..1361ed41 100644 --- a/src/sensai/distance_metric.py +++ b/src/sensai/distance_metric.py @@ -92,7 +92,7 @@ def get(self, key: Tuple[Union[str, int], Union[str, int]]) -> TValue: except KeyError: return None result = self.distance_df.iloc[pos1, pos2] - if result is None or np.isnan(result): + if np.isnan(result): return None return result