From 17ef228424a1e3743bd56a4e756a884f1e9ac572 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Tue, 28 Nov 2023 16:22:21 +0100 Subject: [PATCH 1/9] Add script to generate VectorModel class hierarchy --- resources/print_vector_model_hierarchy.py | 56 +++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 resources/print_vector_model_hierarchy.py diff --git a/resources/print_vector_model_hierarchy.py b/resources/print_vector_model_hierarchy.py new file mode 100644 index 00000000..ac872ba1 --- /dev/null +++ b/resources/print_vector_model_hierarchy.py @@ -0,0 +1,56 @@ +from anytree import Node, RenderTree +import inspect + +from sensai import VectorRegressionModel, VectorModel, VectorClassificationModel + + +class ClassHierarchy: + def __init__(self, cls, skip_intermediate_abc=True, retained_intermediate_classes=()): + self.retained_intermediate_classes = retained_intermediate_classes + self.skip_intermediate_abc = skip_intermediate_abc + self.root = self._scan_subclasses(cls, None, True) + + @staticmethod + def _isabstract(cls): + return inspect.isabstract(cls) # or "Abstract" in cls.__name__ + + def _scan_subclasses(self, cls, parent, is_root): + skip_node = not is_root and self.skip_intermediate_abc \ + and self._isabstract(cls) \ + and cls not in self.retained_intermediate_classes + + if not skip_node: + node = Node(cls.__name__, parent=parent) + else: + node = parent + + subclasses = list(cls.__subclasses__()) + subclasses.sort(key=lambda x: x.__name__) + for subclass in subclasses: + self._scan_subclasses(subclass, node, False) + + return node + + def print(self): + for pre, _, node in RenderTree(self.root): + print("%s%s" % (pre, node.name)) + + +if __name__ == '__main__': + # import optional packages such that the classes will be included in the hierarchy + from sensai import nearest_neighbors + from sensai import xgboost + from sensai.util import mark_used + from sensai import torch as sensai_torch + from sensai import tensorflow as 
sensai_tf + from sensai import lightgbm as sensai_lgbm + from sensai.torch import torch_models + from sensai import sklearn_quantile + + mark_used(xgboost, nearest_neighbors, sensai_torch, sensai_tf, sensai_lgbm, torch_models, sklearn_quantile) + + h = ClassHierarchy(VectorModel, + skip_intermediate_abc=True, + retained_intermediate_classes=(VectorRegressionModel, VectorClassificationModel)) + h.print() + From 2e94dece2011ca20f2f7f49bab2e0f73471c0f51 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Wed, 29 Nov 2023 17:26:59 +0100 Subject: [PATCH 2/9] Add script with code snippets for README --- resources/readme_code_snippets.py | 172 ++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 resources/readme_code_snippets.py diff --git a/resources/readme_code_snippets.py b/resources/readme_code_snippets.py new file mode 100644 index 00000000..fdbe6614 --- /dev/null +++ b/resources/readme_code_snippets.py @@ -0,0 +1,172 @@ +import math +from enum import Enum +from typing import Dict, Any +import os + +from sklearn.preprocessing import StandardScaler + +from sensai import InputOutputData +from sensai.data import DataSplitterFractional +from sensai.data_transformation import DFTNormalisation, SkLearnTransformerFactoryFactory, DFTSkLearnTransformer +from sensai.evaluation import RegressionModelEvaluation, RegressionEvaluatorParams +from sensai.featuregen import MultiFeatureGenerator, FeatureGeneratorRegistry +from sensai.sklearn.sklearn_regression import SkLearnLinearRegressionVectorRegressionModel +from sensai.torch.torch_models.residualffn.residualffn_models import ResidualFeedForwardNetworkVectorRegressionModel +from sensai.tracking.mlflow_tracking import MLFlowExperiment +from sensai.util import logging +from sensai.util.io import ResultWriter +from sensai.util.logging import datetime_tag +from sensai.xgboost import XGBRandomForestVectorRegressionModel + +import random +import pandas as pd + +# feature generators example + +from 
sensai.featuregen import FeatureGeneratorMapColumn, FeatureGeneratorMapColumnDict, \ + FeatureGeneratorTakeColumns + +class FeatureGeneratorTemperature(FeatureGeneratorTakeColumns): + """ + Takes the input column "temperature" without modifications, adding meta-information + on how to normalize/scale the feature (using StandardScaler) + """ + def __init__(self): + super().__init__("temperature", + normalisation_rule_template=DFTNormalisation.RuleTemplate( + transformer_factory=SkLearnTransformerFactoryFactory.StandardScaler())) + + +class FeatureGeneratorWeekday(FeatureGeneratorMapColumn): + """ + Creates the categorical feature "weekday" (integer from 0=Monday to 6=Sunday) + from the "timestamp" column + """ + def __init__(self): + super().__init__(input_col_name="timestamp", feature_col_name="weekday", + categorical_feature_names="weekday") + + def _create_value(self, timestamp: pd.Timestamp): + return timestamp.weekday() + + +class FeatureGeneratorTimeOfDayCircular(FeatureGeneratorMapColumnDict): + """ + From the "timestamp" column, creates two features "time_of_day_x" and + "time_of_day_y", which correspond to the locations on the unit circle + that the hour hand of a 24-hour clock would point to + """ + def __init__(self): + super().__init__(input_col_name="timestamp", + normalisation_rule_template=DFTNormalisation.RuleTemplate(skip=True)) + + def _create_features_dict(self, timestamp: pd.Timestamp) -> Dict[str, Any]: + time_of_day_norm = (timestamp.hour + timestamp.minute / 60) / 24 + alpha = math.pi / 2 - time_of_day_norm * 2 * math.pi + return dict(time_of_day_x=math.cos(alpha), time_of_day_y=math.sin(alpha)) + + +class FeatureName(Enum): + TEMPERATURE = "temperature" + WEEKDAY = "weekday" + TIME_OF_DAY_CIRC = "time_circ" + + +registry = FeatureGeneratorRegistry() +registry.register_factory(FeatureName.TEMPERATURE, FeatureGeneratorTemperature) +registry.register_factory(FeatureName.WEEKDAY, FeatureGeneratorWeekday) 
+registry.register_factory(FeatureName.TIME_OF_DAY_CIRC, FeatureGeneratorTimeOfDayCircular) + + + +if __name__ == '__main__': + logging.configure() + + num_points = 200 + + jan_2023 = 1672531200 + timestamps = [jan_2023, jan_2023+6*3600, jan_2023+12*3600, jan_2023+18*3600] + for i in range(num_points): + timestamps.append(jan_2023 + random.randint(0, 24*3600)) + + temperatures = [20 + random.random() * 3 for _ in timestamps] + + df = pd.DataFrame({ + "timestamp": [pd.Timestamp(t, unit="s") for t in timestamps], + "temperature": temperatures + }) + + targets = [] + for t in df.itertuples(): + ts: pd.Timestamp = t.timestamp + result = 0 + if ts.hour >= 6 and ts.hour <= 16: + result = t.temperature + else: + result = t.temperature - 2 + targets.append(result) + + df["target"] = targets + + fg = MultiFeatureGenerator( + FeatureGeneratorWeekday(), + FeatureGeneratorTimeOfDayCircular(), + FeatureGeneratorTemperature()) + feature_df = fg.generate(df) + + + # DFT example + + feature_coll = registry.collect_features(*list(FeatureName)) + + dft_normalization = feature_coll.create_dft_normalisation() + dft_one_hot_encoder = feature_coll.create_dft_one_hot_encoder() + + + # model example + + feature_coll = registry.collect_features(*list(FeatureName)) + + model_xgb = XGBRandomForestVectorRegressionModel() \ + .with_name("XGBoost") \ + .with_feature_collector(feature_coll) \ + .with_feature_transformers( + feature_coll.create_dft_one_hot_encoder()) + model_linear = SkLearnLinearRegressionVectorRegressionModel() \ + .with_name("Linear") \ + .with_feature_collector(feature_coll) \ + .with_feature_transformers( + feature_coll.create_dft_one_hot_encoder()) + model_rffn = ResidualFeedForwardNetworkVectorRegressionModel( + hidden_dims=[10]*5, + cuda=False) \ + .with_name("RFFN") \ + .with_feature_collector(feature_coll) \ + .with_feature_transformers( + feature_coll.create_dft_one_hot_encoder(), + feature_coll.create_dft_normalisation()) \ + 
.with_target_transformer(DFTSkLearnTransformer(StandardScaler())) + + # evaluation example + + io_data = InputOutputData.from_data_frame(df, "target") + + ev = RegressionModelEvaluation(io_data, + RegressionEvaluatorParams(data_splitter=DataSplitterFractional(0.8))) + + ev.compare_models([model_xgb, model_linear, model_rffn]) + + # tracking example + + experiment_name = "MyRegressionExperiment" + run_id = datetime_tag() + + tracked_experiment = MLFlowExperiment(experiment_name, tracking_uri="", context_prefix=run_id + "_", + add_log_to_all_contexts=True) + + result_writer = ResultWriter(os.path.join("results", experiment_name, run_id)) + logging.add_file_logger(result_writer.path("log.txt")) + + ev.compare_models([model_xgb, model_linear, model_rffn], + tracked_experiment=tracked_experiment, + result_writer=result_writer) From d48961d9961606254b6fedbf8cebde2646c29385 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Tue, 28 Nov 2023 16:22:30 +0100 Subject: [PATCH 3/9] Extend presentation of sensAI features in README --- README.md | 429 ++++++++++++++++++++++++++++-- resources/readme_code_snippets.py | 7 + 2 files changed, 416 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 0ba19a60..f00e0697 100644 --- a/README.md +++ b/README.md @@ -16,31 +16,420 @@

-## About sensAI +# About sensAI -sensAI provides a framework for AI and machine learning applications, integrating industry-standard libraries and providing additional abstractions that facilitate rapid implementation, experimentation as well as deployment. +sensAI is a high-level AI toolkit with a specific focus on **rapid +experimentation** for machine learning applications. +Its basic interfaces are similar to sklearn's, yet we modularise data pipelines +without sacrificing semantics (retaining meta-information at every step) +and provide high-level interfaces for many canonical tasks, achieving largely +declarative semantics. -In particular, sensAI provides ... +Some of sensAI's key benefits are: -* **machine learning** methods - * **regression and classification** models - * unified interface to models and algorithms of other machine learning libraries, particularly **scikit-learn**, **PyTorch** and **TensorFlow** - * additional implementations of our own, e.g. for k-nearest neighbour models and naive Bayes models - * mechanisms for **feature generation**, which serve to decouple externally provided input data from the data that is actually required as input to particular models - * mechanisms for model-specific (input and output) **data transformation**, enabling, for example, convenient model-specific scaling/normalisation or encodings of features - * (parallelised) **hyper-parameter optimisation** methods - * **cloud-based tracking** of experimental results (with direct support for Microsoft Azure) -* **combinatorial optimisation** - * **stochastic local search** methods, including (adaptive) simulated annealing and parallel tempering -* general utilities, including ... - * extensive **caching mechanisms** (using SQLite, pickle and MySQL as backends) - * multi-processing tools, e.g. 
a debugger for pickle errors + * **A unifying interface to a wide variety of model classes across frameworks** -## Documentation + Apply the same principles to a wide variety of models, whether they are + neural networks, tree ensembles or non-parametric models – without + losing the ability to exploit each model's particular strengths. + + sensAI supports models based on PyTorch, scikit-learn, XGBoost and + other libraries out of the box. + Support for custom models can straightforwardly be established. + + * **Adaptive, composable data processing pipelines** + + Modularise data pre-processing steps and feature generation, representing + the properties of features explicitly. + * For each model, select a suitable subset of features, composing the + desired feature generators in order to obtain an initial + input pipeline. + * Transform the features into representations that are optimised for + the model at hand. + Some of the respective transformations can be automatically derived from + the properties associated with features, others can be manually + designed to exploit a model's specific capabilities (e.g. a tensor-based + representation of complex, non-tabular data for neural networks). + + * **Fully integrated solutions for canonical tasks** + + Do away with boilerplate by using high-level interfaces for model + evaluation, model selection or feature selection. + Log and track all relevant parameters and results along the way, + using file-based logging or tracking frameworks such as MLflow. + + * **Declarative semantics** + + Through its high level of abstraction, sensAI achieves largely + declarative semantics: Focus on what to do rather than on how to do it. + sensAI embraces object-oriented design in order to achieve this. 
+ +While sensAI's main focus is on supervised and unsupervised machine learning, +it also provides functionality for discrete optimisation and a wide range +of general-purpose utilities that are frequently required in AI applications. + +## Supervised Learning + +Many real-world tasks can be reduced to classification and regression problems, +and sensAI specifically caters to the needs of these problems by providing a +wide variety of concepts and abstractions that can render experimentation a +breeze. +We shall briefly review the most important ones in the following. + +sensAI's models use pandas DataFrames to represent data points. +Note that this does not limit the data to purely tabular data, as a field in a +data frame can hold arbitrarily complex data. +Yet the tabular case is, of course, the most common one. + +### Feature Generators + +A fundamental concept in sensAI is to introduce representations for features +that + * provide the logic for generating/extracting feature values from the original + data, decoupling externally provided input data from the data that is + suitable as input for various models + * hold metadata on the generated features in order to support flexible + downstream transformations. + +The fundamental abstraction for this is `FeatureGenerator`. +A `FeatureGenerator` takes as input a data frame and creates one or more features +from it, which a model is to take as input. + +To facilitate the definition of feature generators, sensAI provides a variety of +base classes that already cover the most common use cases. 
+Here are some examples built on base classes provided by sensAI: + +```python +from sensai.featuregen import FeatureGeneratorMapColumn, FeatureGeneratorMapColumnDict, \ + FeatureGeneratorTakeColumns + +class FeatureGeneratorTemperature(FeatureGeneratorTakeColumns): + """ + Takes the input column "temperature" without modifications, adding meta-information + on how to normalise/scale the feature (using StandardScaler) + """ + def __init__(self): + super().__init__("temperature", + normalisation_rule_template=DFTNormalisation.RuleTemplate( + transformer_factory=SkLearnTransformerFactoryFactory.StandardScaler())) + + +class FeatureGeneratorWeekday(FeatureGeneratorMapColumn): + """ + Creates the categorical feature "weekday" (integer from 0=Monday to 6=Sunday) + from the "timestamp" column, which is given as a pandas Timestamp object + """ + def __init__(self): + super().__init__(input_col_name="timestamp", feature_col_name="weekday", + categorical_feature_names="weekday") + + def _create_value(self, timestamp: pd.Timestamp): + return timestamp.weekday() + + +class FeatureGeneratorTimeOfDayCircular(FeatureGeneratorMapColumnDict): + """ + From the "timestamp" column, creates two features "time_of_day_x" and + "time_of_day_y", which correspond to the locations on the unit circle + that the hour hand of a 24-hour clock would point to + """ + def __init__(self): + super().__init__(input_col_name="timestamp", + normalisation_rule_template=DFTNormalisation.RuleTemplate(skip=True)) + + def _create_features_dict(self, timestamp: pd.Timestamp) -> Dict[str, Any]: + time_of_day_norm = (timestamp.hour + timestamp.minute / 60) / 24 + alpha = math.pi / 2 - time_of_day_norm * 2 * math.pi + return dict(time_of_day_x=math.cos(alpha), time_of_day_y=math.sin(alpha)) +``` + +:white_check_mark: **Modular features** + +Feature engineering can be crucial, especially in non-deep learning applications, +and crafting domain-specific feature generators will often be a critical task in 
+practice. + +:information_source: With every feature being represented explicitly as a +feature generator, +we can flexibly make use of them in models and choose the ones we would like to +apply for any given model. + +### Feature Generator Registry + +In order to simplify the definition of the set of features that a model is to +make use of, we add feature generators to a registry, allowing us to refer to +each feature generator by name. + +```python +registry = FeatureGeneratorRegistry() +registry.register_factory(FeatureName.TEMPERATURE, FeatureGeneratorTemperature) +registry.register_factory(FeatureName.WEEKDAY, FeatureGeneratorWeekday) +registry.register_factory(FeatureName.TIME_OF_DAY_CIRC, FeatureGeneratorTimeOfDayCircular) +``` + +Instead of plain string names, the use of an Enum (like `FeatureName` above) can +be helpful for added auto-completion support in your IDE. + +With such a registry, we can obtain any given set of features for use within +a model: + +```python +feature_collector = registry.collect_features(FeatureName.TEMPERATURE, FeatureName.WEEKDAY) +features_df = feature_collector.get_multi_feature_generator().generate(df) +``` + +:white_check_mark: **Composable feature pipelines** + +### (Model-Specific) Data Transformation + +Depending on the type of model, the representation of the input data may need to +be adapted. For instance, +some models can directly process arbitrarily represented categorical data, others +require an encoding. Some models can deal with arbitrary scales of numerical +data, others work best with normalised data. + +To handle this, sensAI provides the concept of a `DataFrameTransformer` +(DFT for short), which can be used to transform the data that is fed to a model +after feature generation. 
+The most common transformers can conveniently be derived directly from the +meta-data that is associated with features: + +```python +feature_coll = registry.collect_features(*list(FeatureName)) + +dft_normalisation = feature_coll.create_dft_normalisation() +dft_one_hot_encoder = feature_coll.create_dft_one_hot_encoder() +``` + +`DataFrameTransformers` serve three purposes in the context of sensAI models: + * to transform the data prior to feature generation + * to transform the data after feature generation + * to transform the prediction targets (in which case the transformation + must have an inverse) + +:information_source: By using different `DataFrameTransformers`, models can +flexibly use different feature and target representations. + +:white_check_mark: **Model-specific data representations** + +### Vector Models + +Because sensAI models operate on data frames and every row in a data frame +corresponds to a vector of data, the fundamental model class in sensAI is +called `VectorModel`. (Note that, in computer science, a *vector* can hold +arbitrary types of data.) + +A `VectorModel` can be flexibly configured and provides fundamental +functionality for the composition of model-specific data pipelines. 
+Here are three examples of model definitions: + +```python +feature_coll = registry.collect_features(*list(FeatureName)) + +model_xgb = XGBRandomForestVectorRegressionModel() \ + .with_name("XGBoost") \ + .with_feature_collector(feature_coll) \ + +model_linear = SkLearnLinearRegressionVectorRegressionModel() \ + .with_name("Linear") \ + .with_feature_collector(feature_coll) \ + .with_feature_transformers( + feature_coll.create_dft_one_hot_encoder()) + +model_rffn = ResidualFeedForwardNetworkVectorRegressionModel( + hidden_dims=[10]*5, + cuda=False) \ + .with_name("RFFN") \ + .with_feature_collector(feature_coll) \ + .with_feature_transformers( + feature_coll.create_dft_one_hot_encoder(), + feature_coll.create_dft_normalisation()) \ + .with_target_transformer(DFTSkLearnTransformer(StandardScaler())) +``` + +:white_check_mark: **Declarative model specifications** +:white_check_mark: **Composable data pipelines** + +Notice that the torch-based RFFN model uses some additional transformations +that the other models can do without. + +As already indicated above, sensAI comes with a variety of ready-to-use model implementations based on libraries such as scikit-learn, PyTorch and XGBoost. 
+Here's a part of the class hierarchy: + +``` +VectorModel +├── AveragingVectorRegressionModel +├── VectorClassificationModel +│ ├── AbstractSkLearnVectorClassificationModel +│ │ ├── LightGBMVectorClassificationModel +│ │ ├── SkLearnDecisionTreeVectorClassificationModel +│ │ ├── SkLearnKNeighborsVectorClassificationModel +│ │ ├── SkLearnLogisticRegressionVectorClassificationModel +│ │ ├── SkLearnMLPVectorClassificationModel +│ │ ├── SkLearnMultinomialNBVectorClassificationModel +│ │ ├── SkLearnRandomForestVectorClassificationModel +│ │ ├── SkLearnSVCVectorClassificationModel +│ │ ├── XGBGradientBoostedVectorClassificationModel +│ │ └── XGBRandomForestVectorClassificationModel +│ ├── CategoricalNaiveBayesVectorClassificationModel +│ ├── KNearestNeighboursClassificationModel +│ └── TorchVectorClassificationModel +│ ├── LSTNetworkVectorClassificationModel +│ └── MultiLayerPerceptronVectorClassificationModel +└── VectorRegressionModel + ├── AbstractSkLearnMultiDimVectorRegressionModel + │ ├── SkLearnKNeighborsVectorRegressionModel + │ ├── SkLearnLinearLassoRegressionVectorRegressionModel + │ ├── SkLearnLinearRegressionVectorRegressionModel + │ ├── SkLearnLinearRidgeRegressionVectorRegressionModel + │ ├── SkLearnLinearSVRVectorRegressionModel + │ ├── SkLearnMultiLayerPerceptronVectorRegressionModel + │ └── SkLearnSVRVectorRegressionModel + ├── AbstractSkLearnMultipleOneDimVectorRegressionModel + │ ├── LightGBMVectorRegressionModel + │ ├── SkLearnDecisionTreeVectorRegressionModel + │ ├── SkLearnDummyVectorRegressionModel + │ ├── SkLearnExtraTreesVectorRegressionModel + │ ├── SkLearnGradientBoostingVectorRegressionModel + │ ├── SkLearnRandomForestVectorRegressionModel + │ ├── XGBGradientBoostedVectorRegressionModel + │ └── XGBRandomForestVectorRegressionModel + ├── KNearestNeighboursRegressionModel + ├── KerasMultiLayerPerceptronVectorRegressionModel + └── TorchVectorRegressionModel + ├── MultiLayerPerceptronVectorRegressionModel + └── 
ResidualFeedForwardNetworkVectorRegressionModel +``` + +:information_source: The implementation of custom models is straightforward. + +Especially for neural network-based models, you'll usually want to define your +own model architectures. +sensAI's base classes for torch-based models provide many high-level +abstractions to facilitate the use of arbitrarily complex models (which +may require complex transformations of the original inputs into +tensor-based representations). See our tutorial on neural network models. + + +### Evaluation + +Evaluating the performance of models can be a chore. +sensAI's high-level evaluation classes severely cut down on the boilerplate, +allowing you to focus on what matters. + +```python +io_data = InputOutputData.from_data_frame(df, "target") + +ev = RegressionModelEvaluation(io_data, + RegressionEvaluatorParams(data_splitter=DataSplitterFractional(0.8))) + +ev.compare_models([model_xgb, model_linear, model_rffn]) +``` + +:white_check_mark: **Do away with boilerplate** + +The evaluation classes can be flexibly adapted to your needs. +You can inject evaluation metrics, mechanisms for the splitting of data, +apply cross-validation, create plots that visualize model performance, +compare model performance using multiple datasets, and much more. 
+ +:white_check_mark: **Retain flexibility** + +### Track Results + +sensAI supports two mechanisms for the tracking of results: + * Writing results directly to the file system + * Using a tracking framework such as MLflow + +Here's an example where we add both to our regression experiment: + +```python +sensai.util.logging.configure() + +experiment_name = "MyRegressionExperiment" +run_id = datetime_tag() + +# create experiment for tracking with MLflow +tracked_experiment = MLFlowExperiment(experiment_name, + tracking_uri="", + context_prefix=run_id + "_", + add_log_to_all_contexts=True) + +# create file system result writer and enable file logging +result_writer = ResultWriter(os.path.join("results", experiment_name, run_id)) +sensai.util.logging.add_file_logger(result_writer.path("log.txt")) + +# apply model evaluation with tracking enabled +ev.compare_models([model_xgb, model_linear, model_rffn], + tracked_experiment=tracked_experiment, + result_writer=result_writer) +``` + +:white_check_mark: **Appropriately persist results** + +### Feature and Model Selection + +sensAI provides convenient abstractions for hyperparameter optimisation, +feature selection and model selection. + +Through its modular design, sensAI's representations can also be efficiently +combined with other libraries that are specialised for such purposes, +e.g. hyperopt and optuna. + +### Peace of Mind + +sensAI developers are dedicated to providing long-term compatibility. +In contrast to other machine learning libraries, we do our best to retain +backward compatibility of newer versions of sensAI with persisted models +from older versions. + +We use semantic versioning to indicate source-level compatibility and +will indicate breaking changes in the change log. 
+ +:white_check_mark: Backward compatibility + +## Beyond Supervised Learning + +### Unsupervised Learning + +sensAI provides extensive support for **clustering** as well as specializations +and tools for the clustering of geographic coordinates. + +It also provides a very flexible implementation of *greedy agglomerative +clustering*, an algorithm which is very useful in practice but tends to +be overlooked. + +### Combinatorial Optimisation + +sensAI supports combinatorial optimisation via + + * **stochastic local search**, via implementations of + * simulated annealing + * parallel tempering + + Both algorithms support adaptive (i.e. data-driven), + probability-based temperature schedules, greatly facilitating + parametrisation in practice. + + * **constraint programming**, by providing utilities for formulating + and solving optimisation problems in MiniZinc + +### Utilities, Utilities, Utilities + +sensAI's `util` package contains a wide range of general utilities, including + * caching mechanisms (using SQLite, pickle and MySQL as backends) + * string conversion utilities (the `ToStringMixin` is incredibly flexible) + * data structures (e.g. for tree-map-style lookups) + * logging and profiling utilities + * I/O utilities + * multi-processing tools, e.g. a debugger for pickle errors + * etc. + +# Documentation Reference documentation and tutorials can be found [here](https://aai-institute.github.io/sensAI/docs/). -### Integrating sensAI into a Project +## Integrating sensAI into a Project sensAI may be integrated into your project in several ways: @@ -52,7 +441,7 @@ sensAI may be integrated into your project in several ways: See developer documentation in README-dev.md for details on how synchronisation works. -## Contributors +# Contributors
@@ -69,7 +458,7 @@ The library was originally created by the machine intelligence group at [jambit The main contributors are Dominik Jain, Michael Panchenko, and Kristof Schröder. -### How to contribute +## How to contribute External contributions are welcome! Please issue a pull request. diff --git a/resources/readme_code_snippets.py b/resources/readme_code_snippets.py index fdbe6614..c1b2ee73 100644 --- a/resources/readme_code_snippets.py +++ b/resources/readme_code_snippets.py @@ -114,6 +114,13 @@ class FeatureName(Enum): FeatureGeneratorTemperature()) feature_df = fg.generate(df) + # feature collector example + + feature_collector = registry.collect_features( + FeatureName.TEMPERATURE, + FeatureName.WEEKDAY) + features_df = feature_collector.get_multi_feature_generator().generate(df) + # DFT example From 363c629742c500aad993e406c0380c7eb7adcc62 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Thu, 30 Nov 2023 19:11:20 +0100 Subject: [PATCH 4/9] Allow the same FeatureCollector to be soundly used for multiple models --- README.md | 6 +++--- resources/readme_code_snippets.py | 6 +++--- .../featuregen/feature_generator_registry.py | 19 +++++++++++++++++-- src/sensai/vector_model.py | 18 +++++++++++++----- 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f00e0697..f7a49b12 100644 --- a/README.md +++ b/README.md @@ -228,11 +228,11 @@ feature_coll = registry.collect_features(*list(FeatureName)) model_xgb = XGBRandomForestVectorRegressionModel() \ .with_name("XGBoost") \ - .with_feature_collector(feature_coll) \ + .with_feature_collector(feature_coll, shared=True) \ model_linear = SkLearnLinearRegressionVectorRegressionModel() \ .with_name("Linear") \ - .with_feature_collector(feature_coll) \ + .with_feature_collector(feature_coll, shared=True) \ .with_feature_transformers( feature_coll.create_dft_one_hot_encoder()) @@ -240,7 +240,7 @@ model_rffn = ResidualFeedForwardNetworkVectorRegressionModel( hidden_dims=[10]*5, 
cuda=False) \ .with_name("RFFN") \ - .with_feature_collector(feature_coll) \ + .with_feature_collector(feature_coll, shared=True) \ .with_feature_transformers( feature_coll.create_dft_one_hot_encoder(), feature_coll.create_dft_normalisation()) \ diff --git a/resources/readme_code_snippets.py b/resources/readme_code_snippets.py index c1b2ee73..8cf40398 100644 --- a/resources/readme_code_snippets.py +++ b/resources/readme_code_snippets.py @@ -136,19 +136,19 @@ class FeatureName(Enum): model_xgb = XGBRandomForestVectorRegressionModel() \ .with_name("XGBoost") \ - .with_feature_collector(feature_coll) \ + .with_feature_collector(feature_coll, shared=True) \ .with_feature_transformers( feature_coll.create_dft_one_hot_encoder()) model_linear = SkLearnLinearRegressionVectorRegressionModel() \ .with_name("Linear") \ - .with_feature_collector(feature_coll) \ + .with_feature_collector(feature_coll, shared=True) \ .with_feature_transformers( feature_coll.create_dft_one_hot_encoder()) model_rffn = ResidualFeedForwardNetworkVectorRegressionModel( hidden_dims=[10]*5, cuda=False) \ .with_name("RFFN") \ - .with_feature_collector(feature_coll) \ + .with_feature_collector(feature_coll, shared=True) \ .with_feature_transformers( feature_coll.create_dft_one_hot_encoder(), feature_coll.create_dft_normalisation()) \ diff --git a/src/sensai/featuregen/feature_generator_registry.py b/src/sensai/featuregen/feature_generator_registry.py index ea57bde3..bf274e8b 100644 --- a/src/sensai/featuregen/feature_generator_registry.py +++ b/src/sensai/featuregen/feature_generator_registry.py @@ -94,9 +94,16 @@ def __init__(self, """ self._feature_generators_or_names = feature_generators_or_names self._registry = registry - self._multi_feature_generator = self._create_multi_feature_generator() + self._multi_feature_generator = self.create_multi_feature_generator() def get_multi_feature_generator(self) -> MultiFeatureGenerator: + """ + Gets the multi-feature generator that was created for this 
collector. + To create a new, independent instance (e.g. when using this collector for multiple + models), use :meth:`create_multi_feature_generator` instead. + + :return: the multi-feature generator that was created for this instance + """ return self._multi_feature_generator def get_normalisation_rules(self, include_generated_categorical_rules=True): @@ -109,7 +116,15 @@ def get_categorical_feature_name_regex(self) -> str: """ return self.get_multi_feature_generator().get_categorical_feature_name_regex() - def _create_multi_feature_generator(self): + def create_multi_feature_generator(self): + """ + Creates a new instance of the multi-feature generator that generates the features + collected by this instance. If the feature collector instance is not used for + multiple models, use :meth:`get_multi_feature_generator` instead to obtain + the instance that has already been created. + + :return: a new multi-feature generator that generates the collected features + """ feature_generators = [] for f in self._feature_generators_or_names: if isinstance(f, FeatureGenerator): diff --git a/src/sensai/vector_model.py b/src/sensai/vector_model.py index b8801bee..ba939384 100644 --- a/src/sensai/vector_model.py +++ b/src/sensai/vector_model.py @@ -210,18 +210,26 @@ def with_feature_generator(self: TVectorModel, feature_generator: Optional[Featu self._featureGenerator = feature_generator return self - def with_feature_collector(self: TVectorModel, feature_collector: FeatureCollector) -> TVectorModel: + def with_feature_collector(self: TVectorModel, feature_collector: FeatureCollector, + shared: bool = False) -> TVectorModel: """ - Makes the model use the given feature collector's multi-feature generator + Makes the model use a multi-feature generator obtained from the given collector in order compute the underlying model's input from the data frame that is given. Overrides any feature generator previously passed to :meth:`withFeatureGenerator` (if any). 
- Note: Feature computation takes place before input transformation. + Note: Feature generation takes place before feature transformation. - :param feature_collector: the feature collector whose feature generator shall be used for input computation + :param feature_collector: the feature collector from which to obtain the multi-feature generator + :param shared: whether the given feature collector is shared between models (i.e. whether + the same instance is passed to multiple models). + Passing `shared=True` ensures that models using the same collector do not end up + using the same multi-feature generator. :return: self """ - self._featureGenerator = feature_collector.get_multi_feature_generator() + if shared: + self._featureGenerator = feature_collector.create_multi_feature_generator() + else: + self._featureGenerator = feature_collector.get_multi_feature_generator() return self def _pre_processors_are_fitted(self): From 5c28ee6b87b66494afe887dbb56d6d77cdb17717 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Thu, 30 Nov 2023 19:24:05 +0100 Subject: [PATCH 5/9] Further README improvements --- README.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index f7a49b12..63a6c65f 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,7 @@ sensAI is a high-level AI toolkit with a specific focus on **rapid experimentation** for machine learning applications. Its basic interfaces are similar to sklearn's, yet we modularise data pipelines without sacrificing semantics (retaining meta-information at every step) -and provide high-level interfaces for many canonical tasks, achieving largely -declarative semantics. +and provide high-level interfaces that severely cut down on boilerplate code.
Some of sensAI's key benefits are: @@ -53,9 +52,9 @@ Some of sensAI's key benefits are: * **Fully integrated solutions for canonical tasks** - Do away with boilerplate by using high-level interfaces for model + Do away with boilerplate code by using high-level interfaces for model evaluation, model selection or feature selection. - Log and track all relevant parameters and results along the way, + Log and track all relevant parameters as well as results along the way, using file-based logging or tracking frameworks such as MLflow. * **Declarative semantics** @@ -326,13 +325,12 @@ ev = RegressionModelEvaluation(io_data, ev.compare_models([model_xgb, model_linear, model_rffn]) ``` -:white_check_mark: **Do away with boilerplate** - They can be flexibly adapted to your needs. You can inject evaluation metrics, mechanisms for the splitting of data, apply cross-validation, create plots that visualize model performance, compare model performance using multiple datasets, and much more. +:white_check_mark: **Do away with boilerplate** :white_check_mark: **Retain flexibility** ### Track Results @@ -403,16 +401,16 @@ be overlooked. sensAI supports combinatorial optimisation via - * **stochastic local search**, via implementations of + * **stochastic local search**, providing implementations of * simulated annealing - * parallel tempering + * parallel tempering. Both algorithms support adaptive (i.e. data-driven), probability-based temperature schedules, greatly facilitating parametrisation in practice. * **constraint programming**, by providing utilities for formulating - and solving optimisation problems in MiniZinc + and solving optimisation problems in MiniZinc.
### Utilities, Utilities, Utilities From 2e68b8561eeb96d9cf3277ed6cd4a3ca7f84a07e Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Mon, 4 Dec 2023 15:13:47 +0100 Subject: [PATCH 6/9] Improve introductory paragraph --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 63a6c65f..d99b1bac 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,11 @@ sensAI is a high-level AI toolkit with a specific focus on **rapid experimentation** for machine learning applications. -Its basic interfaces are similar to sklearn's, yet we modularise data pipelines -without sacrificing semantics (retaining meta-information at every step) -and provide high-level interfaces that severely cut down on boilerplate code. +Through a high level of abstraction and integration, +sensAI minimises overhead, achieving largely declarative semantics, + while retaining a high degree of **flexibility**. +It thus addresses the needs of developers who seek to work efficiently on +custom-tailored AI and machine learning solutions. Some of sensAI's key benefits are: From 6502ca5f19aa41d3f2eb7f85ff4e1fc014a66934 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Mon, 4 Dec 2023 20:53:44 +0100 Subject: [PATCH 7/9] Improve intro section, add TOC, add Beyond Jupyter link, minor improvements --- README.md | 105 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index d99b1bac..b686ab06 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,15 @@ sensAI is a high-level AI toolkit with a specific focus on **rapid experimentation** for machine learning applications. -Through a high level of abstraction and integration, -sensAI minimises overhead, achieving largely declarative semantics, - while retaining a high degree of **flexibility**. -It thus addresses the needs of developers who seek to work efficiently on -custom-tailored AI and machine learning solutions. 
+Through high levels of abstraction and integration, +sensAI minimises overhead whilst retaining a high degree of **flexibility** +for the implementation of custom solutions. + +If you would normally use a library like scikit-learn on its own, +consider adding sensAI in order to + * gain flexibility, straightforwardly supporting a greater variety of models, + * increase the level of abstraction, cutting down on boilerplate, + * improve logging and tracking with minimal effort. Some of sensAI's key benefits are: @@ -51,7 +55,11 @@ Some of sensAI's key benefits are: the properties associated with features, others can be manually designed to exploit a model's specific capabilities (e.g. a tensor-based representation of complex, non-tabular data for neural networks). - + + Strongly associate pipelines with models in order to avoid errors and + gain the flexibility of supporting highly heterogeneous models within + a single framework, bridging the gap to production along the way. + * **Fully integrated solutions for canonical tasks** Do away with boilerplate code by using high-level interfaces for model @@ -62,13 +70,43 @@ Some of sensAI's key benefits are: * **Declarative semantics** Through its high level of abstraction, sensAI achieves largely - declarative semantics: Focus on what to do rather than on how to do it. - sensAI embraces object-oriented design in order to achieve this. + declarative semantics: Focus on what to do rather than how to do it. + + Eschew the notion of external configuration for a single task, making + your high-level code read like configuration instead. + Gain the flexibility of specifying variations of your models and experiments + with minimal code changes/extensions. While sensAI's main focus is on supervised and unsupervised machine learning, it also provides functionality for discrete optimisation and a wide range of general-purpose utilities that are frequently required in AI applications. - + +
+ + +**Table of Contents** + + + + * [Supervised Learning](#supervised-learning) + + [Feature Generators](#feature-generators) + + [Feature Generator Registry](#feature-generator-registry) + + [(Model-Specific) Data Transformation](#model-specific-data-transformation) + + [Vector Models](#vector-models) + + [Evaluation](#evaluation) + + [Tracking of Results](#tracking-of-results) + + [Feature and Model Selection](#feature-and-model-selection) + + [Peace of Mind](#peace-of-mind) + * [Beyond Supervised Learning](#beyond-supervised-learning) + + [Unsupervised Learning](#unsupervised-learning) + + [Combinatorial Optimisation](#combinatorial-optimisation) + + [Utilities, Utilities, Utilities](#utilities-utilities-utilities) +- [Documentation](#documentation) + * [Integrating sensAI into a Project](#integrating-sensai-into-a-project) +- [Contributors](#contributors) + + + ## Supervised Learning Many real-world tasks can be reduced to classification and regression problems, @@ -335,7 +373,7 @@ compare model performance using multiple datasets, and much more. :white_check_mark: **Do away with boilerplate** :white_check_mark: **Retain flexibility** -### Track Results +### Tracking of Results sensAI supports two mechanisms for the tracking of results: * Writing results directly to the file system @@ -369,12 +407,12 @@ ev.compare_models([model_xgb, model_linear, model_rffn], ### Feature and Model Selection -sensAI provides convenient abstractions for hyperparameter optimisation, -feature selection and model selection. +sensAI provides convenient abstractions for feature selection, model selection +and hyperparameter optimisation. -Through its modular design, sensAI's representations can also be efficiently -combined with other libraries that are specialised for such purposes, -e.g. hyperopt and optuna. +Through its modular design, sensAI's representations can be straightforwardly +combined with other libraries that are specialised for such purposes +(e.g. hyperopt or optuna). 
### Peace of Mind @@ -422,23 +460,44 @@ sensAI's `util` package contains a wide range of general utilities, including * data structures (e.g. for tree-map-style lookups) * logging and profiling utilities * I/O utilities - * multi-processing tools, e.g. a debugger for pickle errors + * multi-processing tools (e.g. a debugger for pickle errors) * etc. # Documentation -Reference documentation and tutorials can be found [here](https://aai-institute.github.io/sensAI/docs/). + * [Reference documentation and tutorials](https://aai-institute.github.io/sensAI/docs/) + + At this point, the documentation is still limited, but we plan to add + further tutorials and overview documentation in the future. + + For all the things we do not yet cover extensively, we encourage you to use + your IDE to browse class hierarchies and discover functionality by using + auto-completion. + + If you have a usage question, don't hesitate to add an issue on GitHub. + + * [Beyond Jupyter: A Refactoriung Journey](https://github.com/aai-institute/beyond-jupyter-spotify-popularity) + + Explore this lecture series on software design in machine learning, in + which sensAI is prominently featured. + Our *Refactoring Journey* shows how a use case that is + initially implemented as a Jupyter notebook can be successively refactored + in order to improve the software design, gain flexibility for experimentation, + and ultimately arrive at a solution that could directly be deployed for + production. + + ## Integrating sensAI into a Project -sensAI may be integrated into your project in several ways: +sensAI can be integrated into your project in several ways: 1. **Install it as a library** with `pip install sensai`. Choose this option as a regular user of sensAI with no intention of extending the library as part of your work. 2. **Include sensAI's source code as a package within your project** (e.g. in `src/sensai`), which you synchronise with a sensAI branch. 
- Choose this option if you intend to make changes to sensAI as you develop your project. When using this option, you (and others) may even make changes to sensAI in several branches of your project and even several projects using the same inclusion mechanism at the same time. - See developer documentation in README-dev.md for details on how synchronisation works. + Choose this option if you intend to make changes to sensAI as you develop your project. When using this option, you (and others) may even make changes to sensAI in several branches of your project (and even several projects) at the same time. + See developer documentation in [README-dev.md](README-dev.md) for details on how synchronisation works. # Contributors @@ -458,8 +517,4 @@ The library was originally created by the machine intelligence group at [jambit The main contributors are Dominik Jain, Michael Panchenko, and Kristof Schröder. -## How to contribute - -External contributions are welcome! Please issue a pull request. - -If you decide to contribute, please strive for consistency with the existing codebase. +External contributions are welcome. 
\ No newline at end of file From 19b3ea6b66a9d76aa610193e4529893cc0f83f02 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Tue, 5 Dec 2023 23:54:32 +0100 Subject: [PATCH 8/9] Add svg logo --- resources/.gitignore | 1 + resources/sensai-logo.svg | 1 + 2 files changed, 2 insertions(+) create mode 100644 resources/.gitignore create mode 100644 resources/sensai-logo.svg diff --git a/resources/.gitignore b/resources/.gitignore new file mode 100644 index 00000000..d8e5f55d --- /dev/null +++ b/resources/.gitignore @@ -0,0 +1 @@ +/sensai-logo.afdesign diff --git a/resources/sensai-logo.svg b/resources/sensai-logo.svg new file mode 100644 index 00000000..69bf3b1a --- /dev/null +++ b/resources/sensai-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file From 67732715a3d0c1c45f5224342348ccc2f8332e37 Mon Sep 17 00:00:00 2001 From: Dominik Jain Date: Tue, 5 Dec 2023 23:54:45 +0100 Subject: [PATCH 9/9] Use svg logo, minor improvements --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b686ab06..8e7e84bc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

-
+
the Python library for sensible AI

@@ -88,6 +88,7 @@ of general-purpose utilities that are frequently required in AI applications. +- [About sensAI](#about-sensai) * [Supervised Learning](#supervised-learning) + [Feature Generators](#feature-generators) + [Feature Generator Registry](#feature-generator-registry) @@ -476,7 +477,7 @@ sensAI's `util` package contains a wide range of general utilities, including If you have a usage question, don't hesitate to add an issue on GitHub. - * [Beyond Jupyter: A Refactoriung Journey](https://github.com/aai-institute/beyond-jupyter-spotify-popularity) + * [Beyond Jupyter: A Refactoring Journey](https://github.com/aai-institute/beyond-jupyter-spotify-popularity) Explore this lecture series on software design in machine learning, in which sensAI is prominently featured.