ENH Splitter Injection and Refactoring of DepthFirstTreeBuilder's building mechanism #67

Status: Open. Wants to merge 38 commits into base branch submodulev3.

Commits (all by SamuelCarliles3):
- 8c09f7f: init split condition injection (Feb 16, 2024)
- ecfc9b1: wip (Feb 16, 2024)
- 0c3d5c0: wip (Feb 16, 2024)
- 5fd12a2: wip (Feb 20, 2024)
- b593ee0: injection progress (Feb 27, 2024)
- 180fac3: injection progress (Feb 27, 2024)
- c207c3e: split injection refactoring (Feb 27, 2024)
- 7cc71c1: added condition parameter passthrough prototype (Feb 29, 2024)
- 2470d49: some tidying (Feb 29, 2024)
- ee3399f: more tidying (Feb 29, 2024)
- a079e4f: splitter injection refactoring (Mar 10, 2024)
- 5397b66: cython injection due diligence, converted min_sample and monotonic_cs… (Mar 15, 2024)
- 44f1d57: tree tests pass huzzah! (Mar 18, 2024)
- 4f19d53: added some splitconditions to header (Mar 18, 2024)
- cb71be0: commented out some sample code that was substantially increasing peak… (Mar 21, 2024)
- e34be5c: added vector resize (Apr 9, 2024)
- aac802e: wip (Apr 10, 2024)
- c12f2fd: Merge branch 'submodulev3' into scarliles/splitter-injection-redux (Apr 15, 2024)
- a7f5e92: settling injection memory management for now (Apr 15, 2024)
- 7a70a0b: added regression forest benchmark (Apr 22, 2024)
- d9ad68a: Merge pull request #2 from ssec-jhu/scarliles/regression-benchmark (Apr 22, 2024)
- 893d588: ran black for linting check (Apr 23, 2024)
- 548493c: Merge branch 'submodulev3' of github.com:ssec-jhu/scikit-learn into s… (Apr 23, 2024)
- e4b53ff: Merge branch 'submodulev3' into scarliles/regression-benchmark (Apr 23, 2024)
- 089d901: Merge branch 'neurodata:submodulev3' into submodulev3 (Apr 24, 2024)
- 3ba5f74: Merge branch 'submodulev3' of github.com:ssec-jhu/scikit-learn into s… (Apr 24, 2024)
- cf285c1: Merge branch 'scarliles/splitter-injection-redux' into scarliles/regr… (Apr 24, 2024)
- ffc6328: Merge pull request #3 from ssec-jhu/scarliles/regression-benchmark (Apr 24, 2024)
- 87c90fd: initial pass at refactoring DepthFirstTreeBuilder.build (May 23, 2024)
- 51da586: some renaming to make closure pattern more obvious (May 28, 2024)
- 6c117a2: added SplitRecordFactory (May 28, 2024)
- c7b675b: Merge branch 'scarliles/update-node-refactor2' into scarliles/update-… (May 28, 2024)
- 9e7b131: SplitRecordFactory progress (May 28, 2024)
- a017669: build loop refactor (May 29, 2024)
- 4325b0a: add_or_update tweak (May 29, 2024)
- 78c3a1b: reverted to back out build body refactor (May 30, 2024)
- b8cc636: refactor baby step (May 30, 2024)
- f225658: update node refactor more baby steps (May 30, 2024)
45 changes: 44 additions & 1 deletion asv_benchmarks/benchmarks/ensemble.py
@@ -2,15 +2,58 @@
GradientBoostingClassifier,
HistGradientBoostingClassifier,
RandomForestClassifier,
RandomForestRegressor,
)

from .common import Benchmark, Estimator, Predictor
from .datasets import (
_20newsgroups_highdim_dataset,
_20newsgroups_lowdim_dataset,
_synth_classification_dataset,
_synth_regression_dataset,
_synth_regression_sparse_dataset,
)
from .utils import make_gen_classif_scorers
from .utils import make_gen_classif_scorers, make_gen_reg_scorers


class RandomForestRegressorBenchmark(Predictor, Estimator, Benchmark):
"""
Benchmarks for RandomForestRegressor.
"""

param_names = ["representation", "n_jobs"]
params = (["dense", "sparse"], Benchmark.n_jobs_vals)

def setup_cache(self):
super().setup_cache()

def make_data(self, params):
representation, n_jobs = params

if representation == "sparse":
data = _synth_regression_sparse_dataset()
else:
data = _synth_regression_dataset()

return data

def make_estimator(self, params):
representation, n_jobs = params

n_estimators = 500 if Benchmark.data_size == "large" else 100

estimator = RandomForestRegressor(
n_estimators=n_estimators,
min_samples_split=10,
max_features="log2",
n_jobs=n_jobs,
random_state=0,
)

return estimator

def make_scorers(self):
make_gen_reg_scorers(self)


class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
58 changes: 58 additions & 0 deletions sklearn/tree/_splitter.pxd
@@ -6,6 +6,7 @@
# Jacob Schreiber <[email protected]>
# Adam Li <[email protected]>
# Jong Shin <[email protected]>
# Samuel Carliles <[email protected]>
#
# License: BSD 3 clause

@@ -14,9 +15,49 @@ from libcpp.vector cimport vector

from ._criterion cimport BaseCriterion, Criterion
from ._tree cimport ParentInfo

from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint32_t


# NICE IDEAS THAT DON'T APPEAR POSSIBLE
# - accessing elements of a memory view of cython extension types in a nogil block/function
# - storing cython extension types in cpp vectors
Review comment on lines +22 to +24 (Collaborator), suggested change:

# NICE IDEAS THAT DON'T APPEAR POSSIBLE (Samuel)
# 1. accessing elements of a memory view of cython extension types in a nogil block/function
# 2. storing cython extension types in cpp vectors
Collaborator: It would be great to also comment on what these nice ideas are trying to accomplish. I.e., what's the problem for a new developer coming in and reading this?

Author: Here we're simply trying to add a way of injecting functionality whose implementation details are TBD. We just want a way of saying "here's a candidate split, let me check it against any arbitrary validity constraints you may want to impose at some future date as of the time of this writing". So we want to accept a list, a memoryview, array, vector, whatever, of instantiated split constraints. Ideally the interface is a simple python one-liner, so at runtime I can just define an inline python list of constraints. But that list of constraints then needs to be executable performantly in a cython nogil block.

Collaborator: I understand. Just hoping to document all these thoughts in a clean manner, so we don't lose this train of thought when new developers come through.
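The injection idea the author describes can be sketched in pure Python. This is an illustrative analogue only: the names `SplitCondition`, `check`, and `split_is_valid`, and the use of a dict for the candidate split, are assumptions for the sketch, not the PR's actual Cython API.

```python
# Illustrative sketch of split-condition injection: a splitter accepts a
# list of instantiated constraint objects and checks every candidate split
# against all of them. Names here are hypothetical, not the PR's API.

class SplitCondition:
    """A pluggable validity check applied to a candidate split."""
    def check(self, split):
        raise NotImplementedError

class MinSamplesLeafCondition(SplitCondition):
    def __init__(self, min_samples_leaf):
        self.min_samples_leaf = min_samples_leaf

    def check(self, split):
        # Reject splits that would leave too few samples in either child.
        return (split["n_left"] >= self.min_samples_leaf
                and split["n_right"] >= self.min_samples_leaf)

def split_is_valid(split, conditions):
    # A candidate split is kept only if every injected condition accepts it.
    return all(c.check(split) for c in conditions)

# The "simple python one-liner" interface: an inline list of constraints.
conditions = [MinSamplesLeafCondition(min_samples_leaf=5)]
print(split_is_valid({"n_left": 10, "n_right": 3}, conditions))  # False
print(split_is_valid({"n_left": 10, "n_right": 7}, conditions))  # True
```

The hard part the PR wrestles with is that this pattern is trivial in Python but, to run in a Cython nogil block, the condition objects must be lowered to C function pointers plus environment pointers.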

#
# despite the fact that we can access scalar extension type properties in such a context,
# as for instance node_split_best does with Criterion and Partition,
# and we can access the elements of a memory view of primitive types in such a context
Review comment on lines +26 to +28 (Collaborator): I can't follow what you mean here. Is this related to the "nice ideas" listed above?

#
# SO WHERE DOES THAT LEAVE US
# - we can transform these into cpp vectors of structs
# and with some minor casting irritations everything else works ok
ctypedef void* SplitConditionEnv
ctypedef bint (*SplitConditionFunction)(
Collaborator: Out of curiosity, why does this function have to be ctypedef?

Author: I've added a mechanism for accepting, for injection, functions with a specific signature, and creating a macro for this specific type of function pointer improves readability where the type gets used.

Collaborator: I'm dumb so I don't understand fully I think. For testing purposes, perhaps worth adding a unit test to demonstrate what this means? :p

Author: The easiest way to see it is in the definition of the closure type:

https://github.com/ssec-jhu/scikit-learn/blob/f2256580d2482e607f40a938f3569f20cec95e95/sklearn/tree/_splitter.pxd#L44

With the ctypedef the closure definition is clear: there's a function pointer and a pointer to a struct whose specific definition is TBD. Without the ctypedef, the closure struct definition would be very confusing.

Splitter splitter,
SplitRecord* current_split,
intp_t n_missing,
bint missing_go_to_left,
float64_t lower_bound,
float64_t upper_bound,
Review comment on lines +37 to +40 (Collaborator): These parameters are within the SplitRecord.

SplitConditionEnv split_condition_env
) noexcept nogil

cdef struct SplitConditionClosure:
SplitConditionFunction f
SplitConditionEnv e
Review comment on lines +45 to +46 (Collaborator): What are e and f?


cdef class SplitCondition:
cdef SplitConditionClosure c
Review comment (Collaborator): What is c here?


cdef class MinSamplesLeafCondition(SplitCondition):
pass

cdef class MinWeightLeafCondition(SplitCondition):
pass

cdef class MonotonicConstraintCondition(SplitCondition):
pass
Review comment on lines +44 to +58 (Collaborator): Can we comment on what these things are?



cdef struct SplitRecord:
# Data to track sample split
intp_t feature # Which feature to split on.
@@ -30,6 +71,13 @@ cdef struct SplitRecord:
unsigned char missing_go_to_left # Controls if missing values go to the left node.
intp_t n_missing # Number of missing values for the feature being split on

ctypedef void* SplitRecordFactoryEnv
ctypedef SplitRecord* (*SplitRecordFactory)(SplitRecordFactoryEnv env) except NULL nogil

cdef struct SplitRecordFactoryClosure:
SplitRecordFactory f
SplitRecordFactoryEnv e
Review comment on lines +74 to +79 (Collaborator): Can we also comment here on what these things are meant to do?


cdef class BaseSplitter:
"""Abstract interface for splitter."""

@@ -59,6 +107,8 @@

cdef const float64_t[:] sample_weight

cdef SplitRecordFactoryClosure split_record_factory
Collaborator: Why is it named Closure?

Author: Because it is a cython implementation of a closure. C doesn't support closures as a language-level feature, but a struct of a function pointer bound with a struct of variable values functions the same way.
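The closure-as-struct pattern the author describes can be mimicked in Python for intuition. This is a sketch under stated assumptions: the `Closure` tuple and `min_samples_leaf_check` names are illustrative, standing in for the PR's `SplitConditionClosure` (function pointer `f` plus environment pointer `e`).

```python
from collections import namedtuple

# A C-style closure: a function `f` bundled with an environment `e`,
# mirroring SplitConditionClosure { SplitConditionFunction f; SplitConditionEnv e; }.
# In C the environment is an opaque void*; here a dict plays that role.
Closure = namedtuple("Closure", ["f", "e"])

def min_samples_leaf_check(env, n_left, n_right):
    # The function receives its bound environment explicitly,
    # the way a C function pointer receives the void* env argument.
    return n_left >= env["min_samples_leaf"] and n_right >= env["min_samples_leaf"]

cond = Closure(f=min_samples_leaf_check, e={"min_samples_leaf": 5})

# Invoking the closure: call f with its bound environment plus call-site args.
print(cond.f(cond.e, 10, 7))  # True
print(cond.f(cond.e, 10, 3))  # False
```

Since C has no language-level closures, pairing the function pointer with its environment in one struct is what lets heterogeneous conditions be stored in a single `vector[SplitConditionClosure]` and invoked uniformly inside a nogil loop.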


# The samples vector `samples` is maintained by the Splitter object such
# that the samples contained in a node are contiguous. With this setting,
# `node_split` reorganizes the node samples `samples[start:end]` in two
@@ -90,6 +140,7 @@
cdef void node_value(self, float64_t* dest) noexcept nogil
cdef float64_t node_impurity(self) noexcept nogil
cdef intp_t pointer_size(self) noexcept nogil
cdef SplitRecord* create_split_record(self) except NULL nogil

cdef class Splitter(BaseSplitter):
"""Base class for supervised splitters."""
@@ -105,6 +156,13 @@ cdef class Splitter(BaseSplitter):
cdef const int8_t[:] monotonic_cst
cdef bint with_monotonic_cst

cdef SplitCondition min_samples_leaf_condition
cdef SplitCondition min_weight_leaf_condition
cdef SplitCondition monotonic_constraint_condition

cdef vector[SplitConditionClosure] presplit_conditions
cdef vector[SplitConditionClosure] postsplit_conditions

cdef int init(
self,
object X,