From 8c09f7fad193bdb853325ea618b63d2c80b144e0 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Fri, 16 Feb 2024 13:36:02 -0500
Subject: [PATCH 01/29] init split condition injection

---
 sklearn/tree/_splitter.pxd | 5 +++++
 sklearn/tree/_splitter.pyx | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index f1434f5d05cc9..3169a9198d3f1 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -19,6 +19,8 @@ from ._utils cimport UINT32_t

 from ._criterion cimport BaseCriterion, Criterion

+ctypedef bint (*SplitCondition)(Splitter*)
+
 cdef struct SplitRecord:
     # Data to track sample split
     intp_t feature              # Which feature to split on.
@@ -112,6 +114,9 @@ cdef class Splitter(BaseSplitter):
     cdef const cnp.int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst

+    cdef SplitCondition[:] pre_split_conditions
+    cdef SplitCondition[:] post_split_conditions
+
     cdef int init(
         self,
         object X,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 1f781e55350d2..2352862e67f48 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -155,6 +155,8 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const cnp.int8_t[:] monotonic_cst,
+        SplitCondition[:] pre_split_conditions=[],
+        SplitCondition[:] post_split_conditions=[],
         *argv
     ):
         """
@@ -195,6 +197,9 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

+        self.pre_split_conditions = pre_split_conditions
+        self.post_split_conditions = post_split_conditions
+
     def __reduce__(self):
         return (type(self), (self.criterion,
                              self.max_features,

From ecfc9b1d1e6f89c476dc2231d9cda3a484c456e9 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Fri, 16 Feb 2024 14:50:27 -0500
Subject: [PATCH 02/29] wip

---
 sklearn/tree/_splitter.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 3169a9198d3f1..04929e679b024 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -19,7 +19,7 @@ from ._utils cimport UINT32_t

 from ._criterion cimport BaseCriterion, Criterion

-ctypedef bint (*SplitCondition)(Splitter*)
+ctypedef bint (*SplitCondition)(Splitter splitter)

 cdef struct SplitRecord:
     # Data to track sample split

From 0c3d5c0f2a1ac6c8ec8ab9a7fa8bb1af8e721797 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Fri, 16 Feb 2024 15:11:51 -0500
Subject: [PATCH 03/29] wip

---
 sklearn/tree/_splitter.pxd | 4 ++--
 sklearn/tree/_splitter.pyx | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 04929e679b024..b8f8d9cfb19f4 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -114,8 +114,8 @@ cdef class Splitter(BaseSplitter):
     cdef const cnp.int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst

-    cdef SplitCondition[:] pre_split_conditions
-    cdef SplitCondition[:] post_split_conditions
+    cdef SplitCondition[] pre_split_conditions
+    cdef SplitCondition[] post_split_conditions

     cdef int init(
         self,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 2352862e67f48..beb0ebae3136d 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -155,8 +155,8 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const cnp.int8_t[:] monotonic_cst,
-        SplitCondition[:] pre_split_conditions=[],
-        SplitCondition[:] post_split_conditions=[],
+        SplitCondition[] pre_split_conditions=[],
+        SplitCondition[] post_split_conditions=[],
         *argv
     ):
         """
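The `ctypedef` introduced in patch 01 is the core of the design: a `SplitCondition` is a plain C function pointer, so it can be stored and invoked without the GIL, unlike a Python callable. A minimal self-contained sketch of the pattern, independent of the sklearn code (all names here are hypothetical):

    # condition_sketch.pyx -- illustrative only; compile with cythonize
    ctypedef bint (*Condition)(int n_samples) noexcept nogil

    cdef bint has_samples(int n_samples) noexcept nogil:
        return n_samples > 0

    cdef bint not_too_many(int n_samples) noexcept nogil:
        return n_samples < 10

    cdef bint all_hold(Condition* conditions, int n_conditions, int n_samples) noexcept nogil:
        # Function pointers can be called here; Python callables could not be.
        cdef int i
        for i in range(n_conditions):
            if not conditions[i](n_samples):
                return False
        return True

    def check(int n_samples):
        cdef Condition conditions[2]
        conditions[0] = has_samples
        conditions[1] = not_too_many
        return all_hold(conditions, 2, n_samples)

One wrinkle worth flagging: `SplitCondition[:]` declares a typed memoryview, and a function pointer is not a supported memoryview item type in Cython, which is presumably why patches 02 through 07 keep reworking the storage.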
From 5fd12a2c42db768aaffbd73801fe5e0a2b477089 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Tue, 20 Feb 2024 11:52:26 -0500
Subject: [PATCH 04/29] wip

---
 sklearn/tree/_splitter.pxd | 3 ---
 sklearn/tree/_splitter.pyx | 5 -----
 2 files changed, 8 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index b8f8d9cfb19f4..2e277e0b1d13f 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -114,9 +114,6 @@ cdef class Splitter(BaseSplitter):
     cdef const cnp.int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst

-    cdef SplitCondition[] pre_split_conditions
-    cdef SplitCondition[] post_split_conditions
-
     cdef int init(
         self,
         object X,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index beb0ebae3136d..1f781e55350d2 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -155,8 +155,6 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const cnp.int8_t[:] monotonic_cst,
-        SplitCondition[] pre_split_conditions=[],
-        SplitCondition[] post_split_conditions=[],
         *argv
     ):
         """
@@ -197,9 +195,6 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

-        self.pre_split_conditions = pre_split_conditions
-        self.post_split_conditions = post_split_conditions
-
     def __reduce__(self):
         return (type(self), (self.criterion,
                              self.max_features,

From b593ee024ad932a93bbc8fb2797a54a981c35604 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Mon, 26 Feb 2024 19:09:10 -0500
Subject: [PATCH 05/29] injection progress

---
 sklearn/tree/_splitter.pxd |  9 ++++++++-
 sklearn/tree/_splitter.pyx | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 2e277e0b1d13f..3cd2d1dd3898a 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -19,7 +19,11 @@ from ._utils cimport UINT32_t

 from ._criterion cimport BaseCriterion, Criterion

-ctypedef bint (*SplitCondition)(Splitter splitter)
+ctypedef bint (*SplitCondition)(Splitter splitter) noexcept nogil
+
+cdef class SplitConditions:
+    cdef vector[SplitCondition] value
+

 cdef struct SplitRecord:
     # Data to track sample split
@@ -114,6 +118,9 @@ cdef class Splitter(BaseSplitter):
     cdef const cnp.int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst

+    cdef public SplitConditions presplit_conditions
+    cdef public SplitConditions postsplit_conditions
+
     cdef int init(
         self,
         object X,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 1f781e55350d2..260d571f71392 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -43,6 +43,23 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7
 # in SparsePartitioner
 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

+cdef bint condition1(Splitter splitter) noexcept nogil:
+    cdef bint bar = splitter.n_samples > 0
+
+    return 1
+
+cdef class SplitConditions:
+    def __init__(self, n):
+        self.value.resize(n)
+
+def foo():
+    presplit_conditions = SplitConditions(2)
+    presplit_conditions.value[0] = condition1
+    presplit_conditions.value[1] = condition1
+
+    postsplit_conditions = SplitConditions(1)
+    postsplit_conditions = condition1
+
 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:
     self.impurity_left = INFINITY
     self.impurity_right = INFINITY
@@ -155,6 +172,8 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const cnp.int8_t[:] monotonic_cst,
+        SplitConditions presplit_conditions=None,
+        SplitConditions postsplit_conditions=None,
         *argv
     ):
         """
@@ -195,6 +214,9 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

+        self.presplit_conditions = presplit_conditions
+        self.postsplit_conditions = postsplit_conditions
+
     def __reduce__(self):
         return (type(self), (self.criterion,
                              self.max_features,
@@ -602,6 +624,11 @@ cdef inline intp_t node_split_best(
                 n_right = end_non_missing - current_split.pos + n_missing
             if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
                 continue
+
+            if splitter.presplit_conditions is not None:
+                for condition in splitter.presplit_conditions.value:
+                    if condition(splitter):
+                        continue

             criterion.update(current_split.pos)

@@ -620,6 +647,11 @@ cdef inline intp_t node_split_best(
             # Reject if min_weight_leaf is not satisfied
             if splitter.check_postsplit_conditions() == 1:
                 continue
+
+            if splitter.postsplit_conditions is not None:
+                for condition in splitter.postsplit_conditions.value:
+                    if condition(splitter):
+                        continue

             current_proxy_improvement = criterion.proxy_impurity_improvement()

From 180fac32308195301e80d574b9b026fc66fece8b Mon Sep 17 00:00:00 2001
From: scarliles
Date: Tue, 27 Feb 2024 13:51:32 -0500
Subject: [PATCH 06/29] injection progress

---
 sklearn/tree/_splitter.pyx | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 260d571f71392..fd65568963a43 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -44,9 +44,7 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7
 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

 cdef bint condition1(Splitter splitter) noexcept nogil:
-    cdef bint bar = splitter.n_samples > 0
-
-    return 1
+    return splitter.n_samples > 0

 cdef class SplitConditions:
     def __init__(self, n):
@@ -58,7 +56,7 @@ def foo():
     presplit_conditions.value[1] = condition1

     postsplit_conditions = SplitConditions(1)
-    postsplit_conditions = condition1
+    postsplit_conditions.value[0] = condition1

From c207c3e220f6bf7bb699660da9a28a96834f01bc Mon Sep 17 00:00:00 2001
From: scarliles
Date: Tue, 27 Feb 2024 14:45:32 -0500
Subject: [PATCH 07/29] split injection refactoring

---
 sklearn/tree/_splitter.pxd |  7 ++-----
 sklearn/tree/_splitter.pyx | 34 ++++++++++++++--------------------
 2 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 3cd2d1dd3898a..37e3554f36dd4 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -21,9 +21,6 @@ from ._criterion cimport BaseCriterion, Criterion

 ctypedef bint (*SplitCondition)(Splitter splitter) noexcept nogil

-cdef class SplitConditions:
-    cdef vector[SplitCondition] value
-

 cdef struct SplitRecord:
     # Data to track sample split
@@ -118,8 +115,8 @@ cdef class Splitter(BaseSplitter):
     cdef const cnp.int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst

-    cdef public SplitConditions presplit_conditions
-    cdef public SplitConditions postsplit_conditions
+    cdef vector[SplitCondition] presplit_conditions
+    cdef vector[SplitCondition] postsplit_conditions

     cdef int init(
         self,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index fd65568963a43..92c7a082283fe 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -46,17 +46,17 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
 cdef bint condition1(Splitter splitter) noexcept nogil:
     return splitter.n_samples > 0

-cdef class SplitConditions:
-    def __init__(self, n):
-        self.value.resize(n)
+cdef bint condition2(Splitter splitter) noexcept nogil:
+    return splitter.n_samples < 10

 def foo():
-    presplit_conditions = SplitConditions(2)
-    presplit_conditions.value[0] = condition1
-    presplit_conditions.value[1] = condition1
+    splitter = Splitter()
+
+    splitter.presplit_conditions.push_back(condition1)
+    splitter.presplit_conditions.push_back(condition2)
+
+    splitter.postsplit_conditions.push_back(condition1)

-    postsplit_conditions = SplitConditions(1)
-    postsplit_conditions.value[0] = condition1

 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:
     self.impurity_left = INFINITY
@@ -170,8 +170,6 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const cnp.int8_t[:] monotonic_cst,
-        SplitConditions presplit_conditions=None,
-        SplitConditions postsplit_conditions=None,
         *argv
     ):
         """
@@ -212,8 +210,6 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

-        self.presplit_conditions = presplit_conditions
-        self.postsplit_conditions = postsplit_conditions

     def __reduce__(self):
         return (type(self), (self.criterion,
@@ -623,10 +619,9 @@ cdef inline intp_t node_split_best(
             if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
                 continue

-            if splitter.presplit_conditions is not None:
-                for condition in splitter.presplit_conditions.value:
-                    if condition(splitter):
-                        continue
+            for condition in splitter.presplit_conditions:
+                if condition(splitter):
+                    continue

             criterion.update(current_split.pos)

@@ -646,10 +641,9 @@ cdef inline intp_t node_split_best(
             if splitter.check_postsplit_conditions() == 1:
                 continue

-            if splitter.postsplit_conditions is not None:
-                for condition in splitter.postsplit_conditions.value:
-                    if condition(splitter):
-                        continue
+            for condition in splitter.postsplit_conditions:
+                if condition(splitter):
+                    continue

             current_proxy_improvement = criterion.proxy_impurity_improvement()
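Patch 07 lands on the storage that survives the rest of the series: a `libcpp.vector` of function pointers held directly on the splitter, which Cython can iterate in performance-critical code. A compilable sketch of that idiom in isolation (hypothetical names; requires a C++ build):

    # distutils: language = c++
    # vector_sketch.pyx -- illustrative only, not part of the patch series
    from libcpp.vector cimport vector

    ctypedef bint (*Condition)(int x) noexcept nogil

    cdef bint positive(int x) noexcept nogil:
        return x > 0

    cdef bint small(int x) noexcept nogil:
        return x < 10

    def passes_all(int x):
        cdef vector[Condition] conditions
        conditions.push_back(positive)
        conditions.push_back(small)

        cdef bint ok = True
        cdef size_t i
        with nogil:                      # the vector is usable without the GIL
            for i in range(conditions.size()):
                if not conditions[i](x):
                    ok = False
                    break
        return ok

Note that in `node_split_best` the injected `continue` only advances the inner `for condition in ...` loop, not the surrounding split-position loop, so a failing condition does not yet reject the candidate split; patch 13 later addresses this with an explicit `conditions_hold` flag.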
From 7cc71c10c49265cf581efb1637b17af142bb7d29 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Thu, 29 Feb 2024 11:04:19 -0800
Subject: [PATCH 08/29] added condition parameter passthrough prototype

---
 sklearn/tree/_splitter.pxd | 25 ++++++++++++++++++++++---
 sklearn/tree/_splitter.pyx | 33 ++++++++++++++++++++-------------
 2 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 37e3554f36dd4..9eec9dd9afad8 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -19,7 +19,26 @@ from ._utils cimport UINT32_t

 from ._criterion cimport BaseCriterion, Criterion

-ctypedef bint (*SplitCondition)(Splitter splitter) noexcept nogil
+ctypedef void *SplitConditionParameters
+ctypedef bint (*SplitCondition)(Splitter splitter, void* split_condition_parameters) noexcept nogil
+
+cdef struct SplitConditionTuple:
+    SplitCondition f
+    SplitConditionParameters p
+
+cdef struct DummyParameters:
+    int dummy
+
+cdef struct Condition1Parameters:
+    int some_number
+
+cdef inline bint condition1(Splitter splitter, void* split_condition_parameters) noexcept nogil:
+    cdef Condition1Parameters* p = <Condition1Parameters*>split_condition_parameters
+
+    return splitter.n_samples > 0 and p.some_number < 1000
+
+cdef inline bint condition2(Splitter splitter, void* split_condition_parameters) noexcept nogil:
+    return splitter.n_samples < 10

 cdef struct SplitRecord:
     # Data to track sample split
@@ -115,8 +134,8 @@ cdef class Splitter(BaseSplitter):
     cdef const cnp.int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst

-    cdef vector[SplitCondition] presplit_conditions
-    cdef vector[SplitCondition] postsplit_conditions
+    cdef vector[SplitConditionTuple] presplit_conditions
+    cdef vector[SplitConditionTuple] postsplit_conditions

     cdef int init(
         self,
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 92c7a082283fe..cc047ac605749 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -19,7 +19,7 @@ from cython cimport final

 from libc.math cimport isnan
-from libc.stdlib cimport qsort
+from libc.stdlib cimport qsort, malloc, free
 from libc.string cimport memcpy

 cimport numpy as cnp
@@ -43,19 +43,26 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7
 # in SparsePartitioner
 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

-cdef bint condition1(Splitter splitter) noexcept nogil:
-    return splitter.n_samples > 0
+from ._tree cimport Tree
+cdef class FooTree(Tree):
+    cdef Condition1Parameters* c1p
+    cdef DummyParameters* dummy_params

-cdef bint condition2(Splitter splitter) noexcept nogil:
-    return splitter.n_samples < 10
+    def __init__(self):
+        splitter = Splitter()
+        self.c1p = <Condition1Parameters*>malloc(sizeof(Condition1Parameters))
+        self.c1p.some_number = 5

-def foo():
-    splitter = Splitter()
+        self.dummy_params = <DummyParameters*>malloc(sizeof(DummyParameters))

-    splitter.presplit_conditions.push_back(condition1)
-    splitter.presplit_conditions.push_back(condition2)
-
-    splitter.postsplit_conditions.push_back(condition1)
+        splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p))
+        splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params))
+
+    def __dealloc__(self):
+        if self.c1p is not NULL:
+            free(self.c1p)
+        if self.dummy_params is not NULL:
+            free(self.dummy_params)

 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:
@@ -620,7 +627,7 @@ cdef inline intp_t node_split_best(
                 continue

             for condition in splitter.presplit_conditions:
-                if condition(splitter):
+                if not condition.f(splitter, condition.p):
                     continue

             criterion.update(current_split.pos)
@@ -642,7 +649,7 @@ cdef inline intp_t node_split_best(
                 continue

             for condition in splitter.postsplit_conditions:
-                if condition(splitter):
+                if not condition.f(splitter, condition.p):
                     continue

             current_proxy_improvement = criterion.proxy_impurity_improvement()
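The `(function pointer, void*)` pair introduced in patch 08 is the classic C callback idiom: the `void*` lets each condition carry its own parameter struct through a uniform signature, at the cost of a cast inside the callback. A standalone sketch of the pattern (all names hypothetical):

    # void_star_params.pyx -- illustrative only
    from libc.stdlib cimport malloc, free

    ctypedef void* Params
    ctypedef bint (*Condition)(int n, Params p) noexcept nogil

    cdef struct MinCountParams:
        int min_count

    cdef bint min_count_condition(int n, Params p) noexcept nogil:
        # Recover the typed view of the opaque parameter block.
        return n >= (<MinCountParams*>p).min_count

    def demo(int n):
        cdef MinCountParams* p = <MinCountParams*>malloc(sizeof(MinCountParams))
        if p == NULL:
            raise MemoryError()
        p.min_count = 5
        try:
            return min_count_condition(n, p)
        finally:
            free(p)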
From 2470d492c6cf52b5cad1bbeec7e272e56c4470cd Mon Sep 17 00:00:00 2001
From: scarliles
Date: Thu, 29 Feb 2024 11:32:42 -0800
Subject: [PATCH 09/29] some tidying

---
 sklearn/tree/_splitter.pxd | 21 ++++++++++++++++++---
 sklearn/tree/_splitter.pyx | 15 +++++++--------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 9eec9dd9afad8..6b20fec2a56dc 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -13,6 +13,7 @@ cimport numpy as cnp

 from libcpp.vector cimport vector
+from libc.stdlib cimport malloc

 from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t
 from ._utils cimport UINT32_t
@@ -20,7 +21,7 @@ from ._criterion cimport BaseCriterion, Criterion

 ctypedef void *SplitConditionParameters
-ctypedef bint (*SplitCondition)(Splitter splitter, void* split_condition_parameters) noexcept nogil
+ctypedef bint (*SplitCondition)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil

 cdef struct SplitConditionTuple:
     SplitCondition f
     SplitConditionParameters p

 cdef struct DummyParameters:
     int dummy

+cdef inline DummyParameters* create_dummy_parameters(int dummy):
+    cdef DummyParameters* result = <DummyParameters*>malloc(sizeof(DummyParameters))
+    if result == NULL:
+        return NULL
+    result.dummy = dummy
+    return result
+
 cdef struct Condition1Parameters:
     int some_number

+cdef inline Condition1Parameters* create_condition1_parameters(int some_number):
+    cdef Condition1Parameters* result = <Condition1Parameters*>malloc(sizeof(Condition1Parameters))
+    if result == NULL:
+        return NULL
+    result.some_number = some_number
+    return result
+
-cdef inline bint condition1(Splitter splitter, void* split_condition_parameters) noexcept nogil:
+cdef inline bint condition1(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
     cdef Condition1Parameters* p = <Condition1Parameters*>split_condition_parameters

     return splitter.n_samples > 0 and p.some_number < 1000

-cdef inline bint condition2(Splitter splitter, void* split_condition_parameters) noexcept nogil:
+cdef inline bint condition2(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
     return splitter.n_samples < 10

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index cc047ac605749..d6d191462bff3 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -19,7 +19,7 @@ from cython cimport final

 from libc.math cimport isnan
-from libc.stdlib cimport qsort, malloc, free
+from libc.stdlib cimport qsort, free
 from libc.string cimport memcpy

 cimport numpy as cnp
@@ -45,18 +45,17 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

 from ._tree cimport Tree
 cdef class FooTree(Tree):
+    cdef Splitter splitter
     cdef Condition1Parameters* c1p
     cdef DummyParameters* dummy_params

     def __init__(self):
-        splitter = Splitter()
-        self.c1p = <Condition1Parameters*>malloc(sizeof(Condition1Parameters))
-        self.c1p.some_number = 5
+        self.c1p = create_condition1_parameters(5)
+        self.dummy_params = create_dummy_parameters(0)

-        self.dummy_params = <DummyParameters*>malloc(sizeof(DummyParameters))
-
-        splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p))
-        splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params))
+        self.splitter = Splitter()
+        self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p))
+        self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params))

     def __dealloc__(self):
         if self.c1p is not NULL:

From ee3399faf3e2d01f0ccf05e3b7083fe7cbd287c6 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Thu, 29 Feb 2024 12:45:48 -0800
Subject: [PATCH 10/29] more tidying

---
 sklearn/tree/_splitter.pxd | 30 ++++++++++--------------------
 sklearn/tree/_splitter.pyx | 16 ++++++----------
 2 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 6b20fec2a56dc..1620d744d75c0 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -27,33 +27,23 @@ cdef struct SplitConditionTuple:
     SplitCondition f
     SplitConditionParameters p

-cdef struct DummyParameters:
-    int dummy
-
-cdef inline DummyParameters* create_dummy_parameters(int dummy):
-    cdef DummyParameters* result = <DummyParameters*>malloc(sizeof(DummyParameters))
-    if result == NULL:
-        return NULL
-    result.dummy = dummy
-    return result
+cdef inline bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
+    return splitter.n_samples < 10

-cdef struct Condition1Parameters:
-    int some_number
+cdef struct AlphaRegularityParameters:
+    float64_t alpha

-cdef inline Condition1Parameters* create_condition1_parameters(int some_number):
-    cdef Condition1Parameters* result = <Condition1Parameters*>malloc(sizeof(Condition1Parameters))
+cdef inline AlphaRegularityParameters* create_alpha_regularity_parameters(float64_t alpha):
+    cdef AlphaRegularityParameters* result = <AlphaRegularityParameters*>malloc(sizeof(AlphaRegularityParameters))
     if result == NULL:
         return NULL
-    result.some_number = some_number
+    result.alpha = alpha
     return result

-cdef inline bint condition1(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
-    cdef Condition1Parameters* p = <Condition1Parameters*>split_condition_parameters
+cdef inline bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
+    cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters

-    return splitter.n_samples > 0 and p.some_number < 1000
-
-cdef inline bint condition2(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
-    return splitter.n_samples < 10
+    return 1

 cdef struct SplitRecord:
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index d6d191462bff3..40c20dad96042 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -46,22 +46,18 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

 from ._tree cimport Tree
 cdef class FooTree(Tree):
     cdef Splitter splitter
-    cdef Condition1Parameters* c1p
-    cdef DummyParameters* dummy_params
+    cdef AlphaRegularityParameters* p_alpha

     def __init__(self):
-        self.c1p = create_condition1_parameters(5)
-        self.dummy_params = create_dummy_parameters(0)
+        self.p_alpha = create_alpha_regularity_parameters(0.2)

         self.splitter = Splitter()
-        self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p))
-        self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params))
+        self.splitter.presplit_conditions.push_back(SplitConditionTuple(alpha_regularity_condition, self.p_alpha))
+        self.splitter.presplit_conditions.push_back(SplitConditionTuple(has_data_condition, NULL))

     def __dealloc__(self):
-        if self.c1p is not NULL:
-            free(self.c1p)
-        if self.dummy_params is not NULL:
-            free(self.dummy_params)
+        if self.p_alpha is not NULL:
+            free(self.p_alpha)

From a079e4fdac4f24367686bb1398dcfa6bc2d7d115 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Sat, 9 Mar 2024 22:12:39 -0500
Subject: [PATCH 11/29] splitter injection refactoring

---
 sklearn/tree/_splitter.pxd | 25 +++---------
 sklearn/tree/_splitter.pyx | 80 ++++++++++++++++++++++++++++++--------
 2 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 1620d744d75c0..f552101ae40b2 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -20,30 +20,15 @@ from ._criterion cimport BaseCriterion, Criterion

-ctypedef void *SplitConditionParameters
-ctypedef bint (*SplitCondition)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil
+ctypedef void* SplitConditionParameters
+ctypedef bint (*SplitConditionFunction)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil

 cdef struct SplitConditionTuple:
-    SplitCondition f
+    SplitConditionFunction f
     SplitConditionParameters p

-cdef inline bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
-    return splitter.n_samples < 10
-
-cdef struct AlphaRegularityParameters:
-    float64_t alpha
-
-cdef inline AlphaRegularityParameters* create_alpha_regularity_parameters(float64_t alpha):
-    cdef AlphaRegularityParameters* result = <AlphaRegularityParameters*>malloc(sizeof(AlphaRegularityParameters))
-    if result == NULL:
-        return NULL
-    result.alpha = alpha
-    return result
-
-cdef inline bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
-    cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters
-
-    return 1
+cdef class SplitCondition:
+    cdef SplitConditionTuple t

 cdef struct SplitRecord:
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 40c20dad96042..22dbb995dd3f6 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -43,21 +43,56 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7
 # in SparsePartitioner
 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

+cdef struct HasDataParameters:
+    int min_samples
+
+cdef bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
+    cdef HasDataParameters* p = <HasDataParameters*>split_condition_parameters
+    return splitter.n_samples >= p.min_samples
+
+cdef class HasDataCondition(SplitCondition):
+    def __cinit__(self, int min_samples):
+        self.t.f = has_data_condition
+        self.t.p = malloc(sizeof(HasDataParameters))
+        (<HasDataParameters*>self.t.p).min_samples = min_samples
+
+    def __dealloc__(self):
+        if self.t.p is not NULL:
+            free(self.t.p)
+
+        super.__dealloc__(self)
+
+cdef struct AlphaRegularityParameters:
+    float64_t alpha
+
+cdef bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
+    cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters
+
+    return 1
+
+cdef class AlphaRegularityCondition(SplitCondition):
+    def __cinit__(self, float64_t alpha):
+        self.t.f = alpha_regularity_condition
+        self.t.p = malloc(sizeof(AlphaRegularityParameters))
+        (<AlphaRegularityParameters*>self.t.p).alpha = alpha
+
+    def __dealloc__(self):
+        if self.t.p is not NULL:
+            free(self.t.p)
+
+        super.__dealloc__(self)
+
 from ._tree cimport Tree
 cdef class FooTree(Tree):
     cdef Splitter splitter
-    cdef AlphaRegularityParameters* p_alpha

     def __init__(self):
-        self.p_alpha = create_alpha_regularity_parameters(0.2)
-
-        self.splitter = Splitter()
-        self.splitter.presplit_conditions.push_back(SplitConditionTuple(alpha_regularity_condition, self.p_alpha))
-        self.splitter.presplit_conditions.push_back(SplitConditionTuple(has_data_condition, NULL))
-
-    def __dealloc__(self):
-        if self.p_alpha is not NULL:
-            free(self.p_alpha)
+        self.splitter = Splitter(
+            presplit_conditions = [HasDataCondition(10)],
+            postsplit_conditions = [AlphaRegularityCondition(0.1)],
+        )

 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:
@@ -200,6 +207,8 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const cnp.int8_t[:] monotonic_cst,
+        SplitCondition[:] presplit_conditions,
+        SplitCondition[:] postsplit_conditions,
         *argv
     ):
         """
@@ -212,6 +249,14 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

+        if presplit_conditions is not None:
+            for condition in presplit_conditions:
+                self.presplit_conditions.push_back((<SplitCondition>condition).t)
+
+        if postsplit_conditions is not None:
+            for condition in postsplit_conditions:
+                self.postsplit_conditions.push_back((<SplitCondition>condition).t)
+
     def __reduce__(self):
         return (type(self), (self.criterion,
@@ -618,13 +663,14 @@ cdef inline intp_t node_split_best(
             else:
                 n_left = current_split.pos - splitter.start
                 n_right = end_non_missing - current_split.pos + n_missing
-            if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
-                continue
-
+
             for condition in splitter.presplit_conditions:
                 if not condition.f(splitter, condition.p):
                     continue

+            if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
+                continue
+
             criterion.update(current_split.pos)

             # Reject if monotonicity constraints are not satisfied
             if (
                 with_monotonic_cst and
                 monotonic_cst[current_split.feature] != 0 and
                 not criterion.check_monotonicity(
                     monotonic_cst[current_split.feature],
                     lower_bound,
                     upper_bound,
                 )
             ):
                 continue

-            # Reject if min_weight_leaf is not satisfied
-            if splitter.check_postsplit_conditions() == 1:
-                continue
-
             for condition in splitter.postsplit_conditions:
                 if not condition.f(splitter, condition.p):
                     continue

+            # Reject if min_weight_leaf is not satisfied
+            if splitter.check_postsplit_conditions() == 1:
+                continue
+
             current_proxy_improvement = criterion.proxy_impurity_improvement()
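Patch 11 wraps each `(f, p)` tuple in a cdef extension class so that `__cinit__`/`__dealloc__` tie the lifetime of the heap-allocated parameter struct to a Python object. One caveat in the patch as written: `super.__dealloc__(self)` references the `super` builtin without calling it and would raise at deallocation time (where exceptions are ignored); Cython already chains a parent's `__dealloc__` automatically, so the pattern reduces to the following sketch (hypothetical names):

    # condition_classes.pyx -- illustrative only
    from libc.stdlib cimport malloc, free

    ctypedef void* Params
    ctypedef bint (*ConditionFunction)(int n, Params p) noexcept nogil

    cdef struct ConditionTuple:
        ConditionFunction f
        Params p

    cdef class Condition:
        cdef ConditionTuple t

    cdef struct MinCountParams:
        int min_count

    cdef bint min_count_condition(int n, Params p) noexcept nogil:
        return n >= (<MinCountParams*>p).min_count

    cdef class MinCountCondition(Condition):
        def __cinit__(self, int min_count):
            self.t.f = min_count_condition
            self.t.p = malloc(sizeof(MinCountParams))
            if self.t.p == NULL:
                raise MemoryError()
            (<MinCountParams*>self.t.p).min_count = min_count

        def __dealloc__(self):
            # Cython calls any parent __dealloc__ automatically; freeing our
            # own allocation is all that is needed here.
            if self.t.p is not NULL:
                free(self.t.p)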
From 5397b666fe21025c113d30e8eb39c50556b0fca7 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Fri, 15 Mar 2024 17:46:16 -0400
Subject: [PATCH 12/29] cython injection due diligence, converted min_sample
 and monotonic_cst to injections

---
 sklearn/tree/_splitter.pxd |  22 ++++-
 sklearn/tree/_splitter.pyx | 191 +++++++++++++++++++++++++++++--------
 2 files changed, 173 insertions(+), 40 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index f552101ae40b2..9a400f3954b13 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -6,6 +6,7 @@
 #          Jacob Schreiber
 #          Adam Li
 #          Jong Shin
+#          Samuel Carliles
 #
 # License: BSD 3 clause
@@ -20,8 +21,27 @@ from ._utils cimport UINT32_t

 from ._criterion cimport BaseCriterion, Criterion

+# NICE IDEAS THAT DON'T APPEAR POSSIBLE
+# - accessing elements of a memory view of cython extension types in a nogil block/function
+# - storing cython extension types in cpp vectors
+#
+# despite the fact that we can access scalar extension type properties in such a context,
+# as for instance node_split_best does with Criterion and Partition,
+# and we can access the elements of a memory view of primitive types in such a context
+#
+# SO WHERE DOES THAT LEAVE US
+# - we can transform these into cpp vectors of structs
+# and with some minor casting irritations everything else works ok
 ctypedef void* SplitConditionParameters
-ctypedef bint (*SplitConditionFunction)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil
+ctypedef bint (*SplitConditionFunction)(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionParameters split_condition_parameters
+) noexcept nogil

 cdef struct SplitConditionTuple:
     SplitConditionFunction f
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 22dbb995dd3f6..bb21548ef4b31 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -44,10 +44,99 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7
 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1

+cdef bint min_sample_leaf_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionParameters split_condition_parameters
+) noexcept nogil:
+    cdef intp_t min_samples_leaf = splitter.min_samples_leaf
+    cdef intp_t end_non_missing = splitter.end - n_missing
+    cdef intp_t n_left, n_right
+
+    if missing_go_to_left:
+        n_left = current_split.pos - splitter.start + n_missing
+        n_right = end_non_missing - current_split.pos
+    else:
+        n_left = current_split.pos - splitter.start
+        n_right = end_non_missing - current_split.pos + n_missing
+
+    # Reject if min_samples_leaf is not guaranteed
+    if n_left < min_samples_leaf or n_right < min_samples_leaf:
+        return 0
+
+    return 1
+
+cdef class MinSamplesLeafCondition(SplitCondition):
+    def __cinit__(self):
+        self.t.f = min_sample_leaf_condition
+        self.t.p = NULL  # min_samples is stored in splitter, which is already passed to f
+
+cdef bint min_weight_leaf_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionParameters split_condition_parameters
+) noexcept nogil:
+    cdef float64_t min_weight_leaf = splitter.min_weight_leaf
+
+    # Reject if min_weight_leaf is not satisfied
+    if ((splitter.criterion.weighted_n_left < min_weight_leaf) or
+            (splitter.criterion.weighted_n_right < min_weight_leaf)):
+        return 0
+
+    return 1
+
+cdef class MinWeightLeafCondition(SplitCondition):
+    def __cinit__(self):
+        self.t.f = min_weight_leaf_condition
+        self.t.p = NULL  # min_weight_leaf is stored in splitter, which is already passed to f
+
+cdef bint monotonic_constraint_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionParameters split_condition_parameters
+) noexcept nogil:
+    if (
+        splitter.with_monotonic_cst and
+        splitter.monotonic_cst[current_split.feature] != 0 and
+        not splitter.criterion.check_monotonicity(
+            splitter.monotonic_cst[current_split.feature],
+            lower_bound,
+            upper_bound,
+        )
+    ):
+        return 0
+
+    return 1
+
+cdef class MonotonicConstraintCondition(SplitCondition):
+    def __cinit__(self):
+        self.t.f = monotonic_constraint_condition
+        self.t.p = NULL
+
 cdef struct HasDataParameters:
     int min_samples

-cdef bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
+cdef bint has_data_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionParameters split_condition_parameters
+) noexcept nogil:
     cdef HasDataParameters* p = <HasDataParameters*>split_condition_parameters
     return splitter.n_samples >= p.min_samples

 cdef class HasDataCondition(SplitCondition):
@@ -155,7 +244,15 @@ cdef class HasDataCondition(SplitCondition):
 cdef struct AlphaRegularityParameters:
     float64_t alpha

-cdef bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil:
+cdef bint alpha_regularity_condition(
+    Splitter splitter,
+    SplitRecord* current_split,
+    intp_t n_missing,
+    bint missing_go_to_left,
+    float64_t lower_bound,
+    float64_t upper_bound,
+    SplitConditionParameters split_condition_parameters
+) noexcept nogil:
     cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters

     return 1
@@ -304,8 +346,24 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

+        self.min_samples_leaf_condition = MinSamplesLeafCondition()
+        self.min_weight_leaf_condition = MinWeightLeafCondition()
+
+        self.presplit_conditions.push_back((<SplitCondition>self.min_samples_leaf_condition).t)
         if presplit_conditions is not None:
             for condition in presplit_conditions:
                 self.presplit_conditions.push_back((<SplitCondition>condition).t)

+        self.postsplit_conditions.push_back((<SplitCondition>self.min_weight_leaf_condition).t)
         if postsplit_conditions is not None:
             for condition in postsplit_conditions:
                 self.postsplit_conditions.push_back((<SplitCondition>condition).t)

+        if(self.with_monotonic_cst):
+            self.monotonic_constraint_condition = MonotonicConstraintCondition()
+            self.presplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
+            self.postsplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
+
     def __reduce__(self):
         return (type(self), (self.criterion,
@@ -644,54 +751,60 @@ cdef inline intp_t node_split_best(

                 current_split.pos = p

-                # Reject if monotonicity constraints are not satisfied
-                if (
-                    with_monotonic_cst and
-                    monotonic_cst[current_split.feature] != 0 and
-                    not criterion.check_monotonicity(
-                        monotonic_cst[current_split.feature],
-                        lower_bound,
-                        upper_bound,
-                    )
-                ):
-                    continue
-
-                # Reject if min_samples_leaf is not guaranteed
-                if missing_go_to_left:
-                    n_left = current_split.pos - splitter.start + n_missing
-                    n_right = end_non_missing - current_split.pos
-                else:
-                    n_left = current_split.pos - splitter.start
-                    n_right = end_non_missing - current_split.pos + n_missing
+                # # Reject if monotonicity constraints are not satisfied
+                # if (
+                #     with_monotonic_cst and
+                #     monotonic_cst[current_split.feature] != 0 and
+                #     not criterion.check_monotonicity(
+                #         monotonic_cst[current_split.feature],
+                #         lower_bound,
+                #         upper_bound,
+                #     )
+                # ):
+                #     continue
+
+                # # Reject if min_samples_leaf is not guaranteed
+                # if missing_go_to_left:
+                #     n_left = current_split.pos - splitter.start + n_missing
+                #     n_right = end_non_missing - current_split.pos
+                # else:
+                #     n_left = current_split.pos - splitter.start
+                #     n_right = end_non_missing - current_split.pos + n_missing

                 for condition in splitter.presplit_conditions:
-                    if not condition.f(splitter, condition.p):
+                    if not condition.f(
+                        splitter, &current_split, n_missing, missing_go_to_left,
+                        lower_bound, upper_bound, condition.p
+                    ):
                         continue

-                if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
-                    continue
+                # if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
+                #     continue

                 criterion.update(current_split.pos)

-                # Reject if monotonicity constraints are not satisfied
-                if (
-                    with_monotonic_cst and
-                    monotonic_cst[current_split.feature] != 0 and
-                    not criterion.check_monotonicity(
-                        monotonic_cst[current_split.feature],
-                        lower_bound,
-                        upper_bound,
-                    )
-                ):
-                    continue
+                # # Reject if monotonicity constraints are not satisfied
+                # if (
+                #     with_monotonic_cst and
+                #     monotonic_cst[current_split.feature] != 0 and
+                #     not criterion.check_monotonicity(
+                #         monotonic_cst[current_split.feature],
+                #         lower_bound,
+                #         upper_bound,
+                #     )
+                # ):
+                #     continue

                 for condition in splitter.postsplit_conditions:
-                    if not condition.f(splitter, condition.p):
+                    if not condition.f(
+                        splitter, &current_split, n_missing, missing_go_to_left,
+                        lower_bound, upper_bound, condition.p
+                    ):
                         continue

-                # Reject if min_weight_leaf is not satisfied
-                if splitter.check_postsplit_conditions() == 1:
-                    continue
+                # # Reject if min_weight_leaf is not satisfied
+                # if splitter.check_postsplit_conditions() == 1:
+                #     continue

                 current_proxy_improvement = criterion.proxy_impurity_improvement()
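With the widened signature of patch 12, a condition sees the proposed split position plus the missing-value routing and monotonicity bounds, so the built-ins like `min_sample_leaf_condition` can reproduce the previously inline checks exactly. The left/right bookkeeping is easy to sanity-check in pure Python (the numbers below are made up for illustration):

    # Pure-Python check of the n_left / n_right arithmetic used by
    # min_sample_leaf_condition.
    def leaf_sizes(start, end, pos, n_missing, missing_go_to_left):
        end_non_missing = end - n_missing
        if missing_go_to_left:
            return pos - start + n_missing, end_non_missing - pos
        return pos - start, end_non_missing - pos + n_missing

    assert leaf_sizes(0, 100, 40, 10, True) == (50, 50)   # missing samples go left
    assert leaf_sizes(0, 100, 40, 10, False) == (40, 60)  # missing samples go right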
From 44f1d570fd0ba0503737c3f705e83f2ec7b8836a Mon Sep 17 00:00:00 2001
From: scarliles
Date: Mon, 18 Mar 2024 14:53:58 -0400
Subject: [PATCH 13/29] tree tests pass huzzah!

---
 sklearn/tree/_splitter.pxd |  4 ++++
 sklearn/tree/_splitter.pyx | 36 ++++++++++++++++++++------------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 9a400f3954b13..0edd4eb40231c 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -144,6 +144,10 @@ cdef class Splitter(BaseSplitter):
     cdef const cnp.int8_t[:] monotonic_cst
     cdef bint with_monotonic_cst

+    cdef SplitCondition min_samples_leaf_condition
+    cdef SplitCondition min_weight_leaf_condition
+    cdef SplitCondition monotonic_constraint_condition
+
     cdef vector[SplitConditionTuple] presplit_conditions
     cdef vector[SplitConditionTuple] postsplit_conditions

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index bb21548ef4b31..983a6f89b4a43 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -66,9 +66,9 @@ cdef bint min_sample_leaf_condition(
     # Reject if min_samples_leaf is not guaranteed
     if n_left < min_samples_leaf or n_right < min_samples_leaf:
-        return 0
+        return False

-    return 1
+    return True

 cdef class MinSamplesLeafCondition(SplitCondition):
     def __cinit__(self):
@@ -89,9 +89,9 @@ cdef bint min_weight_leaf_condition(
     # Reject if min_weight_leaf is not satisfied
     if ((splitter.criterion.weighted_n_left < min_weight_leaf) or
             (splitter.criterion.weighted_n_right < min_weight_leaf)):
-        return 0
+        return False

-    return 1
+    return True

 cdef class MinWeightLeafCondition(SplitCondition):
     def __cinit__(self):
@@ -116,9 +116,9 @@ cdef bint monotonic_constraint_condition(
             upper_bound,
         )
     ):
-        return 0
+        return False

-    return 1
+    return True

 cdef class MonotonicConstraintCondition(SplitCondition):
     def __cinit__(self):
@@ -166,7 +166,7 @@ cdef bint alpha_regularity_condition(
 ) noexcept nogil:
     cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters

-    return 1
+    return True

 cdef class AlphaRegularityCondition(SplitCondition):
     def __cinit__(self, float64_t alpha):
@@ -304,8 +304,8 @@ cdef class Splitter(BaseSplitter):
         float64_t min_weight_leaf,
         object random_state,
         const cnp.int8_t[:] monotonic_cst,
-        SplitCondition[:] presplit_conditions,
-        SplitCondition[:] postsplit_conditions,
+        SplitCondition[:] presplit_conditions = None,
+        SplitCondition[:] postsplit_conditions = None,
         *argv
     ):
         """
@@ -657,6 +657,8 @@ cdef inline intp_t node_split_best(
     # n_total_constants = n_known_constants + n_found_constants
     cdef intp_t n_total_constants = n_known_constants

+    cdef bint conditions_hold = True
+
     _init_split(&best_split, end)

     partitioner.init_node_split(start, end)
@@ -771,12 +773,17 @@ cdef inline intp_t node_split_best(
                 #     n_left = current_split.pos - splitter.start
                 #     n_right = end_non_missing - current_split.pos + n_missing

+                conditions_hold = True
                 for condition in splitter.presplit_conditions:
                     if not condition.f(
                         splitter, &current_split, n_missing, missing_go_to_left,
                         lower_bound, upper_bound, condition.p
                     ):
-                        continue
+                        conditions_hold = False
+                        break
+
+                if not conditions_hold:
+                    continue

                 # if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1:
                 #     continue
@@ -795,13 +802,18 @@ cdef inline intp_t node_split_best(
                 # ):
                 #     continue

+                conditions_hold = True
                 for condition in splitter.postsplit_conditions:
                     if not condition.f(
                         splitter, &current_split, n_missing, missing_go_to_left,
                         lower_bound, upper_bound, condition.p
                     ):
-                        continue
+                        conditions_hold = False
+                        break
+
+                if not conditions_hold:
+                    continue

                 # # Reject if min_weight_leaf is not satisfied
                 # if splitter.check_postsplit_conditions() == 1:
                 #     continue

From 4f19d53c1a57fd2e37739d5028f550eb5ba88ba4 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Mon, 18 Mar 2024 16:19:33 -0400
Subject: [PATCH 14/29] added some splitconditions to header

---
 sklearn/tree/_splitter.pxd | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 0edd4eb40231c..6c9d0d676142a 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -50,6 +50,15 @@ cdef struct SplitConditionTuple:
 cdef class SplitCondition:
     cdef SplitConditionTuple t

+cdef class MinSamplesLeafCondition(SplitCondition):
+    pass
+
+cdef class MinWeightLeafCondition(SplitCondition):
+    pass
+
+cdef class MonotonicConstraintCondition(SplitCondition):
+    pass
+
 cdef struct SplitRecord:
     # Data to track sample split
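Declaring the concrete condition classes in the `.pxd` with empty `pass` bodies is what makes them cimportable: the header fixes the C-level layout, while all behavior stays in the `.pyx`. Schematically, under hypothetical module names:

    # conditions.pxd -- header only; fixes the C layout of each class
    ctypedef void* Params
    ctypedef bint (*ConditionFunction)(int n, Params p) noexcept nogil

    cdef struct ConditionTuple:
        ConditionFunction f
        Params p

    cdef class Condition:
        cdef ConditionTuple t

    # The empty body only declares the type; __cinit__/__dealloc__ and the
    # condition function itself live in conditions.pyx.
    cdef class MinCountCondition(Condition):
        pass

Any other Cython module (for example a tree implementation) can then `from conditions cimport MinCountCondition` and construct or subclass it at the C level.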
From cb71be0cdb8be46b19bbdd91d6c5da4897359ff3 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Thu, 21 Mar 2024 10:33:33 -0400
Subject: [PATCH 15/29] commented out some sample code that was substantially
 increasing peak memory utilization in asv

---
 sklearn/tree/_splitter.pyx | 116 ++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 983a6f89b4a43..6b0a6950b7739 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -125,71 +125,71 @@ cdef class MonotonicConstraintCondition(SplitCondition):
         self.t.f = monotonic_constraint_condition
         self.t.p = NULL

-cdef struct HasDataParameters:
-    int min_samples
-
-cdef bint has_data_condition(
-    Splitter splitter,
-    SplitRecord* current_split,
-    intp_t n_missing,
-    bint missing_go_to_left,
-    float64_t lower_bound,
-    float64_t upper_bound,
-    SplitConditionParameters split_condition_parameters
-) noexcept nogil:
-    cdef HasDataParameters* p = <HasDataParameters*>split_condition_parameters
-    return splitter.n_samples >= p.min_samples
-
-cdef class HasDataCondition(SplitCondition):
-    def __cinit__(self, int min_samples):
-        self.t.f = has_data_condition
-        self.t.p = malloc(sizeof(HasDataParameters))
-        (<HasDataParameters*>self.t.p).min_samples = min_samples
-
-    def __dealloc__(self):
-        if self.t.p is not NULL:
-            free(self.t.p)
-
-        super.__dealloc__(self)
-
-cdef struct AlphaRegularityParameters:
-    float64_t alpha
-
-cdef bint alpha_regularity_condition(
-    Splitter splitter,
-    SplitRecord* current_split,
-    intp_t n_missing,
-    bint missing_go_to_left,
-    float64_t lower_bound,
-    float64_t upper_bound,
-    SplitConditionParameters split_condition_parameters
-) noexcept nogil:
-    cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters
-
-    return True
-
-cdef class AlphaRegularityCondition(SplitCondition):
-    def __cinit__(self, float64_t alpha):
-        self.t.f = alpha_regularity_condition
-        self.t.p = malloc(sizeof(AlphaRegularityParameters))
-        (<AlphaRegularityParameters*>self.t.p).alpha = alpha
-
-    def __dealloc__(self):
-        if self.t.p is not NULL:
-            free(self.t.p)
-
-        super.__dealloc__(self)
-
-from ._tree cimport Tree
-cdef class FooTree(Tree):
-    cdef Splitter splitter
-
-    def __init__(self):
-        self.splitter = Splitter(
-            presplit_conditions = [HasDataCondition(10)],
-            postsplit_conditions = [AlphaRegularityCondition(0.1)],
-        )
+# cdef struct HasDataParameters:
+#     int min_samples
+
+# cdef bint has_data_condition(
+#     Splitter splitter,
+#     SplitRecord* current_split,
+#     intp_t n_missing,
+#     bint missing_go_to_left,
+#     float64_t lower_bound,
+#     float64_t upper_bound,
+#     SplitConditionParameters split_condition_parameters
+# ) noexcept nogil:
+#     cdef HasDataParameters* p = <HasDataParameters*>split_condition_parameters
+#     return splitter.n_samples >= p.min_samples

+# cdef class HasDataCondition(SplitCondition):
+#     def __cinit__(self, int min_samples):
+#         self.t.f = has_data_condition
+#         self.t.p = malloc(sizeof(HasDataParameters))
+#         (<HasDataParameters*>self.t.p).min_samples = min_samples

+#     def __dealloc__(self):
+#         if self.t.p is not NULL:
+#             free(self.t.p)

+#         super.__dealloc__(self)

+# cdef struct AlphaRegularityParameters:
+#     float64_t alpha

+# cdef bint alpha_regularity_condition(
+#     Splitter splitter,
+#     SplitRecord* current_split,
+#     intp_t n_missing,
+#     bint missing_go_to_left,
+#     float64_t lower_bound,
+#     float64_t upper_bound,
+#     SplitConditionParameters split_condition_parameters
+# ) noexcept nogil:
+#     cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters

+#     return True

+# cdef class AlphaRegularityCondition(SplitCondition):
+#     def __cinit__(self, float64_t alpha):
+#         self.t.f = alpha_regularity_condition
+#         self.t.p = malloc(sizeof(AlphaRegularityParameters))
+#         (<AlphaRegularityParameters*>self.t.p).alpha = alpha

+#     def __dealloc__(self):
+#         if self.t.p is not NULL:
+#             free(self.t.p)

+#         super.__dealloc__(self)

+# from ._tree cimport Tree
+# cdef class FooTree(Tree):
+#     cdef Splitter splitter

+#     def __init__(self):
+#         self.splitter = Splitter(
+#             presplit_conditions = [HasDataCondition(10)],
+#             postsplit_conditions = [AlphaRegularityCondition(0.1)],
+#         )

 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil:

From e34be5c58a6f26ed38634b2a7b53a95ed0aabe67 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Tue, 9 Apr 2024 15:05:29 -0400
Subject: [PATCH 16/29] added vector resize

---
 sklearn/tree/_splitter.pyx | 43 ++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 6b0a6950b7739..80cf902c5af07 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -349,20 +349,41 @@ cdef class Splitter(BaseSplitter):
         self.min_samples_leaf_condition = MinSamplesLeafCondition()
         self.min_weight_leaf_condition = MinWeightLeafCondition()

-        self.presplit_conditions.push_back((<SplitCondition>self.min_samples_leaf_condition).t)
-        if presplit_conditions is not None:
-            for condition in presplit_conditions:
-                self.presplit_conditions.push_back((<SplitCondition>condition).t)
-
-        self.postsplit_conditions.push_back((<SplitCondition>self.min_weight_leaf_condition).t)
-        if postsplit_conditions is not None:
-            for condition in postsplit_conditions:
-                self.postsplit_conditions.push_back((<SplitCondition>condition).t)
-
-        if(self.with_monotonic_cst):
-            self.monotonic_constraint_condition = MonotonicConstraintCondition()
-            self.presplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
-            self.postsplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
+        self.presplit_conditions.resize(
+            (len(presplit_conditions) if presplit_conditions is not None else 0)
+            + (2 if self.with_monotonic_cst else 1)
+        )
+        self.postsplit_conditions.resize(
+            (len(postsplit_conditions) if postsplit_conditions is not None else 0)
+            + (2 if self.with_monotonic_cst else 1)
+        )
+
+        offset = 0
+        self.presplit_conditions[offset] = self.min_samples_leaf_condition.t
+        self.postsplit_conditions[offset] = self.min_weight_leaf_condition.t
+        offset += 1
+
+        if(self.with_monotonic_cst):
+            self.monotonic_constraint_condition = MonotonicConstraintCondition()
+            # self.presplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
+            # self.postsplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
+            self.presplit_conditions[offset] = self.monotonic_constraint_condition.t
+            self.postsplit_conditions[offset] = self.monotonic_constraint_condition.t
+            offset += 1
+
+        # self.presplit_conditions.push_back((<SplitCondition>self.min_samples_leaf_condition).t)
+        if presplit_conditions is not None:
+            # for condition in presplit_conditions:
+            #     self.presplit_conditions.push_back((<SplitCondition>condition).t)
+            for i in range(len(presplit_conditions)):
+                self.presplit_conditions[i + offset] = presplit_conditions[i].t
+
+        # self.postsplit_conditions.push_back((<SplitCondition>self.min_weight_leaf_condition).t)
+        if postsplit_conditions is not None:
+            # for condition in postsplit_conditions:
+            #     self.postsplit_conditions.push_back((<SplitCondition>condition).t)
+            for i in range(len(postsplit_conditions)):
+                self.postsplit_conditions[i + offset] = postsplit_conditions[i].t
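`resize` followed by indexed assignment replaces the earlier `push_back` calls: the vector is sized exactly once up front, and the built-in conditions occupy the leading slots before any user-supplied ones. The idiom in isolation (hypothetical names; requires a C++ build):

    # distutils: language = c++
    # resize_sketch.pyx -- illustrative only
    from libcpp.vector cimport vector

    cdef struct Entry:
        int a
        int b

    def fill(int n):
        cdef vector[Entry] v
        v.resize(n)                   # single allocation up front
        cdef int i
        for i in range(n):
            v[i] = Entry(i, i * i)    # indexed assignment into pre-sized storage
        return v.size()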
From aac802e5d1cc4710dfb63ea14b9ef02a58da6a64 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Wed, 10 Apr 2024 15:10:43 -0400
Subject: [PATCH 17/29] wip

---
 sklearn/tree/_splitter.pyx | 92 +++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 35 deletions(-)

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 80cf902c5af07..0afe0afe52ad6 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -19,6 +19,7 @@ from cython cimport final

 from libc.math cimport isnan
+from libc.stdint cimport uintptr_t
 from libc.stdlib cimport qsort, free
 from libc.string cimport memcpy
 cimport numpy as cnp
@@ -346,44 +347,65 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

-        self.min_samples_leaf_condition = MinSamplesLeafCondition()
-        self.min_weight_leaf_condition = MinWeightLeafCondition()
+        self._presplit_conditions = presplit_conditions
+        self._postsplit_conditions = postsplit_conditions

-        self.presplit_conditions.resize(
-            (len(presplit_conditions) if presplit_conditions is not None else 0)
-            + (2 if self.with_monotonic_cst else 1)
-        )
-        self.postsplit_conditions.resize(
-            (len(postsplit_conditions) if postsplit_conditions is not None else 0)
-            + (2 if self.with_monotonic_cst else 1)
-        )
+        self._presplit_conditions.append(MinSamplesLeafCondition())
+        self._postsplit_conditions.append(MinWeightLeafCondition())

-        offset = 0
-        self.presplit_conditions[offset] = self.min_samples_leaf_condition.t
-        self.postsplit_conditions[offset] = self.min_weight_leaf_condition.t
-        offset += 1
+        if self.with_monotonic_cst:
+            self._presplit_conditions.append(MonotonicConstraintCondition())
+            self._postsplit_conditions.append(MonotonicConstraintCondition())
+
+        self.presplit_conditions.resize(len(self._presplit_conditions))
+        self.postsplit_conditions.resize(len(self._postsplit_conditions))

-        if(self.with_monotonic_cst):
-            self.monotonic_constraint_condition = MonotonicConstraintCondition()
-            # self.presplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
-            # self.postsplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
-            self.presplit_conditions[offset] = self.monotonic_constraint_condition.t
-            self.postsplit_conditions[offset] = self.monotonic_constraint_condition.t
-            offset += 1
-
-        # self.presplit_conditions.push_back((<SplitCondition>self.min_samples_leaf_condition).t)
-        if presplit_conditions is not None:
-            # for condition in presplit_conditions:
-            #     self.presplit_conditions.push_back((<SplitCondition>condition).t)
-            for i in range(len(presplit_conditions)):
-                self.presplit_conditions[i + offset] = presplit_conditions[i].t
-
-        # self.postsplit_conditions.push_back((<SplitCondition>self.min_weight_leaf_condition).t)
-        if postsplit_conditions is not None:
-            # for condition in postsplit_conditions:
-            #     self.postsplit_conditions.push_back((<SplitCondition>condition).t)
-            for i in range(len(postsplit_conditions)):
-                self.postsplit_conditions[i + offset] = postsplit_conditions[i].t
+        for i in range(len(self._presplit_conditions)):
+            self.presplit_conditions[i].f = self._presplit_conditions[i].t.f
+            self.presplit_conditions[i].p = self._presplit_conditions[i].t.p
+
+        for i in range(len(self._postsplit_conditions)):
+            self.postsplit_conditions[i].f = self._postsplit_conditions[i].t.f
+            self.postsplit_conditions[i].p = self._postsplit_conditions[i].t.p
+
+        # self.min_samples_leaf_condition = MinSamplesLeafCondition()
+        # self.min_weight_leaf_condition = MinWeightLeafCondition()
+
+        # self.presplit_conditions.resize(
+        #     (len(presplit_conditions) if presplit_conditions is not None else 0)
+        #     + (2 if self.with_monotonic_cst else 1)
+        # )
+        # self.postsplit_conditions.resize(
+        #     (len(postsplit_conditions) if postsplit_conditions is not None else 0)
+        #     + (2 if self.with_monotonic_cst else 1)
+        # )
+
+        # offset = 0
+        # self.presplit_conditions[offset] = self.min_samples_leaf_condition.t
+        # self.postsplit_conditions[offset] = self.min_weight_leaf_condition.t
+        # offset += 1
+
+        # if(self.with_monotonic_cst):
+        #     self.monotonic_constraint_condition = MonotonicConstraintCondition()
+        #     # self.presplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
+        #     # self.postsplit_conditions.push_back((<SplitCondition>self.monotonic_constraint_condition).t)
+        #     self.presplit_conditions[offset] = self.monotonic_constraint_condition.t
+        #     self.postsplit_conditions[offset] = self.monotonic_constraint_condition.t
+        #     offset += 1
+
+        # # self.presplit_conditions.push_back((<SplitCondition>self.min_samples_leaf_condition).t)
+        # if presplit_conditions is not None:
+        #     # for condition in presplit_conditions:
+        #     #     self.presplit_conditions.push_back((<SplitCondition>condition).t)
+        #     for i in range(len(presplit_conditions)):
+        #         self.presplit_conditions[i + offset] = presplit_conditions[i].t
+
+        # # self.postsplit_conditions.push_back((<SplitCondition>self.min_weight_leaf_condition).t)
+        # if postsplit_conditions is not None:
+        #     # for condition in postsplit_conditions:
+        #     #     self.postsplit_conditions.push_back((<SplitCondition>condition).t)
+        #     for i in range(len(postsplit_conditions)):
+        #         self.postsplit_conditions[i + offset] = postsplit_conditions[i].t

From 7a70a0b6e076bd7e4f54674ea2148697f80916f4 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Mon, 15 Apr 2024 14:13:27 -0400
Subject: [PATCH 18/29] settling injection memory management for now

---
 sklearn/tree/_splitter.pyx | 81 ++++++++++++--------------------------
 1 file changed, 26 insertions(+), 55 deletions(-)

diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 2143aa3a5d742..ff707817d3d60 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -340,65 +340,36 @@ cdef class Splitter(BaseSplitter):
         self.monotonic_cst = monotonic_cst
         self.with_monotonic_cst = monotonic_cst is not None

-        self._presplit_conditions = presplit_conditions
-        self._postsplit_conditions = postsplit_conditions
-
-        self._presplit_conditions.append(MinSamplesLeafCondition())
-        self._postsplit_conditions.append(MinWeightLeafCondition())
-
-        if self.with_monotonic_cst:
-            self._presplit_conditions.append(MonotonicConstraintCondition())
-            self._postsplit_conditions.append(MonotonicConstraintCondition())
-
-        self.presplit_conditions.resize(len(self._presplit_conditions))
-        self.postsplit_conditions.resize(len(self._postsplit_conditions))
-
-        for i in range(len(self._presplit_conditions)):
-            self.presplit_conditions[i].f = self._presplit_conditions[i].t.f
-            self.presplit_conditions[i].p = self._presplit_conditions[i].t.p
-
-        for i in range(len(self._postsplit_conditions)):
-            self.postsplit_conditions[i].f = self._postsplit_conditions[i].t.f
-            self.postsplit_conditions[i].p = self._postsplit_conditions[i].t.p
-
-        # self.min_samples_leaf_condition = MinSamplesLeafCondition()
-        # self.min_weight_leaf_condition = MinWeightLeafCondition()
-
-        # self.presplit_conditions.resize(
-        #     (len(presplit_conditions) if presplit_conditions is not None else 0)
-        #     + (2 if self.with_monotonic_cst else 1)
-        # )
-        # self.postsplit_conditions.resize(
-        #     (len(postsplit_conditions) if postsplit_conditions is not None else 0)
-        #     + (2 if self.with_monotonic_cst else 1)
-        # )
-
-        # offset = 0
-        # self.presplit_conditions[offset] = self.min_samples_leaf_condition.t
-        # self.postsplit_conditions[offset] = self.min_weight_leaf_condition.t
-        # offset += 1
-
-        # if(self.with_monotonic_cst):
-        #     self.monotonic_constraint_condition = MonotonicConstraintCondition()
-        #     self.presplit_conditions[offset] = self.monotonic_constraint_condition.t
-        #     self.postsplit_conditions[offset] = self.monotonic_constraint_condition.t
-        #     offset += 1
-
-        # if presplit_conditions is not None:
-        #     for i in range(len(presplit_conditions)):
-        #         self.presplit_conditions[i + offset] = presplit_conditions[i].t
-
-        # if postsplit_conditions is not None:
-        #     for i in range(len(postsplit_conditions)):
-        #         self.postsplit_conditions[i + offset] = postsplit_conditions[i].t
+        self.min_samples_leaf_condition = MinSamplesLeafCondition()
+        self.min_weight_leaf_condition = MinWeightLeafCondition()
+
+        self.presplit_conditions.resize(
+            (len(presplit_conditions) if presplit_conditions is not None else 0)
+            + (2 if self.with_monotonic_cst else 1)
+        )
+        self.postsplit_conditions.resize(
+            (len(postsplit_conditions) if postsplit_conditions is not None else 0)
+            + (2 if self.with_monotonic_cst else 1)
+        )
+
+        offset = 0
+        self.presplit_conditions[offset] = self.min_samples_leaf_condition.t
+        self.postsplit_conditions[offset] = self.min_weight_leaf_condition.t
+        offset += 1
+
+        if(self.with_monotonic_cst):
+            self.monotonic_constraint_condition = MonotonicConstraintCondition()
+            self.presplit_conditions[offset] = self.monotonic_constraint_condition.t
+            self.postsplit_conditions[offset] = self.monotonic_constraint_condition.t
+            offset += 1
+
+        if presplit_conditions is not None:
+            for i in range(len(presplit_conditions)):
+                self.presplit_conditions[i + offset] = presplit_conditions[i].t
+
+        if postsplit_conditions is not None:
+            for i in range(len(postsplit_conditions)):
+                self.postsplit_conditions[i + offset] = postsplit_conditions[i].t

     def __reduce__(self):
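The constructor logic the series settles on composes the condition tables as: slot 0 holds the built-in `min_samples_leaf` (pre-split) or `min_weight_leaf` (post-split) check, an optional monotonicity slot follows, and caller-supplied conditions fill the remainder, matching the `resize(... + (2 if self.with_monotonic_cst else 1))` arithmetic. A pure-Python rendering of that layout (illustrative only):

    # Models the presplit table; the postsplit table swaps in min_weight_leaf.
    def compose(user_conditions, with_monotonic_cst):
        size = len(user_conditions) + (2 if with_monotonic_cst else 1)
        table = ["min_samples_leaf"]
        if with_monotonic_cst:
            table.append("monotonic_cst")
        table.extend(user_conditions)
        assert len(table) == size
        return table

    assert compose(["user_a"], True) == ["min_samples_leaf", "monotonic_cst", "user_a"]
    assert compose(["user_a"], False) == ["min_samples_leaf", "user_a"]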
From 7a70a0b6e076bd7e4f54674ea2148697f80916f4 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Mon, 22 Apr 2024 18:54:41 -0400
Subject: [PATCH 19/29] added regression forest benchmark

---
 asv_benchmarks/benchmarks/ensemble.py | 45 ++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py
index c336d1e5f8805..a519cece3ac27 100644
--- a/asv_benchmarks/benchmarks/ensemble.py
+++ b/asv_benchmarks/benchmarks/ensemble.py
@@ -2,6 +2,7 @@
     GradientBoostingClassifier,
     HistGradientBoostingClassifier,
     RandomForestClassifier,
+    RandomForestRegressor
 )
 
 from .common import Benchmark, Estimator, Predictor
@@ -9,8 +10,50 @@
     _20newsgroups_highdim_dataset,
     _20newsgroups_lowdim_dataset,
     _synth_classification_dataset,
+    _synth_regression_dataset,
+    _synth_regression_sparse_dataset
 )
-from .utils import make_gen_classif_scorers
+from .utils import make_gen_classif_scorers, make_gen_reg_scorers
+
+
+class RandomForestRegressorBenchmark(Predictor, Estimator, Benchmark):
+    """
+    Benchmarks for RandomForestRegressor.
+    """
+
+    param_names = ["representation", "n_jobs"]
+    params = (["dense", "sparse"], Benchmark.n_jobs_vals)
+
+    def setup_cache(self):
+        super().setup_cache()
+
+    def make_data(self, params):
+        representation, n_jobs = params
+
+        if representation == "sparse":
+            data = _synth_regression_sparse_dataset()
+        else:
+            data = _synth_regression_dataset()
+
+        return data
+
+    def make_estimator(self, params):
+        representation, n_jobs = params
+
+        n_estimators = 500 if Benchmark.data_size == "large" else 100
+
+        estimator = RandomForestRegressor(
+            n_estimators=n_estimators,
+            min_samples_split=10,
+            max_features="log2",
+            n_jobs=n_jobs,
+            random_state=0,
+        )
+
+        return estimator
+
+    def make_scorers(self):
+        make_gen_reg_scorers(self)
 
 
 class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
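With this benchmark in place, regressor-side changes to the splitter can be timed the same way the classifier already is. Assuming the standard airspeed-velocity setup in asv_benchmarks/ (the exact refs and flags below are an illustration, not taken from this series), something like `asv continuous -b RandomForestRegressorBenchmark upstream/main HEAD` compares the condition-injection branch against a baseline, and `Benchmark.data_size` switches between the 100- and 500-estimator configurations chosen in make_estimator above.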
From 893d588bccabbd063d1d385a6da7e2d52556c3a6 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Mon, 22 Apr 2024 21:30:25 -0400
Subject: [PATCH 20/29] ran black for linting check

---
 .github/scripts/label_title_regex.py | 1 +
 asv_benchmarks/benchmarks/ensemble.py | 4 +-
 benchmarks/bench_glm.py | 1 +
 benchmarks/bench_glmnet.py | 1 +
 benchmarks/bench_isotonic.py | 1 +
 ...kernel_pca_solvers_time_vs_n_components.py | 1 +
 ...ch_kernel_pca_solvers_time_vs_n_samples.py | 1 +
 benchmarks/bench_lasso.py | 1 +
 benchmarks/bench_plot_lasso_path.py | 1 +
 benchmarks/bench_plot_neighbors.py | 1 +
 benchmarks/bench_plot_nmf.py | 7 ++-
 benchmarks/bench_plot_omp_lars.py | 1 +
 ...ch_plot_polynomial_kernel_approximation.py | 1 +
 benchmarks/bench_plot_svd.py | 1 +
 benchmarks/bench_random_projections.py | 1 +
 benchmarks/bench_saga.py | 5 +-
 .../bench_sample_without_replacement.py | 1 +
 benchmarks/bench_text_vectorizers.py | 1 +
 benchmarks/bench_tree.py | 1 +
 benchmarks/bench_tsne_mnist.py | 6 ++-
 build_tools/generate_authors_table.py | 1 +
 build_tools/get_comment.py | 3 +-
 build_tools/github/check_wheels.py | 1 +
 build_tools/github/vendor.py | 1 -
 .../update_environments_and_lock_files.py | 33 ++++++++-----
 doc/sphinxext/doi_role.py | 26 +++++-----
 doc/sphinxext/sphinx_issues.py | 1 +
 .../applications/plot_face_recognition.py | 1 +
 examples/calibration/plot_calibration.py | 1 +
 examples/cluster/plot_affinity_propagation.py | 1 +
 examples/cluster/plot_bisect_kmeans.py | 1 +
 .../covariance/plot_covariance_estimation.py | 1 -
 .../ensemble/plot_feature_transformation.py | 1 -
 .../plot_gradient_boosting_early_stopping.py | 1 +
 .../ensemble/plot_monotonic_constraints.py | 1 +
 .../linear_model/plot_quantile_regression.py | 12 +++--
 examples/manifold/plot_swissroll.py | 1 +
 .../plot_kernel_ridge_regression.py | 1 +
 .../miscellaneous/plot_metadata_routing.py | 1 +
 examples/mixture/plot_gmm_init.py | 1 -
 .../plot_semi_supervised_newsgroups.py | 1 -
 examples/tree/plot_iris_dtc.py | 1 +
 maint_tools/check_pxd_in_installation.py | 8 ++-
 sklearn/__check_build/__init__.py | 10 ++--
 sklearn/_build_utils/__init__.py | 1 +
 sklearn/_build_utils/openmp_helpers.py | 12 +++--
 sklearn/_build_utils/pre_build_helpers.py | 6 ++-
 sklearn/_build_utils/version.py | 3 +-
 sklearn/_config.py | 4 +-
 sklearn/_distributor_init.py | 2 +-
 sklearn/_loss/link.py | 1 +
 sklearn/_loss/loss.py | 1 +
 sklearn/_min_dependencies.py | 1 +
 sklearn/base.py | 5 +-
 sklearn/cluster/_agglomerative.py | 1 +
 sklearn/cluster/_bicluster.py | 1 +
 sklearn/cluster/_bisect_k_means.py | 1 +
 sklearn/cluster/_feature_agglomeration.py | 1 +
 sklearn/cluster/_hdbscan/hdbscan.py | 1 +
 sklearn/cluster/_spectral.py | 3 +-
 .../tests/test_feature_agglomeration.py | 1 +
 sklearn/cluster/tests/test_hdbscan.py | 1 +
 sklearn/cluster/tests/test_hierarchical.py | 1 +
 sklearn/cluster/tests/test_k_means.py | 1 +
 sklearn/cluster/tests/test_spectral.py | 1 +
 sklearn/covariance/_robust_covariance.py | 1 +
 .../covariance/tests/test_graphical_lasso.py | 4 +-
 sklearn/datasets/__init__.py | 7 ++-
 sklearn/datasets/_arff_parser.py | 1 +
 sklearn/datasets/_california_housing.py | 1 +
 sklearn/datasets/_samples_generator.py | 4 +-
 sklearn/datasets/tests/test_20news.py | 1 +
 sklearn/datasets/tests/test_arff_parser.py | 24 ++++++---
 .../datasets/tests/test_california_housing.py | 1 +
 sklearn/datasets/tests/test_common.py | 1 +
 sklearn/datasets/tests/test_covtype.py | 1 +
 sklearn/datasets/tests/test_openml.py | 4 +-
 sklearn/decomposition/__init__.py | 1 -
 sklearn/decomposition/_dict_learning.py | 4 +-
 sklearn/decomposition/_nmf.py | 7 ++-
 sklearn/decomposition/_pca.py | 3 +-
 sklearn/decomposition/_sparse_pca.py | 1 +
 sklearn/decomposition/_truncated_svd.py | 3 +-
 sklearn/decomposition/tests/test_fastica.py | 1 +
 .../tests/test_incremental_pca.py | 1 +
 sklearn/ensemble/__init__.py | 1 +
 sklearn/ensemble/_forest.py | 3 +-
 sklearn/ensemble/_gb.py | 6 +--
 .../_hist_gradient_boosting/binning.py | 1 +
 .../_hist_gradient_boosting/grower.py | 1 +
 .../_hist_gradient_boosting/predictor.py | 1 +
 .../ensemble/_hist_gradient_boosting/utils.py | 1 +
 .../ensemble/tests/test_gradient_boosting.py | 1 +
 .../enable_hist_gradient_boosting.py | 1 +
 sklearn/feature_extraction/text.py | 6 +--
 sklearn/feature_selection/_sequential.py | 1 +
 .../tests/test_feature_select.py | 1 +
 sklearn/gaussian_process/_gpr.py | 8 +--
 sklearn/gaussian_process/kernels.py | 4 +-
 sklearn/gaussian_process/tests/test_gpc.py | 14 ++----
 sklearn/gaussian_process/tests/test_gpr.py | 14 ++----
 sklearn/impute/__init__.py | 1 +
 sklearn/impute/_base.py | 5 +-
 sklearn/inspection/__init__.py | 1 -
 .../tests/test_partial_dependence.py | 1 +
 .../tests/test_permutation_importance.py | 4 +-
 sklearn/linear_model/_glm/_newton_solver.py | 3 +-
 sklearn/linear_model/_glm/tests/test_glm.py | 3 +-
 sklearn/linear_model/_least_angle.py | 4 +-
 sklearn/linear_model/_linear_loss.py | 1 +
 sklearn/linear_model/_logistic.py | 9 ++--
 sklearn/linear_model/_omp.py | 3 +-
 sklearn/linear_model/_stochastic_gradient.py | 3 +-
 .../linear_model/tests/test_linear_loss.py | 1 +
 sklearn/manifold/_spectral_embedding.py | 3 +-
 sklearn/metrics/__init__.py | 1 -
 sklearn/metrics/_base.py | 1 +
 sklearn/metrics/_classification.py | 3 +-
 sklearn/metrics/cluster/__init__.py | 1 +
 sklearn/metrics/tests/test_classification.py | 15 ++----
 sklearn/mixture/_bayesian_mixture.py | 1 +
 sklearn/model_selection/_search.py | 3 +-
 sklearn/model_selection/tests/test_split.py | 1 +
 .../model_selection/tests/test_validation.py | 1 +
 sklearn/neighbors/_base.py | 10 ++--
 sklearn/neighbors/_kde.py | 1 +
 sklearn/neighbors/_unsupervised.py | 1 +
 .../neighbors/tests/test_nearest_centroid.py | 1 +
 sklearn/neural_network/_base.py | 3 +-
 .../neural_network/_multilayer_perceptron.py | 6 +--
 sklearn/neural_network/_rbm.py | 3 +-
 .../neural_network/_stochastic_optimizers.py | 3 +-
 sklearn/neural_network/tests/test_mlp.py | 3 +-
 sklearn/pipeline.py | 1 +
 sklearn/preprocessing/_polynomial.py | 1 +
 sklearn/random_projection.py | 1 +
 .../tests/test_label_propagation.py | 2 +-
 sklearn/svm/_base.py | 6 +--
 sklearn/svm/_bounds.py | 1 +
 sklearn/svm/tests/test_svm.py | 1 +
 sklearn/tests/random_seed.py | 1 +
 sklearn/tests/test_build.py | 6 ++-
 sklearn/tests/test_common.py | 6 ++-
 sklearn/tests/test_metaestimators.py | 1 +
 sklearn/tests/test_pipeline.py | 1 +
 sklearn/tree/tests/test_export.py | 49 +++++++++++++------
 sklearn/utils/_response.py | 1 +
 sklearn/utils/_show_versions.py | 1 +
 sklearn/utils/estimator_checks.py | 9 ++--
 sklearn/utils/extmath.py | 1 +
 sklearn/utils/fixes.py | 1 +
 sklearn/utils/optimize.py | 1 +
 sklearn/utils/tests/test_extmath.py | 4 +-
 sklearn/utils/tests/test_fast_dict.py | 4 +-
 154 files changed, 309 insertions(+), 222 deletions(-)

diff --git a/.github/scripts/label_title_regex.py b/.github/scripts/label_title_regex.py
index a022c3c4dd2a7..9a689b8db09b4 100644
--- a/.github/scripts/label_title_regex.py
+++ b/.github/scripts/label_title_regex.py
@@ -1,5 +1,6 @@
 """Labels PRs based on title. Must be run in a github action with the
 pull_request_target event."""
+
 import json
 import os
 import re

diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py
index a519cece3ac27..877fcdb09fe68 100644
--- a/asv_benchmarks/benchmarks/ensemble.py
+++ b/asv_benchmarks/benchmarks/ensemble.py
@@ -2,7 +2,7 @@
     GradientBoostingClassifier,
     HistGradientBoostingClassifier,
     RandomForestClassifier,
-    RandomForestRegressor
+    RandomForestRegressor,
 )
 
 from .common import Benchmark, Estimator, Predictor
@@ -11,7 +11,7 @@
     _20newsgroups_lowdim_dataset,
     _synth_classification_dataset,
     _synth_regression_dataset,
-    _synth_regression_sparse_dataset
+    _synth_regression_sparse_dataset,
 )
 from .utils import make_gen_classif_scorers, make_gen_reg_scorers

diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py
index 803043398d1ac..84cf31858afa7 100644
--- a/benchmarks/bench_glm.py
+++ b/benchmarks/bench_glm.py
@@ -4,6 +4,7 @@
 Data comes from a random square matrix.
 
 """
+
 from datetime import datetime
 
 import numpy as np

diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py
index 7b111f95044e2..1aaad99c10587 100644
--- a/benchmarks/bench_glmnet.py
+++ b/benchmarks/bench_glmnet.py
@@ -16,6 +16,7 @@
 In both cases, only 10% of the features are informative.
 """
+
 import gc
 from time import time
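Nearly every hunk that follows is one of two mechanical rewrites from the black 24.x style: a blank line is now required between a module docstring and the first statement, and string literals that were split only to push a trailing `%` or `+` operand onto its own line are joined back onto one line. A minimal self-contained sketch of both rules (an illustrative file, not taken from the patch):

    """Example module docstring: black >=24 wants a blank line after this."""

    import gc  # the blank line above is what most of the hunks below add

    # the string-joining rule, by contrast, turns
    #     "Maximum number of iterations must be a positive "
    #     "integer; got (max_iter=%r)"
    #     % self.max_iter
    # into a single '... %r)" % self.max_iter' line inside the call.
    print("collected %d objects" % gc.collect())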
""" + import argparse import gc from datetime import datetime diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py index 6551cb74ff86e..26789c173688f 100644 --- a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py +++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py @@ -35,6 +35,7 @@ You can also set `arpack_all=True` to activate arpack solver for large number of components (this takes more time). """ + # Authors: Sylvain MARIE, Schneider Electric import time diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py index 26a45ca9f09ca..cae74c6f442ff 100644 --- a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py +++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py @@ -37,6 +37,7 @@ Solvers comparison benchmark: time vs n_components", where this time the number of examples is fixed, and the desired number of components varies. """ + # Author: Sylvain MARIE, Schneider Electric import time diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py index 1c49c6f5cabdf..9bae570505a75 100644 --- a/benchmarks/bench_lasso.py +++ b/benchmarks/bench_lasso.py @@ -11,6 +11,7 @@ In both cases, only 10% of the features are informative. """ + import gc from time import time diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py index c996c9c09520f..3b46e447401cb 100644 --- a/benchmarks/bench_plot_lasso_path.py +++ b/benchmarks/bench_plot_lasso_path.py @@ -2,6 +2,7 @@ The input data is mostly low rank but is a fat infinite tail. """ + import gc import sys from collections import defaultdict diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py index 2d9cf2b08b71d..2cedb19fb23c4 100644 --- a/benchmarks/bench_plot_neighbors.py +++ b/benchmarks/bench_plot_neighbors.py @@ -1,6 +1,7 @@ """ Plot the scaling of the nearest neighbors algorithms with k, D, and N """ + from time import time import matplotlib.pyplot as plt diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py index 3484850011c1f..f05ede117191b 100644 --- a/benchmarks/bench_plot_nmf.py +++ b/benchmarks/bench_plot_nmf.py @@ -1,6 +1,7 @@ """ Benchmarks of Non-Negative Matrix Factorization """ + # Authors: Tom Dupre la Tour (benchmark) # Chih-Jen Linn (original projected gradient NMF implementation) # Anthony Di Franco (projected gradient, Python and NumPy port) @@ -258,8 +259,7 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0: raise ValueError( "Maximum number of iterations must be a positive " - "integer; got (max_iter=%r)" - % self.max_iter + "integer; got (max_iter=%r)" % self.max_iter ) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError( @@ -305,8 +305,7 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): if n_iter == self.max_iter and self.tol > 0: warnings.warn( "Maximum number of iteration %d reached. Increase it" - " to improve convergence." - % self.max_iter, + " to improve convergence." % self.max_iter, ConvergenceWarning, ) diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py index ec1bf3281f3a4..8a4bc9b1a34fe 100644 --- a/benchmarks/bench_plot_omp_lars.py +++ b/benchmarks/bench_plot_omp_lars.py @@ -3,6 +3,7 @@ The input data is mostly low rank but is a fat infinite tail. 
""" + import gc import sys from time import time diff --git a/benchmarks/bench_plot_polynomial_kernel_approximation.py b/benchmarks/bench_plot_polynomial_kernel_approximation.py index 1cd9f70a38f44..a80455e21c255 100644 --- a/benchmarks/bench_plot_polynomial_kernel_approximation.py +++ b/benchmarks/bench_plot_polynomial_kernel_approximation.py @@ -38,6 +38,7 @@ (https://people.cs.rutgers.edu/~farach/pubs/FrequentStream.pdf) """ + # Author: Daniel Lopez-Sanchez # License: BSD 3 clause diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py index abd2c6fe9d4d4..ed99d1c44e2fd 100644 --- a/benchmarks/bench_plot_svd.py +++ b/benchmarks/bench_plot_svd.py @@ -2,6 +2,7 @@ The data is mostly low rank but is a fat infinite tail. """ + import gc from collections import defaultdict from time import time diff --git a/benchmarks/bench_random_projections.py b/benchmarks/bench_random_projections.py index bd8c62ecba484..6551de690994b 100644 --- a/benchmarks/bench_random_projections.py +++ b/benchmarks/bench_random_projections.py @@ -6,6 +6,7 @@ Benchmarks for random projections. """ + import collections import gc import optparse diff --git a/benchmarks/bench_saga.py b/benchmarks/bench_saga.py index dc2ed093f11d0..c5b3e7728e2ec 100644 --- a/benchmarks/bench_saga.py +++ b/benchmarks/bench_saga.py @@ -3,6 +3,7 @@ Benchmarks of sklearn SAGA vs lightning SAGA vs Liblinear. Shows the gain in using multinomial logistic regression in term of learning time. """ + import json import os import time @@ -118,9 +119,7 @@ def fit_single( # Lightning predict_proba is not implemented for n_classes > 2 y_pred = _predict_proba(lr, X) score = log_loss(y, y_pred, normalize=False) / n_samples - score += 0.5 * alpha * np.sum(lr.coef_**2) + beta * np.sum( - np.abs(lr.coef_) - ) + score += 0.5 * alpha * np.sum(lr.coef_**2) + beta * np.sum(np.abs(lr.coef_)) scores.append(score) train_score, test_score = tuple(scores) diff --git a/benchmarks/bench_sample_without_replacement.py b/benchmarks/bench_sample_without_replacement.py index 743292ca5fa61..39cf1a11ffed6 100644 --- a/benchmarks/bench_sample_without_replacement.py +++ b/benchmarks/bench_sample_without_replacement.py @@ -2,6 +2,7 @@ Benchmarks for sampling without replacement of integer. """ + import gc import operator import optparse diff --git a/benchmarks/bench_text_vectorizers.py b/benchmarks/bench_text_vectorizers.py index 31d4141d1af97..2eab7071544f9 100644 --- a/benchmarks/bench_text_vectorizers.py +++ b/benchmarks/bench_text_vectorizers.py @@ -8,6 +8,7 @@ * psutil (optional, but recommended) """ + import itertools import timeit diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py index 29cd7584432b7..c522bcb39e994 100644 --- a/benchmarks/bench_tree.py +++ b/benchmarks/bench_tree.py @@ -13,6 +13,7 @@ training set, classify a sample and plot the time taken as a function of the number of dimensions. 
""" + import gc from datetime import datetime diff --git a/benchmarks/bench_tsne_mnist.py b/benchmarks/bench_tsne_mnist.py index dfd4c4e92f848..813fffcf29141 100644 --- a/benchmarks/bench_tsne_mnist.py +++ b/benchmarks/bench_tsne_mnist.py @@ -130,7 +130,8 @@ def sanitize(filename): try: from bhtsne.bhtsne import run_bh_tsne except ImportError as e: - raise ImportError("""\ + raise ImportError( + """\ If you want comparison with the reference implementation, build the binary from source (https://github.com/lvdmaaten/bhtsne) in the folder benchmarks/bhtsne and add an empty `__init__.py` file in the folder: @@ -140,7 +141,8 @@ def sanitize(filename): $ g++ sptree.cpp tsne.cpp tsne_main.cpp -o bh_tsne -O2 $ touch __init__.py $ cd .. -""") from e +""" + ) from e def bhtsne(X): """Wrapper for the reference lvdmaaten/bhtsne implementation.""" diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py index f438927772619..28bb267b6f721 100644 --- a/build_tools/generate_authors_table.py +++ b/build_tools/generate_authors_table.py @@ -6,6 +6,7 @@ The table should be updated for each new inclusion in the teams. Generating the table requires admin rights. """ + import getpass import sys import time diff --git a/build_tools/get_comment.py b/build_tools/get_comment.py index 64c5784e0cd06..466396b640302 100644 --- a/build_tools/get_comment.py +++ b/build_tools/get_comment.py @@ -88,8 +88,7 @@ def get_message(log_file, repo, pr_number, sha, run_id, details, versions): "https://scikit-learn.org/dev/developers/contributing.html" "#how-to-contribute)) and push the changes. If you already have done " "that, please send an empty commit with `git commit --allow-empty` " - "and push the changes to trigger the CI.\n\n" - + sub_text + "and push the changes to trigger the CI.\n\n" + sub_text ) message = "" diff --git a/build_tools/github/check_wheels.py b/build_tools/github/check_wheels.py index 2289709fdc037..5579d86c5ce3e 100644 --- a/build_tools/github/check_wheels.py +++ b/build_tools/github/check_wheels.py @@ -1,5 +1,6 @@ """Checks that dist/* contains the number of wheels built from the .github/workflows/wheels.yml config.""" + import sys from pathlib import Path diff --git a/build_tools/github/vendor.py b/build_tools/github/vendor.py index 3bc1aceb3437c..28b44be3c9aa9 100644 --- a/build_tools/github/vendor.py +++ b/build_tools/github/vendor.py @@ -1,6 +1,5 @@ """Embed vcomp140.dll and msvcp140.dll.""" - import os import os.path as op import shutil diff --git a/build_tools/update_environments_and_lock_files.py b/build_tools/update_environments_and_lock_files.py index ab0f3e590d560..fd77cfd3c0721 100644 --- a/build_tools/update_environments_and_lock_files.py +++ b/build_tools/update_environments_and_lock_files.py @@ -102,7 +102,8 @@ def remove_from(alist, to_remove): "folder": "build_tools/azure", "platform": "linux-64", "channel": "conda-forge", - "conda_dependencies": common_dependencies + [ + "conda_dependencies": common_dependencies + + [ "ccache", "pytorch", "pytorch-cpu", @@ -123,7 +124,8 @@ def remove_from(alist, to_remove): "folder": "build_tools/azure", "platform": "osx-64", "channel": "conda-forge", - "conda_dependencies": common_dependencies + [ + "conda_dependencies": common_dependencies + + [ "ccache", "compilers", "llvm-openmp", @@ -160,7 +162,8 @@ def remove_from(alist, to_remove): "channel": "defaults", "conda_dependencies": remove_from( common_dependencies, ["pandas", "cython", "pip", "ninja", "meson-python"] - ) + ["ccache"], + ) + + ["ccache"], 
"package_constraints": { "python": "3.9", "blas": "[build=openblas]", @@ -268,7 +271,8 @@ def remove_from(alist, to_remove): "folder": "build_tools/azure", "platform": "win-64", "channel": "conda-forge", - "conda_dependencies": remove_from(common_dependencies, ["pandas", "pyamg"]) + [ + "conda_dependencies": remove_from(common_dependencies, ["pandas", "pyamg"]) + + [ "wheel", "pip", ], @@ -284,7 +288,8 @@ def remove_from(alist, to_remove): "folder": "build_tools/circle", "platform": "linux-64", "channel": "conda-forge", - "conda_dependencies": common_dependencies_without_coverage + [ + "conda_dependencies": common_dependencies_without_coverage + + [ "scikit-image", "seaborn", "memory_profiler", @@ -324,7 +329,8 @@ def remove_from(alist, to_remove): "folder": "build_tools/circle", "platform": "linux-64", "channel": "conda-forge", - "conda_dependencies": common_dependencies_without_coverage + [ + "conda_dependencies": common_dependencies_without_coverage + + [ "scikit-image", "seaborn", "memory_profiler", @@ -353,7 +359,8 @@ def remove_from(alist, to_remove): "channel": "conda-forge", "conda_dependencies": remove_from( common_dependencies_without_coverage, ["pandas", "pyamg"] - ) + ["pip", "ccache"], + ) + + ["pip", "ccache"], "package_constraints": { "python": "3.9", }, @@ -460,7 +467,8 @@ def get_package_with_constraint(package_name, build_metadata, uses_pip=False): def get_conda_environment_content(build_metadata): - template = environment.from_string(""" + template = environment.from_string( + """ # DO NOT EDIT: this file is generated from the specification found in the # following script to centralize the configuration for CI builds: # build_tools/update_environments_and_lock_files.py @@ -476,7 +484,8 @@ def get_conda_environment_content(build_metadata): {% for pip_dep in build_metadata.get('pip_dependencies', []) %} - {{ pip_dep | get_package_with_constraint(build_metadata, uses_pip=True) }} {% endfor %} - {% endif %}""".strip()) + {% endif %}""".strip() + ) return template.render(build_metadata=build_metadata) @@ -532,13 +541,15 @@ def write_all_conda_lock_files(build_metadata_list): def get_pip_requirements_content(build_metadata): - template = environment.from_string(""" + template = environment.from_string( + """ # DO NOT EDIT: this file is generated from the specification found in the # following script to centralize the configuration for CI builds: # build_tools/update_environments_and_lock_files.py {% for pip_dep in build_metadata['pip_dependencies'] %} {{ pip_dep | get_package_with_constraint(build_metadata, uses_pip=True) }} -{% endfor %}""".strip()) +{% endfor %}""".strip() + ) return template.render(build_metadata=build_metadata) diff --git a/doc/sphinxext/doi_role.py b/doc/sphinxext/doi_role.py index 32e905fe650ea..9f117b07fa6a3 100644 --- a/doc/sphinxext/doi_role.py +++ b/doc/sphinxext/doi_role.py @@ -1,17 +1,17 @@ """ - doilinks - ~~~~~~~~ - Extension to add links to DOIs. With this extension you can use e.g. - :doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will - create a link to a DOI resolver - (``https://doi.org/10.1016/S0022-2836(05)80360-2``). - The link caption will be the raw DOI. - You can also give an explicit caption, e.g. - :doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`. - - :copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by - the Sphinx team. - :license: BSD. +doilinks +~~~~~~~~ +Extension to add links to DOIs. With this extension you can use e.g. 
diff --git a/doc/sphinxext/doi_role.py b/doc/sphinxext/doi_role.py
index 32e905fe650ea..9f117b07fa6a3 100644
--- a/doc/sphinxext/doi_role.py
+++ b/doc/sphinxext/doi_role.py
@@ -1,17 +1,17 @@
 """
-    doilinks
-    ~~~~~~~~
-    Extension to add links to DOIs. With this extension you can use e.g.
-    :doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will
-    create a link to a DOI resolver
-    (``https://doi.org/10.1016/S0022-2836(05)80360-2``).
-    The link caption will be the raw DOI.
-    You can also give an explicit caption, e.g.
-    :doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`.
-
-    :copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by
-        the Sphinx team.
-    :license: BSD.
+doilinks
+~~~~~~~~
+Extension to add links to DOIs. With this extension you can use e.g.
+:doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will
+create a link to a DOI resolver
+(``https://doi.org/10.1016/S0022-2836(05)80360-2``).
+The link caption will be the raw DOI.
+You can also give an explicit caption, e.g.
+:doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`.
+
+:copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by
+    the Sphinx team.
+:license: BSD.
 """
 
 from docutils import nodes, utils

diff --git a/doc/sphinxext/sphinx_issues.py b/doc/sphinxext/sphinx_issues.py
index 5cd532319cbd7..206359a1bd703 100644
--- a/doc/sphinxext/sphinx_issues.py
+++ b/doc/sphinxext/sphinx_issues.py
@@ -18,6 +18,7 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.
 """
+
 import re
 
 from docutils import nodes, utils

diff --git a/examples/applications/plot_face_recognition.py b/examples/applications/plot_face_recognition.py
index 1ff4399d60739..97a67fad52776 100644
--- a/examples/applications/plot_face_recognition.py
+++ b/examples/applications/plot_face_recognition.py
@@ -11,6 +11,7 @@
 .. _LFW: http://vis-www.cs.umass.edu/lfw/
 """
+
 # %%
 from time import time

diff --git a/examples/calibration/plot_calibration.py b/examples/calibration/plot_calibration.py
index f928ae631b78b..91dca761d1fe3 100644
--- a/examples/calibration/plot_calibration.py
+++ b/examples/calibration/plot_calibration.py
@@ -22,6 +22,7 @@
 Brier score.
 """
+
 # Authors:
 #    Mathieu Blondel
 #    Alexandre Gramfort

diff --git a/examples/cluster/plot_affinity_propagation.py b/examples/cluster/plot_affinity_propagation.py
index 5816ae298f419..e286104636d67 100644
--- a/examples/cluster/plot_affinity_propagation.py
+++ b/examples/cluster/plot_affinity_propagation.py
@@ -8,6 +8,7 @@
 Between Data Points", Science Feb. 2007
 """
+
 import numpy as np
 
 from sklearn import metrics

diff --git a/examples/cluster/plot_bisect_kmeans.py b/examples/cluster/plot_bisect_kmeans.py
index 3aebdffddaf63..a562ebbc96ba5 100644
--- a/examples/cluster/plot_bisect_kmeans.py
+++ b/examples/cluster/plot_bisect_kmeans.py
@@ -13,6 +13,7 @@
 present for regular K-Means.
 """
+
 import matplotlib.pyplot as plt
 
 from sklearn.cluster import BisectingKMeans, KMeans

diff --git a/examples/covariance/plot_covariance_estimation.py b/examples/covariance/plot_covariance_estimation.py
index df9af8ea330ba..04baa0fd98bc0 100644
--- a/examples/covariance/plot_covariance_estimation.py
+++ b/examples/covariance/plot_covariance_estimation.py
@@ -15,7 +15,6 @@
 trade-off.
 """
 
-
 # %%
 # Generate sample data
 # --------------------

diff --git a/examples/ensemble/plot_feature_transformation.py b/examples/ensemble/plot_feature_transformation.py
index de6f92bad9dfe..d492de07fec87 100644
--- a/examples/ensemble/plot_feature_transformation.py
+++ b/examples/ensemble/plot_feature_transformation.py
@@ -20,7 +20,6 @@
 """
 
-
 # Author: Tim Head
 #
 # License: BSD 3 clause

diff --git a/examples/ensemble/plot_gradient_boosting_early_stopping.py b/examples/ensemble/plot_gradient_boosting_early_stopping.py
index 1eaba2e852f28..6c239e97d66ee 100644
--- a/examples/ensemble/plot_gradient_boosting_early_stopping.py
+++ b/examples/ensemble/plot_gradient_boosting_early_stopping.py
@@ -31,6 +31,7 @@
 License: BSD 3 clause
 """
+
 # %%
 # Data Preparation
 # ----------------

diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py
index 15ad8e9524243..dcd5f05af626c 100644
--- a/examples/ensemble/plot_monotonic_constraints.py
+++ b/examples/ensemble/plot_monotonic_constraints.py
@@ -19,6 +19,7 @@
 `_.
 """
+
 # %%
 import matplotlib.pyplot as plt
 import numpy as np

diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py
index 715e6129cdef8..70dda86fabd60 100644
--- a/examples/linear_model/plot_quantile_regression.py
+++ b/examples/linear_model/plot_quantile_regression.py
@@ -261,14 +261,16 @@
 y_pred_lr = linear_regression.fit(X, y_pareto).predict(X)
 y_pred_qr = quantile_regression.fit(X, y_pareto).predict(X)
 
-print(f"""Training error (in-sample performance)
+print(
+    f"""Training error (in-sample performance)
 {linear_regression.__class__.__name__}:
 MAE = {mean_absolute_error(y_pareto, y_pred_lr):.3f}
 MSE = {mean_squared_error(y_pareto, y_pred_lr):.3f}
 {quantile_regression.__class__.__name__}:
 MAE = {mean_absolute_error(y_pareto, y_pred_qr):.3f}
 MSE = {mean_squared_error(y_pareto, y_pred_qr):.3f}
-    """)
+    """
+)
 
 # %%
 # On the training set, we see that MAE is lower for
@@ -298,14 +300,16 @@
     cv=3,
     scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
 )
-print(f"""Test error (cross-validated performance)
+print(
+    f"""Test error (cross-validated performance)
 {linear_regression.__class__.__name__}:
 MAE = {-cv_results_lr["test_neg_mean_absolute_error"].mean():.3f}
 MSE = {-cv_results_lr["test_neg_mean_squared_error"].mean():.3f}
 {quantile_regression.__class__.__name__}:
 MAE = {-cv_results_qr["test_neg_mean_absolute_error"].mean():.3f}
 MSE = {-cv_results_qr["test_neg_mean_squared_error"].mean():.3f}
-    """)
+    """
+)
 
 # %%
 # We reach similar conclusions on the out-of-sample evaluation.

diff --git a/examples/manifold/plot_swissroll.py b/examples/manifold/plot_swissroll.py
index fe17d9f80030f..65df88588efef 100644
--- a/examples/manifold/plot_swissroll.py
+++ b/examples/manifold/plot_swissroll.py
@@ -8,6 +8,7 @@
 Then, we will explore how they both deal with the addition of a hole
 in the data.
 """
+
 # %%
 # Swiss Roll
 # ---------------------------------------------------
""" + # %% # Authors: Jan Hendrik Metzen # License: BSD 3 clause diff --git a/examples/miscellaneous/plot_metadata_routing.py b/examples/miscellaneous/plot_metadata_routing.py index 9984bb6183348..9cad255b763af 100644 --- a/examples/miscellaneous/plot_metadata_routing.py +++ b/examples/miscellaneous/plot_metadata_routing.py @@ -20,6 +20,7 @@ First a few imports and some random data for the rest of the script. """ + # %% import warnings diff --git a/examples/mixture/plot_gmm_init.py b/examples/mixture/plot_gmm_init.py index aa0266c98ff7a..410a843cf78db 100644 --- a/examples/mixture/plot_gmm_init.py +++ b/examples/mixture/plot_gmm_init.py @@ -33,7 +33,6 @@ time to initialize and low number of GaussianMixture iterations to converge. """ - # Author: Gordon Walsh # Data generation code from Jake Vanderplas diff --git a/examples/semi_supervised/plot_semi_supervised_newsgroups.py b/examples/semi_supervised/plot_semi_supervised_newsgroups.py index 58c7f6e42f408..19bcb13c5a99b 100644 --- a/examples/semi_supervised/plot_semi_supervised_newsgroups.py +++ b/examples/semi_supervised/plot_semi_supervised_newsgroups.py @@ -11,7 +11,6 @@ """ - import numpy as np from sklearn.datasets import fetch_20newsgroups diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index b3d834da5d067..4c54a4119ced3 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -14,6 +14,7 @@ We also show the tree structure of a model built on all of the features. """ + # %% # First load the copy of the Iris dataset shipped with scikit-learn: from sklearn.datasets import load_iris diff --git a/maint_tools/check_pxd_in_installation.py b/maint_tools/check_pxd_in_installation.py index 996d45d64d42a..380edbd6350b6 100644 --- a/maint_tools/check_pxd_in_installation.py +++ b/maint_tools/check_pxd_in_installation.py @@ -36,7 +36,9 @@ # We set the language to c++ and we use numpy.get_include() because # some modules require it. with open(tmpdir / "setup_tst.py", "w") as f: - f.write(textwrap.dedent(""" + f.write( + textwrap.dedent( + """ from setuptools import setup, Extension from Cython.Build import cythonize import numpy @@ -47,7 +49,9 @@ include_dirs=[numpy.get_include()])] setup(ext_modules=cythonize(extensions)) - """)) + """ + ) + ) subprocess.run( ["python", "setup_tst.py", "build_ext", "-i"], check=True, cwd=tmpdir diff --git a/sklearn/__check_build/__init__.py b/sklearn/__check_build/__init__.py index 3895a0e430082..ad1a3a818b14d 100644 --- a/sklearn/__check_build/__init__.py +++ b/sklearn/__check_build/__init__.py @@ -1,6 +1,7 @@ -""" Module to give helpful messages to the user that did not +"""Module to give helpful messages to the user that did not compile scikit-learn properly. """ + import os INPLACE_MSG = """ @@ -28,7 +29,8 @@ def raise_build_error(e): dir_content.append(filename.ljust(26)) else: dir_content.append(filename + "\n") - raise ImportError("""%s + raise ImportError( + """%s ___________________________________________________________________________ Contents of %s: %s @@ -38,7 +40,9 @@ def raise_build_error(e): If you have installed scikit-learn from source, please do not forget to build the package before using it: run `python setup.py install` or `make` in the source directory. 
-%s""" % (e, local_dir, "".join(dir_content).strip(), msg)) +%s""" + % (e, local_dir, "".join(dir_content).strip(), msg) + ) try: diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index a8ced8aa9d292..ceb72441000c3 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -1,6 +1,7 @@ """ Utilities useful during the build. """ + # author: Andy Mueller, Gael Varoquaux # license: BSD diff --git a/sklearn/_build_utils/openmp_helpers.py b/sklearn/_build_utils/openmp_helpers.py index 9172d40830bb9..ed9bf0ea3eea0 100644 --- a/sklearn/_build_utils/openmp_helpers.py +++ b/sklearn/_build_utils/openmp_helpers.py @@ -38,7 +38,8 @@ def check_openmp_support(): # Pyodide doesn't support OpenMP return False - code = textwrap.dedent("""\ + code = textwrap.dedent( + """\ #include #include int main(void) { @@ -46,7 +47,8 @@ def check_openmp_support(): printf("nthreads=%d\\n", omp_get_num_threads()); return 0; } - """) + """ + ) extra_preargs = os.getenv("LDFLAGS", None) if extra_preargs is not None: @@ -94,7 +96,8 @@ def check_openmp_support(): "Failed to build scikit-learn with OpenMP support" ) from openmp_exception else: - message = textwrap.dedent(""" + message = textwrap.dedent( + """ *********** * WARNING * @@ -117,7 +120,8 @@ def check_openmp_support(): parallelism. *** - """) + """ + ) warnings.warn(message) return openmp_supported diff --git a/sklearn/_build_utils/pre_build_helpers.py b/sklearn/_build_utils/pre_build_helpers.py index f3eb054bb037e..b73fa8658739f 100644 --- a/sklearn/_build_utils/pre_build_helpers.py +++ b/sklearn/_build_utils/pre_build_helpers.py @@ -64,10 +64,12 @@ def basic_check_build(): # The following check won't work in pyodide return - code = textwrap.dedent("""\ + code = textwrap.dedent( + """\ #include int main(void) { return 0; } - """) + """ + ) compile_test_program(code) diff --git a/sklearn/_build_utils/version.py b/sklearn/_build_utils/version.py index 1f8688a008e9d..49a3cfb82bebd 100644 --- a/sklearn/_build_utils/version.py +++ b/sklearn/_build_utils/version.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -""" Extract version number from __init__.py -""" +"""Extract version number from __init__.py""" import os diff --git a/sklearn/_config.py b/sklearn/_config.py index d4ccaca0a98f7..fc9392de68df6 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -1,5 +1,5 @@ -"""Global configuration state and functions for management -""" +"""Global configuration state and functions for management""" + import os import threading from contextlib import contextmanager as contextmanager diff --git a/sklearn/_distributor_init.py b/sklearn/_distributor_init.py index a0142ac80878f..f0901034e83e4 100644 --- a/sklearn/_distributor_init.py +++ b/sklearn/_distributor_init.py @@ -1,4 +1,4 @@ -""" Distributor init file +"""Distributor init file Distributors: you can add custom code here to support particular distributions of scikit-learn. diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index 9459844f6b89a..a6560d58d91e6 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -1,6 +1,7 @@ """ Module contains classes for invertible (and differentiable) link functions. """ + # Author: Christian Lorentzen from abc import ABC, abstractmethod diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index a3b205ed10687..96863cc00fe01 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -5,6 +5,7 @@ Specific losses are used for regression, binary classification or multiclass classification. 
""" + # Goals: # - Provide a common private module for loss functions/classes. # - To be used in: diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index a7b9c48466a5d..b015a375b2bb0 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -1,4 +1,5 @@ """All minimum dependencies for scikit-learn.""" + import argparse from collections import defaultdict diff --git a/sklearn/base.py b/sklearn/base.py index e73ae4c8a180e..d6014332f7cc0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -1353,9 +1353,8 @@ class _UnstableArchMixin: def _more_tags(self): return { - "non_deterministic": _IS_32BIT or platform.machine().startswith( - ("ppc", "powerpc") - ) + "non_deterministic": _IS_32BIT + or platform.machine().startswith(("ppc", "powerpc")) } diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 2da9d8c5a0f43..fcecacc9ca57c 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -7,6 +7,7 @@ Gael Varoquaux License: BSD 3 clause """ + import warnings from heapq import heapify, heappop, heappush, heappushpop from numbers import Integral, Real diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 18c98ad5348b5..b22f6a369fcc1 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -1,4 +1,5 @@ """Spectral biclustering algorithms.""" + # Authors : Kemal Eren # License: BSD 3 clause diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index a1f7716ced822..1d4a9e1d84c26 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -1,4 +1,5 @@ """Bisecting K-means clustering.""" + # Author: Michal Krawczyk import warnings diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index f84f18c1c18b3..218db48ad2331 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -2,6 +2,7 @@ Feature agglomeration. Base classes and functions for performing feature agglomeration. """ + # Author: V. Michel, A. 
Gramfort # License: BSD 3 clause diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py index 380448f1f8589..e77baaf4b1146 100644 --- a/sklearn/cluster/_hdbscan/hdbscan.py +++ b/sklearn/cluster/_hdbscan/hdbscan.py @@ -2,6 +2,7 @@ HDBSCAN: Hierarchical Density-Based Spatial Clustering of Applications with Noise """ + # Authors: Leland McInnes # Steve Astels # John Healy diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index d323a6b8afd03..91606056c17aa 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -793,7 +793,8 @@ def fit_predict(self, X, y=None): def _more_tags(self): return { - "pairwise": self.affinity in [ + "pairwise": self.affinity + in [ "precomputed", "precomputed_nearest_neighbors", ] diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 121e8f2cfe400..abeb81dca50aa 100644 --- a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -1,6 +1,7 @@ """ Tests for sklearn.cluster._feature_agglomeration """ + # Authors: Sergul Aydore 2017 import warnings diff --git a/sklearn/cluster/tests/test_hdbscan.py b/sklearn/cluster/tests/test_hdbscan.py index 6db2d4387de18..d586d203747c2 100644 --- a/sklearn/cluster/tests/test_hdbscan.py +++ b/sklearn/cluster/tests/test_hdbscan.py @@ -2,6 +2,7 @@ Tests for HDBSCAN clustering algorithm Based on the DBSCAN test code """ + import numpy as np import pytest from scipy import stats diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 3c99dd50ea85f..0a139bf3c4571 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -2,6 +2,7 @@ Several basic tests for hierarchical clustering procedures """ + # Authors: Vincent Michel, 2010, Gael Varoquaux 2012, # Matteo Visconti di Oleggio Castello 2014 # License: BSD 3 clause diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 4a112a30b29ed..1f2f8c390c909 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -1,4 +1,5 @@ """Testing for K-means""" + import re import sys from io import StringIO diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 682df64044bf9..689a159851f50 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -1,4 +1,5 @@ """Testing for Spectral Clustering methods""" + import pickle import re diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index c90e855ca6768..980bf964e6dfa 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -4,6 +4,7 @@ Here are implemented estimators that are resistant to outliers. """ + # Author: Virgile Fritsch # # License: BSD 3 clause diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index a7d251a5bbdfe..c0e2deb20de16 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -1,5 +1,5 @@ -""" Test the graphical_lasso module. 
-""" +"""Test the graphical_lasso module.""" + import sys from io import StringIO diff --git a/sklearn/datasets/__init__.py b/sklearn/datasets/__init__.py index 7ae7902f3365c..6f61e027dceaa 100644 --- a/sklearn/datasets/__init__.py +++ b/sklearn/datasets/__init__.py @@ -3,6 +3,7 @@ including methods to load and fetch popular reference datasets. It also features some artificial data generators. """ + import textwrap from ._base import ( @@ -106,7 +107,8 @@ def __getattr__(name): if name == "load_boston": - msg = textwrap.dedent(""" + msg = textwrap.dedent( + """ `load_boston` has been removed from scikit-learn since version 1.2. The Boston housing prices dataset has an ethical problem: as @@ -153,7 +155,8 @@ def __getattr__(name): "Hedonic housing prices and the demand for clean air." Journal of environmental economics and management 5.1 (1978): 81-102. - """) + """ + ) raise ImportError(msg) try: return globals()[name] diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 5c427441012d6..86dfeb37a6ef5 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -1,4 +1,5 @@ """Implementation of ARFF parsers: via LIAC-ARFF and pandas.""" + import itertools import re from collections import OrderedDict diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index e94996ccdec65..a1e4b911f1bef 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -18,6 +18,7 @@ Statistics and Probability Letters, 33 (1997) 291-297. """ + # Authors: Peter Prettenhofer # License: BSD 3 clause diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 396e4af9389e6..224978bd70770 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -221,9 +221,7 @@ def make_classification( msg = "n_classes({}) * n_clusters_per_class({}) must be" msg += " smaller or equal 2**n_informative({})={}" raise ValueError( - msg.format( - n_classes, n_clusters_per_class, n_informative, 2**n_informative - ) + msg.format(n_classes, n_clusters_per_class, n_informative, 2**n_informative) ) if weights is not None: diff --git a/sklearn/datasets/tests/test_20news.py b/sklearn/datasets/tests/test_20news.py index 4072d9c8ec67f..84e7c91d3176f 100644 --- a/sklearn/datasets/tests/test_20news.py +++ b/sklearn/datasets/tests/test_20news.py @@ -1,6 +1,7 @@ """Test the 20news downloader, if the data is available, or if specifically requested via environment variable (e.g. 
for CI jobs).""" + from functools import partial from unittest.mock import patch diff --git a/sklearn/datasets/tests/test_arff_parser.py b/sklearn/datasets/tests/test_arff_parser.py index b675439cd2e9d..c4f9e3eb00ffd 100644 --- a/sklearn/datasets/tests/test_arff_parser.py +++ b/sklearn/datasets/tests/test_arff_parser.py @@ -83,7 +83,9 @@ def test_pandas_arff_parser_strip_single_quotes(parser_func): """Check that we properly strip single quotes from the data.""" pd = pytest.importorskip("pandas") - arff_file = BytesIO(textwrap.dedent(""" + arff_file = BytesIO( + textwrap.dedent( + """ @relation 'toy' @attribute 'cat_single_quote' {'A', 'B', 'C'} @attribute 'str_single_quote' string @@ -91,7 +93,9 @@ def test_pandas_arff_parser_strip_single_quotes(parser_func): @attribute 'class' numeric @data 'A','some text','\"expect double quotes\"',0 - """).encode("utf-8")) + """ + ).encode("utf-8") + ) columns_info = { "cat_single_quote": { @@ -150,7 +154,9 @@ def test_pandas_arff_parser_strip_double_quotes(parser_func): """Check that we properly strip double quotes from the data.""" pd = pytest.importorskip("pandas") - arff_file = BytesIO(textwrap.dedent(""" + arff_file = BytesIO( + textwrap.dedent( + """ @relation 'toy' @attribute 'cat_double_quote' {"A", "B", "C"} @attribute 'str_double_quote' string @@ -158,7 +164,9 @@ def test_pandas_arff_parser_strip_double_quotes(parser_func): @attribute 'class' numeric @data "A","some text","\'expect double quotes\'",0 - """).encode("utf-8")) + """ + ).encode("utf-8") + ) columns_info = { "cat_double_quote": { @@ -217,7 +225,9 @@ def test_pandas_arff_parser_strip_no_quotes(parser_func): """Check that we properly parse with no quotes characters.""" pd = pytest.importorskip("pandas") - arff_file = BytesIO(textwrap.dedent(""" + arff_file = BytesIO( + textwrap.dedent( + """ @relation 'toy' @attribute 'cat_without_quote' {A, B, C} @attribute 'str_without_quote' string @@ -225,7 +235,9 @@ def test_pandas_arff_parser_strip_no_quotes(parser_func): @attribute 'class' numeric @data A,some text,'internal' quote,0 - """).encode("utf-8")) + """ + ).encode("utf-8") + ) columns_info = { "cat_without_quote": { diff --git a/sklearn/datasets/tests/test_california_housing.py b/sklearn/datasets/tests/test_california_housing.py index ef6fc95db80bf..b24fb5bd66a56 100644 --- a/sklearn/datasets/tests/test_california_housing.py +++ b/sklearn/datasets/tests/test_california_housing.py @@ -1,6 +1,7 @@ """Test the california_housing loader, if the data is available, or if specifically requested via environment variable (e.g. for CI jobs).""" + from functools import partial import pytest diff --git a/sklearn/datasets/tests/test_common.py b/sklearn/datasets/tests/test_common.py index 8048a31041ddc..5bed37837718b 100644 --- a/sklearn/datasets/tests/test_common.py +++ b/sklearn/datasets/tests/test_common.py @@ -1,4 +1,5 @@ """Test loaders for common functionality.""" + import inspect import os diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index e44fdaae69ec3..018505bc4fa05 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -1,6 +1,7 @@ """Test the covtype loader, if the data is available, or if specifically requested via environment variable (e.g. 
for CI jobs).""" + from functools import partial import pytest diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index e48e361909603..70bb33e22adb7 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1,4 +1,5 @@ """Test the openml loader.""" + import gzip import json import os @@ -1457,8 +1458,7 @@ def _mock_urlopen_raise(request, *args, **kwargs): raise ValueError( "This mechanism intends to test correct cache" "handling. As such, urlopen should never be " - "accessed. URL: %s" - % request.get_full_url() + "accessed. URL: %s" % request.get_full_url() ) data_id = 61 diff --git a/sklearn/decomposition/__init__.py b/sklearn/decomposition/__init__.py index 1f9cfe07dc0e8..3d33938a755a7 100644 --- a/sklearn/decomposition/__init__.py +++ b/sklearn/decomposition/__init__.py @@ -4,7 +4,6 @@ this module can be regarded as dimensionality reduction techniques. """ - from ..utils.extmath import randomized_svd from ._dict_learning import ( DictionaryLearning, diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 177d6960033da..267e1cbfe756b 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1,5 +1,5 @@ -""" Dictionary learning. -""" +"""Dictionary learning.""" + # Author: Vlad Niculae, Gael Varoquaux, Alexandre Gramfort # License: BSD 3 clause diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index db46540e26708..75266c5f64b2b 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1,5 +1,5 @@ -""" Non-negative matrix factorization. -""" +"""Non-negative matrix factorization.""" + # Author: Vlad Niculae # Lars Buitinck # Mathieu Blondel @@ -1769,8 +1769,7 @@ def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): if n_iter == self.max_iter and self.tol > 0: warnings.warn( "Maximum number of iterations %d reached. Increase " - "it to improve convergence." - % self.max_iter, + "it to improve convergence." % self.max_iter, ConvergenceWarning, ) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index abd2fda2d5d2f..4c49337e88093 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -1,5 +1,4 @@ -""" Principal Component Analysis. -""" +"""Principal Component Analysis.""" # Author: Alexandre Gramfort # Olivier Grisel diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index b14df8c5f4d22..fa711ce8c0703 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -1,4 +1,5 @@ """Matrix factorization with Sparse PCA.""" + # Author: Vlad Niculae, Gael Varoquaux, Alexandre Gramfort # License: BSD 3 clause diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 725683e8d46c6..d238f35cb2167 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -1,5 +1,4 @@ -"""Truncated SVD for sparse matrices, aka latent semantic analysis (LSA). -""" +"""Truncated SVD for sparse matrices, aka latent semantic analysis (LSA).""" # Author: Lars Buitinck # Olivier Grisel diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 6a376b01ecb19..bd7a35bb8a96f 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -1,6 +1,7 @@ """ Test the fastica algorithm. 
""" + import itertools import os import warnings diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 5d7c8aa03f174..646aad2db795d 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -1,4 +1,5 @@ """Tests for Incremental PCA.""" + import warnings import numpy as np diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index f4a3756bdaf1d..8ddf05084f1be 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -2,6 +2,7 @@ The :mod:`sklearn.ensemble` module includes ensemble-based methods for classification, regression and anomaly detection. """ + from ._bagging import BaggingClassifier, BaggingRegressor from ._base import BaseEnsemble from ._forest import ( diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b5ee64b6e708c..6e5a7e47b0c10 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1198,8 +1198,7 @@ def _validate_y_class_weight(self, y, classes=None): raise ValueError( "Valid presets for class_weight include " '"balanced" and "balanced_subsample".' - 'Given "%s".' - % self.class_weight + 'Given "%s".' % self.class_weight ) if self.warm_start: warn( diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 49575cefa5090..bd11e373d3915 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -741,8 +741,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): if ( "pass parameters to specific steps of " "your pipeline using the " - "stepname__parameter" - in str(e) + "stepname__parameter" in str(e) ): # pipeline raise ValueError(msg) from e else: # regular estimator whose input checking failed @@ -1060,8 +1059,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): warnings.warn( "Using recursion method with a non-constant init predictor " "will lead to incorrect partial dependence values. " - "Got init=%s." - % self.init, + "Got init=%s." % self.init, UserWarning, ) grid = np.asarray(grid, dtype=DTYPE, order="C") diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 98d01ea5cb9f2..d23f6e7b00a82 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -5,6 +5,7 @@ Bin thresholds are computed with the quantiles so that each bin contains approximately the same number of samples. """ + # Author: Nicolas Hug import numpy as np diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 15f92cd324768..c9b1b56bc7999 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -4,6 +4,7 @@ TreeGrower builds a regression tree fitting a Newton-Raphson step, based on the gradients and hessians of the training data. """ + # Author: Nicolas Hug import numbers diff --git a/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/sklearn/ensemble/_hist_gradient_boosting/predictor.py index b939712d18893..799c25aadcec3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -1,6 +1,7 @@ """ This module contains the TreePredictor class which is used for prediction. 
""" + # Author: Nicolas Hug import numpy as np diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.py b/sklearn/ensemble/_hist_gradient_boosting/utils.py index 12f49b6cdce50..1ff17217164c8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.py +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.py @@ -1,4 +1,5 @@ """This module contains utility routines.""" + from ...base import is_classifier from .binning import _BinMapper diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 4bfbf7c2ff6ee..f13f5983d1f4b 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1,6 +1,7 @@ """ Testing for the gradient boosting module (sklearn.ensemble.gradient_boosting). """ + import re import warnings diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py index d287400c7999f..6fa4512ce39c6 100644 --- a/sklearn/experimental/enable_hist_gradient_boosting.py +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -6,6 +6,7 @@ :term:`experimental`, but these estimators are now stable and can be imported normally from `sklearn.ensemble`. """ + # Don't remove this file, we don't want to break users code just because the # feature isn't experimental anymore. diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ea6686ef45eaa..d50c489e6b852 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -409,8 +409,7 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): "Your stop_words may be inconsistent with " "your preprocessing. Tokenizing the stop " "words generated tokens %r not in " - "stop_words." - % sorted(inconsistent) + "stop_words." % sorted(inconsistent) ) return not inconsistent except Exception: @@ -516,8 +515,7 @@ def _validate_ngram_range(self): if min_n > max_m: raise ValueError( "Invalid value for ngram_range=%s " - "lower boundary larger than the upper boundary." - % str(self.ngram_range) + "lower boundary larger than the upper boundary." 
% str(self.ngram_range) ) def _warn_for_unused_params(self): diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 5a90d46c9758b..9c393724f9cea 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -1,6 +1,7 @@ """ Sequential feature selection """ + from numbers import Integral, Real import numpy as np diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 3815a88c374e8..d7bffec5159bf 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -1,6 +1,7 @@ """ Todo: cross-check the F-value with stats model """ + import itertools import warnings diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index d3723016be127..67bba2e29c857 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -456,9 +456,7 @@ def predict(self, X, return_std=False, return_cov=False): y_cov = self.kernel_(X) - V.T @ V # undo normalisation - y_cov = np.outer(y_cov, self._y_train_std**2).reshape( - *y_cov.shape, -1 - ) + y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1) # if y_cov has shape (n_samples, n_samples, 1), reshape to # (n_samples, n_samples) if y_cov.shape[2] == 1: @@ -483,9 +481,7 @@ def predict(self, X, return_std=False, return_cov=False): y_var[y_var_negative] = 0.0 # undo normalisation - y_var = np.outer(y_var, self._y_train_std**2).reshape( - *y_var.shape, -1 - ) + y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1) # if y_var has shape (n_samples, 1), reshape to (n_samples,) if y_var.shape[1] == 1: diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index 3b995c48b1f71..c31335696944c 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -1750,9 +1750,7 @@ def __call__(self, X, Y=None, eval_gradient=False): # We need to recompute the pairwise dimension-wise distances if self.anisotropic: - D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / ( - length_scale**2 - ) + D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / (length_scale**2) else: D = squareform(dists**2)[:, :, np.newaxis] diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py index 842159f13ac04..bd8bd39e1cc01 100644 --- a/sklearn/gaussian_process/tests/test_gpc.py +++ b/sklearn/gaussian_process/tests/test_gpc.py @@ -1,4 +1,4 @@ -"""Testing for Gaussian process classification """ +"""Testing for Gaussian process classification""" # Author: Jan Hendrik Metzen # License: BSD 3 clause @@ -218,8 +218,7 @@ def test_warning_bounds(): assert issubclass(record[0].category, ConvergenceWarning) assert ( - record[0].message.args[0] - == "The optimal value found for " + record[0].message.args[0] == "The optimal value found for " "dimension 0 of parameter " "k1__noise_level is close to the " "specified upper bound 0.001. " @@ -229,8 +228,7 @@ def test_warning_bounds(): assert issubclass(record[1].category, ConvergenceWarning) assert ( - record[1].message.args[0] - == "The optimal value found for " + record[1].message.args[0] == "The optimal value found for " "dimension 0 of parameter " "k2__length_scale is close to the " "specified lower bound 1000.0. 
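The assertion rewrites in the two Gaussian-process test files below lean on implicit concatenation of adjacent string literals: moving the first fragment up onto the `==` line changes only the layout, not the compared value, because adjacent literals are joined before the comparison is evaluated. A quick self-contained check of that equivalence (string content invented for illustration):

    expected = (
        "The optimal value found for "
        "dimension 0 of parameter k1__noise_level"
    )
    # adjacent literals concatenate before `==` is evaluated
    assert expected == "The optimal value found for " "dimension 0 of parameter k1__noise_level"
    print("layout change is behavior-preserving")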
" @@ -250,8 +248,7 @@ def test_warning_bounds(): assert issubclass(record[0].category, ConvergenceWarning) assert ( - record[0].message.args[0] - == "The optimal value found for " + record[0].message.args[0] == "The optimal value found for " "dimension 0 of parameter " "length_scale is close to the " "specified upper bound 100.0. " @@ -261,8 +258,7 @@ def test_warning_bounds(): assert issubclass(record[1].category, ConvergenceWarning) assert ( - record[1].message.args[0] - == "The optimal value found for " + record[1].message.args[0] == "The optimal value found for " "dimension 1 of parameter " "length_scale is close to the " "specified upper bound 100.0. " diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index d890dc05d9f02..e280827926d28 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -1,4 +1,4 @@ -"""Testing for Gaussian process regression """ +"""Testing for Gaussian process regression""" # Author: Jan Hendrik Metzen # Modified by: Pete Green @@ -493,8 +493,7 @@ def test_warning_bounds(): assert issubclass(record[0].category, ConvergenceWarning) assert ( - record[0].message.args[0] - == "The optimal value found for " + record[0].message.args[0] == "The optimal value found for " "dimension 0 of parameter " "k1__noise_level is close to the " "specified upper bound 0.001. " @@ -504,8 +503,7 @@ def test_warning_bounds(): assert issubclass(record[1].category, ConvergenceWarning) assert ( - record[1].message.args[0] - == "The optimal value found for " + record[1].message.args[0] == "The optimal value found for " "dimension 0 of parameter " "k2__length_scale is close to the " "specified lower bound 1000.0. " @@ -525,8 +523,7 @@ def test_warning_bounds(): assert issubclass(record[0].category, ConvergenceWarning) assert ( - record[0].message.args[0] - == "The optimal value found for " + record[0].message.args[0] == "The optimal value found for " "dimension 0 of parameter " "length_scale is close to the " "specified lower bound 10.0. " @@ -536,8 +533,7 @@ def test_warning_bounds(): assert issubclass(record[1].category, ConvergenceWarning) assert ( - record[1].message.args[0] - == "The optimal value found for " + record[1].message.args[0] == "The optimal value found for " "dimension 1 of parameter " "length_scale is close to the " "specified lower bound 10.0. 
" diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py index e305bc2a657dc..380bcecaf65b5 100644 --- a/sklearn/impute/__init__.py +++ b/sklearn/impute/__init__.py @@ -1,4 +1,5 @@ """Transformers for missing value imputation""" + import typing from ._base import MissingIndicator, SimpleImputer diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index af298ae8c380e..04a4dffd10e68 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -701,9 +701,8 @@ def inverse_transform(self, X): def _more_tags(self): return { - "allow_nan": is_pandas_na(self.missing_values) or is_scalar_nan( - self.missing_values - ) + "allow_nan": is_pandas_na(self.missing_values) + or is_scalar_nan(self.missing_values) } def get_feature_names_out(self, input_features=None): diff --git a/sklearn/inspection/__init__.py b/sklearn/inspection/__init__.py index f8e08785e8358..f254967f96166 100644 --- a/sklearn/inspection/__init__.py +++ b/sklearn/inspection/__init__.py @@ -1,6 +1,5 @@ """The :mod:`sklearn.inspection` module includes tools for model inspection.""" - from ._partial_dependence import partial_dependence from ._permutation_importance import permutation_importance from ._plot.decision_boundary import DecisionBoundaryDisplay diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index b052609a85a2b..3cb4999eb0833 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -1,6 +1,7 @@ """ Testing for the partial dependence module. """ + import warnings import numpy as np diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index 2869e84c78bf8..8b3ed78cdd368 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -437,9 +437,7 @@ def test_permutation_importance_sample_weight(): # the second half of the samples approaches to infinity, the ratio of # the two features importance should equal to 2 on expectation (when using # mean absolutes error as the loss function). - w = np.hstack( - [np.repeat(10.0**10, n_half_samples), np.repeat(1.0, n_half_samples)] - ) + w = np.hstack([np.repeat(10.0**10, n_half_samples), np.repeat(1.0, n_half_samples)]) lr.fit(x, y, w) pi = permutation_importance( lr, diff --git a/sklearn/linear_model/_glm/_newton_solver.py b/sklearn/linear_model/_glm/_newton_solver.py index fa9b431fd2377..0b6adbe44e686 100644 --- a/sklearn/linear_model/_glm/_newton_solver.py +++ b/sklearn/linear_model/_glm/_newton_solver.py @@ -502,8 +502,7 @@ def inner_solve(self, X, y, sample_weight): "Further options are to use another solver or to avoid such situation " "in the first place. Possible remedies are removing collinear features" " of X or increasing the penalization strengths.\n" - "The original Linear Algebra message was:\n" - + str(e), + "The original Linear Algebra message was:\n" + str(e), scipy.linalg.LinAlgWarning, ) # Possible causes: diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 5256a5f370272..26f6bdc08d254 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1107,6 +1107,5 @@ def test_newton_solver_verbosity(capsys, verbose): if verbose >= 1: assert ( "The inner solver detected a pointwise Hessian with many negative values" - " and resorts to lbfgs instead." 
- in captured.out + " and resorts to lbfgs instead." in captured.out ) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index efea6c6b4c5f9..4e038ecb28da9 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2,6 +2,7 @@ Least Angle Regression algorithm. See the documentation on the Generalized Linear Model for a complete discussion. """ + # Author: Fabian Pedregosa # Alexandre Gramfort # Gael Varoquaux @@ -1737,8 +1738,7 @@ def fit(self, X, y, **params): if hasattr(Gram, "__array__"): warnings.warn( 'Parameter "precompute" cannot be an array in ' - '%s. Automatically switch to "auto" instead.' - % self.__class__.__name__ + '%s. Automatically switch to "auto" instead.' % self.__class__.__name__ ) Gram = "auto" diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 4255706e284f1..e8c1466b30623 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -1,6 +1,7 @@ """ Loss functions for linear models with raw_prediction = X @ coef """ + import numpy as np from scipy import sparse diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 259ce54d3f11e..a8ecc29715886 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1246,8 +1246,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "This solver needs samples of at least 2 classes" " in the data, but the data contains only one" - " class: %r" - % classes_[0] + " class: %r" % classes_[0] ) if len(self.classes_) == 2: @@ -1787,8 +1786,7 @@ def fit(self, X, y, sample_weight=None, **params): ): raise ValueError( "l1_ratios must be a list of numbers between " - "0 and 1; got (l1_ratios=%r)" - % self.l1_ratios + "0 and 1; got (l1_ratios=%r)" % self.l1_ratios ) l1_ratios_ = self.l1_ratios else: @@ -1856,8 +1854,7 @@ def fit(self, X, y, sample_weight=None, **params): raise ValueError( "This solver needs samples of at least 2 classes" " in the data, but the data contains only one" - " class: %r" - % classes[0] + " class: %r" % classes[0] ) if n_classes == 2: diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index efac0508963ba..2d6fe48869742 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -1,5 +1,4 @@ -"""Orthogonal matching pursuit algorithms -""" +"""Orthogonal matching pursuit algorithms""" # Author: Vlad Niculae # diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 67187bbdb5934..e0fad5d8be8b8 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -1358,8 +1358,7 @@ def predict_proba(self, X): raise NotImplementedError( "predict_(log_)proba only supported when" " loss='log_loss' or loss='modified_huber' " - "(%r given)" - % self.loss + "(%r given)" % self.loss ) @available_if(_check_proba) diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index 659ff134198db..230966db1ceaf 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -4,6 +4,7 @@ Note that correctness of losses (which compose LinearModelLoss) is already well covered in the _loss module. 
""" + import numpy as np import pytest from numpy.testing import assert_allclose diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index f1707fad1c950..2e2e262183a17 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -650,7 +650,8 @@ def __init__( def _more_tags(self): return { - "pairwise": self.affinity in [ + "pairwise": self.affinity + in [ "precomputed", "precomputed_nearest_neighbors", ] diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 713c5fe651dbb..8a818c885043c 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -3,7 +3,6 @@ and pairwise metrics and distance computations. """ - from . import cluster from ._classification import ( accuracy_score, diff --git a/sklearn/metrics/_base.py b/sklearn/metrics/_base.py index 53ff14b039e0c..c344008755004 100644 --- a/sklearn/metrics/_base.py +++ b/sklearn/metrics/_base.py @@ -2,6 +2,7 @@ Common code for all metrics. """ + # Authors: Alexandre Gramfort # Mathieu Blondel # Olivier Grisel diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 999d3795b8dd9..c5290fd39eb7e 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -583,8 +583,7 @@ def multilabel_confusion_matrix( raise ValueError( "All labels must be in [0, n labels) for " "multilabel targets. " - "Got %d < 0" - % np.min(labels) + "Got %d < 0" % np.min(labels) ) if n_labels is not None: diff --git a/sklearn/metrics/cluster/__init__.py b/sklearn/metrics/cluster/__init__.py index a332997a84414..44da911061bc8 100644 --- a/sklearn/metrics/cluster/__init__.py +++ b/sklearn/metrics/cluster/__init__.py @@ -5,6 +5,7 @@ - supervised, which uses a ground truth class values for each sample. - unsupervised, which does not and measures the 'quality' of the model itself. """ + from ._bicluster import consensus_score from ._supervised import ( adjusted_mutual_info_score, diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index ec26ef7dcd399..bbebe2cba2197 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -2217,8 +2217,7 @@ def test_recall_warnings(zero_division): ) if zero_division == "warn": assert ( - str(record.pop().message) - == "Recall is ill-defined and " + str(record.pop().message) == "Recall is ill-defined and " "being set to 0.0 due to no true samples." " Use `zero_division` parameter to control" " this behavior." @@ -2229,8 +2228,7 @@ def test_recall_warnings(zero_division): recall_score([0, 0], [0, 0]) if zero_division == "warn": assert ( - str(record.pop().message) - == "Recall is ill-defined and " + str(record.pop().message) == "Recall is ill-defined and " "being set to 0.0 due to no true samples." " Use `zero_division` parameter to control" " this behavior." @@ -2249,8 +2247,7 @@ def test_precision_warnings(zero_division): ) if zero_division == "warn": assert ( - str(record.pop().message) - == "Precision is ill-defined and " + str(record.pop().message) == "Precision is ill-defined and " "being set to 0.0 due to no predicted samples." " Use `zero_division` parameter to control" " this behavior." 
@@ -2261,8 +2258,7 @@ def test_precision_warnings(zero_division): precision_score([0, 0], [0, 0]) if zero_division == "warn": assert ( - str(record.pop().message) - == "Precision is ill-defined and " + str(record.pop().message) == "Precision is ill-defined and " "being set to 0.0 due to no predicted samples." " Use `zero_division` parameter to control" " this behavior." @@ -2307,8 +2303,7 @@ def test_fscore_warnings(zero_division): ) if zero_division == "warn": assert ( - str(record.pop().message) - == "F-score is ill-defined and " + str(record.pop().message) == "F-score is ill-defined and " "being set to 0.0 due to no true nor predicted " "samples. Use `zero_division` parameter to " "control this behavior." diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index e361ce8f61a1c..fda1a83702bbf 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -1,4 +1,5 @@ """Bayesian Gaussian Mixture Model.""" + # Author: Wei Xue # Thierry Guillemot # License: BSD 3 clause diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 6b546c6bc9441..9b9072f1491a2 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -484,8 +484,7 @@ def score(self, X, y=None, **params): if self.scorer_ is None: raise ValueError( "No score function explicitly defined, " - "and the estimator doesn't provide one %s" - % self.best_estimator_ + "and the estimator doesn't provide one %s" % self.best_estimator_ ) if isinstance(self.scorer_, dict): if self.multimetric_: diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 2afb9ae6adce7..fa425a5e6a18b 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1,4 +1,5 @@ """Test the split module""" + import re import warnings from itertools import combinations, combinations_with_replacement, permutations diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 22306d88e021f..43916d8cecb2e 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1,4 +1,5 @@ """Test the validation module""" + import os import re import sys diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index e1e8bdbb09d7c..776d462928fbb 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -1,4 +1,5 @@ """Base and mixin classes for nearest neighbors.""" + # Authors: Jake Vanderplas # Fabian Pedregosa # Alexandre Gramfort @@ -444,8 +445,7 @@ def _check_algorithm_metric(self): raise ValueError( "kd_tree does not support callable metric '%s'" "Function call overhead will result" - "in very poor performance." - % self.metric + "in very poor performance." % self.metric ) elif self.metric not in VALID_METRICS[alg_check] and not isinstance( self.metric, DistanceMetric @@ -898,8 +898,7 @@ class from an array representing our data set and ask who's if issparse(X): raise ValueError( "%s does not work with sparse matrices. Densify the data, " - "or set algorithm='brute'" - % self._fit_method + "or set algorithm='brute'" % self._fit_method ) chunked_results = Parallel(n_jobs, prefer="threads")( delayed(_tree_query_parallel_helper)( @@ -1253,8 +1252,7 @@ class from an array representing our data set and ask who's if issparse(X): raise ValueError( "%s does not work with sparse matrices. 
Densify the data, " - "or set algorithm='brute'" - % self._fit_method + "or set algorithm='brute'" % self._fit_method ) n_jobs = effective_n_jobs(self.n_jobs) diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 8885fb4c8c5d0..a9e5fe011150a 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -2,6 +2,7 @@ Kernel Density Estimation ------------------------- """ + # Author: Jake Vanderplas import itertools from numbers import Integral, Real diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index a4ff66786340a..4185bbe15826b 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -1,4 +1,5 @@ """Unsupervised nearest neighbors learner""" + from ..base import _fit_context from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py index ee548d8017810..09c2501818fd3 100644 --- a/sklearn/neighbors/tests/test_nearest_centroid.py +++ b/sklearn/neighbors/tests/test_nearest_centroid.py @@ -1,6 +1,7 @@ """ Testing for the nearest centroid module. """ + import numpy as np import pytest from numpy.testing import assert_array_equal diff --git a/sklearn/neural_network/_base.py b/sklearn/neural_network/_base.py index 73d62f9543e98..60ef660ef917d 100644 --- a/sklearn/neural_network/_base.py +++ b/sklearn/neural_network/_base.py @@ -1,5 +1,4 @@ -"""Utilities for the neural network modules -""" +"""Utilities for the neural network modules""" # Author: Issam H. Laradji # License: BSD 3 clause diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index cc419b57f2410..f56f68ac852c2 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -1,5 +1,4 @@ -"""Multi-layer Perceptron -""" +"""Multi-layer Perceptron""" # Authors: Issam H. Laradji # Andreas Mueller @@ -755,8 +754,7 @@ def _check_solver(self): if self.solver not in _STOCHASTIC_SOLVERS: raise AttributeError( "partial_fit is only available for stochastic" - " optimizers. %s is not stochastic." - % self.solver + " optimizers. %s is not stochastic." % self.solver ) return True diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index e3814f45d3633..4b7f0f9422625 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -1,5 +1,4 @@ -"""Restricted Boltzmann Machine -""" +"""Restricted Boltzmann Machine""" # Authors: Yann N. Dauphin # Vlad Niculae diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py index d9fbaec0098d0..ab87300aff110 100644 --- a/sklearn/neural_network/_stochastic_optimizers.py +++ b/sklearn/neural_network/_stochastic_optimizers.py @@ -1,5 +1,4 @@ -"""Stochastic optimization methods for MLP -""" +"""Stochastic optimization methods for MLP""" # Authors: Jiyuan Qian # License: BSD 3 clause diff --git a/sklearn/neural_network/tests/test_mlp.py b/sklearn/neural_network/tests/test_mlp.py index 6b94e2703f7e1..64ad4c5edc019 100644 --- a/sklearn/neural_network/tests/test_mlp.py +++ b/sklearn/neural_network/tests/test_mlp.py @@ -732,8 +732,7 @@ def test_warm_start(): message = ( "warm_start can only be used where `y` has the same " "classes as in the previous call to fit." 
- " Previously got [0 1 2], `y` has %s" - % np.unique(y_i) + " Previously got [0 1 2], `y` has %s" % np.unique(y_i) ) with pytest.raises(ValueError, match=re.escape(message)): clf.fit(X, y_i) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 4ee0622c699b7..b26b83e66510f 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -2,6 +2,7 @@ The :mod:`sklearn.pipeline` module implements utilities to build a composite estimator, as a chain of transforms and estimators. """ + # Author: Edouard Duchesnay # Gael Varoquaux # Virgile Fritsch diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 2512f411a5a9c..f4c9fb032cfb0 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -1,6 +1,7 @@ """ This file contains preprocessing tools based on polynomials. """ + import collections from itertools import chain, combinations from itertools import combinations_with_replacement as combinations_w_r diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index c8c0193ac9b0b..886a805960d52 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -22,6 +22,7 @@ and can even be taken to be an orthogonal projection. """ + # Authors: Olivier Grisel , # Arnaud Joly # License: BSD 3 clause diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 8812c3c352a03..4b046aa111250 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -1,4 +1,4 @@ -""" test the label propagation module """ +"""test the label propagation module""" import warnings diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 6d154c99dc669..47d4027c50754 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -297,8 +297,7 @@ def _warn_from_fit_status(self): warnings.warn( "Solver terminated early (max_iter=%i)." " Consider pre-processing your data with" - " StandardScaler or MinMaxScaler." - % self.max_iter, + " StandardScaler or MinMaxScaler." 
% self.max_iter, ConvergenceWarning, ) @@ -1174,8 +1173,7 @@ def _fit_liblinear( raise ValueError( "This solver needs samples of at least 2 classes" " in the data, but the data contains only one" - " class: %r" - % classes_[0] + " class: %r" % classes_[0] ) class_weight_ = compute_class_weight(class_weight, classes=classes_, y=y) diff --git a/sklearn/svm/_bounds.py b/sklearn/svm/_bounds.py index d14297230af4c..b02720637c03b 100644 --- a/sklearn/svm/_bounds.py +++ b/sklearn/svm/_bounds.py @@ -1,4 +1,5 @@ """Determination of parameter bounds""" + # Author: Paolo Losi # License: BSD 3 clause diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index e1c6e36af28fb..f728136b0f98c 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -3,6 +3,7 @@ TODO: remove hard coded numerical results when possible """ + import re import numpy as np diff --git a/sklearn/tests/random_seed.py b/sklearn/tests/random_seed.py index 0fffd57a1016d..ecda17e36d2bf 100644 --- a/sklearn/tests/random_seed.py +++ b/sklearn/tests/random_seed.py @@ -8,6 +8,7 @@ https://scikit-learn.org/dev/computing/parallelism.html#sklearn-tests-global-random-seed """ + from os import environ from random import Random diff --git a/sklearn/tests/test_build.py b/sklearn/tests/test_build.py index 72cab1dfcb174..40a960cba6283 100644 --- a/sklearn/tests/test_build.py +++ b/sklearn/tests/test_build.py @@ -15,7 +15,8 @@ def test_openmp_parallelism_enabled(): pytest.skip("test explicitly skipped (SKLEARN_SKIP_OPENMP_TEST)") base_url = "dev" if __version__.endswith(".dev0") else "stable" - err_msg = textwrap.dedent(""" + err_msg = textwrap.dedent( + """ This test fails because scikit-learn has been built without OpenMP. This is not recommended since some estimators will run in sequential mode instead of leveraging thread-based parallelism. @@ -27,6 +28,7 @@ def test_openmp_parallelism_enabled(): You can skip this test by setting the environment variable SKLEARN_SKIP_OPENMP_TEST to any value. - """).format(base_url) + """ + ).format(base_url) assert _openmp_parallelism_enabled(), err_msg diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index fccc58f9fa2a5..ea84eec258d83 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -255,11 +255,13 @@ def test_all_tests_are_importable(): # Ensure that for each contentful subpackage, there is a test directory # within it that is also a subpackage (i.e. a directory with __init__.py) - HAS_TESTS_EXCEPTIONS = re.compile(r"""(?x) + HAS_TESTS_EXCEPTIONS = re.compile( + r"""(?x) \.externals(\.|$)| \.tests(\.|$)| \._ - """) + """ + ) resource_modules = { "sklearn.datasets.data", "sklearn.datasets.descr", diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index b3c6820faefc2..e06d2f59a6c10 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -1,4 +1,5 @@ """Common tests for metaestimators""" + import functools from inspect import signature diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index f5ed64a094063..150dcc287e651 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1,6 +1,7 @@ """ Test the pipeline module. 
""" + import itertools import re import shutil diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index f8c612b6029c2..cd4a106ee7606 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -1,6 +1,7 @@ """ Testing for export functions of decision trees (sklearn.tree.export). """ + from io import StringIO from re import finditer, search from textwrap import dedent @@ -375,12 +376,14 @@ def test_export_text(): clf = DecisionTreeClassifier(max_depth=2, random_state=0) clf.fit(X, y) - expected_report = dedent(""" + expected_report = dedent( + """ |--- feature_1 <= 0.00 | |--- class: -1 |--- feature_1 > 0.00 | |--- class: 1 - """).lstrip() + """ + ).lstrip() assert export_text(clf) == expected_report # testing that leaves at level 1 are not truncated @@ -388,32 +391,38 @@ def test_export_text(): # testing that the rest of the tree is truncated assert export_text(clf, max_depth=10) == expected_report - expected_report = dedent(""" + expected_report = dedent( + """ |--- feature_1 <= 0.00 | |--- weights: [3.00, 0.00] class: -1 |--- feature_1 > 0.00 | |--- weights: [0.00, 3.00] class: 1 - """).lstrip() + """ + ).lstrip() assert export_text(clf, show_weights=True) == expected_report - expected_report = dedent(""" + expected_report = dedent( + """ |- feature_1 <= 0.00 | |- class: -1 |- feature_1 > 0.00 | |- class: 1 - """).lstrip() + """ + ).lstrip() assert export_text(clf, spacing=1) == expected_report X_l = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, 1]] y_l = [-1, -1, -1, 1, 1, 1, 2] clf = DecisionTreeClassifier(max_depth=4, random_state=0) clf.fit(X_l, y_l) - expected_report = dedent(""" + expected_report = dedent( + """ |--- feature_1 <= 0.00 | |--- class: -1 |--- feature_1 > 0.00 | |--- truncated branch of depth 2 - """).lstrip() + """ + ).lstrip() assert export_text(clf, max_depth=0) == expected_report X_mo = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -422,12 +431,14 @@ def test_export_text(): reg = DecisionTreeRegressor(max_depth=2, random_state=0) reg.fit(X_mo, y_mo) - expected_report = dedent(""" + expected_report = dedent( + """ |--- feature_1 <= 0.0 | |--- value: [-1.0, -1.0] |--- feature_1 > 0.0 | |--- value: [1.0, 1.0] - """).lstrip() + """ + ).lstrip() assert export_text(reg, decimals=1) == expected_report assert export_text(reg, decimals=1, show_weights=True) == expected_report @@ -435,12 +446,14 @@ def test_export_text(): reg = DecisionTreeRegressor(max_depth=2, random_state=0) reg.fit(X_single, y_mo) - expected_report = dedent(""" + expected_report = dedent( + """ |--- first <= 0.0 | |--- value: [-1.0, -1.0] |--- first > 0.0 | |--- value: [1.0, 1.0] - """).lstrip() + """ + ).lstrip() assert export_text(reg, decimals=1, feature_names=["first"]) == expected_report assert ( export_text(reg, decimals=1, show_weights=True, feature_names=["first"]) @@ -455,20 +468,24 @@ def test_export_text_feature_class_names_array_support(constructor): clf = DecisionTreeClassifier(max_depth=2, random_state=0) clf.fit(X, y) - expected_report = dedent(""" + expected_report = dedent( + """ |--- b <= 0.00 | |--- class: -1 |--- b > 0.00 | |--- class: 1 - """).lstrip() + """ + ).lstrip() assert export_text(clf, feature_names=constructor(["a", "b"])) == expected_report - expected_report = dedent(""" + expected_report = dedent( + """ |--- feature_1 <= 0.00 | |--- class: cat |--- feature_1 > 0.00 | |--- class: dog - """).lstrip() + """ + ).lstrip() assert export_text(clf, class_names=constructor(["cat", "dog"])) == 
expected_report diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py index e647ba3a4f009..0207cc1205120 100644 --- a/sklearn/utils/_response.py +++ b/sklearn/utils/_response.py @@ -2,6 +2,7 @@ It allows to make uniform checks and validation. """ + import numpy as np from ..base import is_classifier diff --git a/sklearn/utils/_show_versions.py b/sklearn/utils/_show_versions.py index 89052e88b65fe..1431108477263 100644 --- a/sklearn/utils/_show_versions.py +++ b/sklearn/utils/_show_versions.py @@ -3,6 +3,7 @@ adapted from :func:`pandas.show_versions` """ + # License: BSD 3 clause import platform diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d2559cb66b2ad..b466a7765b819 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1461,8 +1461,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): " the fit method." " Estimators are only allowed to add private attributes" " either started with _ or ended" - " with _ but %s added" - % ", ".join(attrs_added_by_fit) + " with _ but %s added" % ", ".join(attrs_added_by_fit) ) # check that fit doesn't change any public attribute @@ -1477,8 +1476,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): " the fit method. Estimators are only allowed" " to change attributes started" " or ended with _, but" - " %s changed" - % ", ".join(attrs_changed_by_fit) + " %s changed" % ", ".join(attrs_changed_by_fit) ) @@ -2927,8 +2925,7 @@ def check_supervised_y_2d(name, estimator_orig): assert len(w) > 0, msg assert ( "DataConversionWarning('A column-vector y" - " was passed when a 1d array was expected" - in msg + " was passed when a 1d array was expected" in msg ) assert_allclose(y_pred.ravel(), y_pred_2d.ravel()) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index be93464353832..2fe7dbc3cc179 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -2,6 +2,7 @@ The :mod:`sklearn.utils.extmath` module includes utilities to perform optimal mathematical operations in scikit-learn that are not available in SciPy. """ + # Authors: Gael Varoquaux # Alexandre Gramfort # Alexandre T. Passos diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 8eca047b1a844..33be9f4ab3473 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -3,6 +3,7 @@ If you add content to this file, please give the version of the package at which the fix is no longer needed. """ + # Authors: Emmanuelle Gouillart # Gael Varoquaux # Fabian Pedregosa diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 024b0bcaf95ee..d79f514aae778 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -8,6 +8,7 @@ regression with large design matrix), this approach gives very significant speedups. 
""" + # This is a modified file from scipy.optimize # Original authors: Travis Oliphant, Eric Jones # Modifications by Gael Varoquaux, Mathieu Blondel and Tom Dupre la Tour diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index c167a7e9d8f59..5ec962433d7c0 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -703,9 +703,7 @@ def test_incremental_weighted_mean_and_variance_simple(rng, dtype): mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0, sample_weight=sample_weight) expected_mean = np.average(X, weights=sample_weight, axis=0) - expected_var = ( - np.average(X**2, weights=sample_weight, axis=0) - expected_mean**2 - ) + expected_var = np.average(X**2, weights=sample_weight, axis=0) - expected_mean**2 assert_almost_equal(mean, expected_mean) assert_almost_equal(var, expected_var) diff --git a/sklearn/utils/tests/test_fast_dict.py b/sklearn/utils/tests/test_fast_dict.py index 8fada45db3f52..c44250c36daac 100644 --- a/sklearn/utils/tests/test_fast_dict.py +++ b/sklearn/utils/tests/test_fast_dict.py @@ -1,5 +1,5 @@ -""" Test fast_dict. -""" +"""Test fast_dict.""" + import numpy as np from numpy.testing import assert_allclose, assert_array_equal From 87c90fd861c97872ab1f247c82ca47efada282e4 Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 23 May 2024 19:24:31 -0400 Subject: [PATCH 21/29] initial pass at refactoring DepthFirstTreeBuilder.build --- sklearn/tree/_tree.pxd | 75 +++++++ sklearn/tree/_tree.pyx | 442 +++++++++++++++++++++-------------------- 2 files changed, 301 insertions(+), 216 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 2267b4306e261..635d3c5fece07 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -43,6 +43,81 @@ cdef struct ParentInfo: float64_t impurity # the impurity of the parent intp_t n_constant_features # the number of constant features found in parent +ctypedef intp_t (*AddOrUpdateNodeFunc)( + Tree tree, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left +) except -1 nogil + +# A record on the stack for depth-first tree growing +cdef struct StackRecord: + intp_t start + intp_t end + intp_t depth + intp_t parent + bint is_left + float64_t impurity + intp_t n_constant_features + float64_t lower_bound + float64_t upper_bound + +cdef extern from "" namespace "std" nogil: + cdef cppclass stack[T]: + ctypedef T value_type + stack() except + + bint empty() + void pop() + void push(T&) except + # Raise c++ exception for bad_alloc -> MemoryError + T& top() + +cdef struct BuildEnv: + # Parameters + intp_t max_depth + intp_t min_samples_leaf + float64_t min_weight_leaf + intp_t min_samples_split + float64_t min_impurity_decrease + + unsigned char store_leaf_values + + # Initial capacity + intp_t init_capacity + bint first + + intp_t start + intp_t end + intp_t depth + intp_t parent + bint is_left + intp_t n_node_samples + float64_t weighted_n_node_samples + intp_t node_id + float64_t right_child_min, left_child_min, right_child_max, left_child_max + + SplitRecord* split_ptr + + float64_t middle_value + bint is_leaf + intp_t max_depth_seen + + intp_t rc + + stack[StackRecord] builder_stack + stack[StackRecord] update_stack + stack[StackRecord]* target_stack + StackRecord stack_record + + ParentInfo parent_record + + AddOrUpdateNodeFunc add_or_update_node + + cdef class BaseTree: # Inner structures: values 
are stored separately from node structure, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 418eae57e4995..4efb0db5f09c6 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -48,14 +48,6 @@ cdef extern from "numpy/arrayobject.h": void* data, intp_t flags, object obj) intp_t PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) -cdef extern from "" namespace "std" nogil: - cdef cppclass stack[T]: - ctypedef T value_type - stack() except + - bint empty() - void pop() - void push(T&) except + # Raise c++ exception for bad_alloc -> MemoryError - T& top() # ============================================================================= # Types and constants @@ -161,19 +153,44 @@ cdef class TreeBuilder: # Depth first builder --------------------------------------------------------- -# A record on the stack for depth-first tree growing -cdef struct StackRecord: - intp_t start - intp_t end - intp_t depth - intp_t parent - bint is_left - float64_t impurity - intp_t n_constant_features - float64_t lower_bound - float64_t upper_bound +cdef intp_t tree_add_node( + Tree tree, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left +) except -1 nogil: + return tree._add_node( + parent, is_left, is_leaf, + split_node, impurity, + n_node_samples, weighted_n_node_samples, + missing_go_to_left + ) + +cdef intp_t tree_update_node( + Tree tree, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left +) except -1 nogil: + return tree._update_node( + parent, is_left, is_leaf, + split_node, impurity, + n_node_samples, weighted_n_node_samples, + missing_go_to_left + ) + cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -285,31 +302,32 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # check input X, y, sample_weight = self._check_input(X, y, sample_weight) - # Parameters cdef Splitter splitter = self.splitter - cdef intp_t max_depth = self.max_depth - cdef intp_t min_samples_leaf = self.min_samples_leaf - cdef float64_t min_weight_leaf = self.min_weight_leaf - cdef intp_t min_samples_split = self.min_samples_split - cdef float64_t min_impurity_decrease = self.min_impurity_decrease - - cdef unsigned char store_leaf_values = self.store_leaf_values + cdef SplitRecord split cdef cnp.ndarray initial_roots = self.initial_roots + cdef BuildEnv e + e.max_depth = self.max_depth + e.min_samples_leaf = self.min_samples_leaf + e.min_weight_leaf = self.min_weight_leaf + e.min_samples_split = self.min_samples_split + e.min_impurity_decrease = self.min_impurity_decrease + + e.store_leaf_values = self.store_leaf_values + # Initial capacity - cdef intp_t init_capacity - cdef bint first = 0 + e.first = 0 if initial_roots is None: # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight, missing_values_in_feature_mask) if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 + e.init_capacity = (2 ** (tree.max_depth + 1)) - 1 else: - init_capacity = 2047 + e.init_capacity = 2047 - tree._resize(init_capacity) - first = 1 + tree._resize(e.init_capacity) + e.first = 1 else: # convert numpy array back to dict false_roots = {} @@ -319,39 +337,24 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # reset the root array self.initial_roots = 
None - cdef intp_t start = 0 - cdef intp_t end = 0 - cdef intp_t depth - cdef intp_t parent - cdef bint is_left - cdef intp_t n_node_samples = splitter.n_samples - cdef float64_t weighted_n_node_samples - cdef intp_t node_id - cdef float64_t right_child_min, left_child_min, right_child_max, left_child_max - - cdef SplitRecord split - cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + e.start = 0 + e.end = 0 + e.n_node_samples = splitter.n_samples + e.split_ptr = malloc(splitter.pointer_size()) - cdef float64_t middle_value - cdef bint is_leaf - cdef intp_t max_depth_seen = -1 if first else tree.max_depth + e.max_depth_seen = -1 if e.first else tree.max_depth - cdef intp_t rc = 0 + e.rc = 0 - cdef stack[StackRecord] builder_stack - cdef stack[StackRecord] update_stack - cdef StackRecord stack_record + _init_parent_record(&e.parent_record) - cdef ParentInfo parent_record - _init_parent_record(&parent_record) - - if not first: + if not e.first: # push reached leaf nodes onto stack for key, value in reversed(sorted(false_roots.items())): - end += value[0] - update_stack.push({ - "start": start, - "end": end, + e.end += value[0] + e.update_stack.push({ + "start": e.start, + "end": e.end, "depth": value[1], "parent": key[0], "is_left": key[1], @@ -360,12 +363,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "lower_bound": -INFINITY, "upper_bound": INFINITY, }) - start += value[0] + e.start += value[0] else: # push root node onto stack - builder_stack.push({ + e.builder_stack.push({ "start": 0, - "end": n_node_samples, + "end": e.n_node_samples, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0, @@ -376,72 +379,75 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): }) with nogil: - while not update_stack.empty(): - stack_record = update_stack.top() - update_stack.pop() - - start = stack_record.start - end = stack_record.end - depth = stack_record.depth - parent = stack_record.parent - is_left = stack_record.is_left - parent_record.impurity = stack_record.impurity - parent_record.n_constant_features = stack_record.n_constant_features - parent_record.lower_bound = stack_record.lower_bound - parent_record.upper_bound = stack_record.upper_bound - - n_node_samples = end - start - splitter.node_reset(start, end, &weighted_n_node_samples) - - is_leaf = (depth >= max_depth or - n_node_samples < min_samples_split or - n_node_samples < 2 * min_samples_leaf or - weighted_n_node_samples < 2 * min_weight_leaf) - - if first: - parent_record.impurity = splitter.node_impurity() - first = 0 + e.target_stack = &e.update_stack + e.add_or_update_node = tree_update_node + while not e.target_stack.empty(): + e.stack_record = e.target_stack.top() + e.target_stack.pop() + + e.start = e.stack_record.start + e.end = e.stack_record.end + e.depth = e.stack_record.depth + e.parent = e.stack_record.parent + e.is_left = e.stack_record.is_left + e.parent_record.impurity = e.stack_record.impurity + e.parent_record.n_constant_features = e.stack_record.n_constant_features + e.parent_record.lower_bound = e.stack_record.lower_bound + e.parent_record.upper_bound = e.stack_record.upper_bound + + e.n_node_samples = e.end - e.start + splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) + + e.is_leaf = (e.depth >= e.max_depth or + e.n_node_samples < e.min_samples_split or + e.n_node_samples < 2 * e.min_samples_leaf or + e.weighted_n_node_samples < 2 * e.min_weight_leaf) + + if e.first: + e.parent_record.impurity = splitter.node_impurity() + e.first = 0 # impurity == 0 with tolerance due to rounding errors - 
is_leaf = is_leaf or parent_record.impurity <= EPSILON + e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON - if not is_leaf: + if not e.is_leaf: splitter.node_split( - &parent_record, - split_ptr, + &e.parent_record, + e.split_ptr, ) # assign local copy of SplitRecord to assign # pos, improvement, and impurity scores - split = deref(split_ptr) + split = deref(e.split_ptr) # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 - is_leaf = (is_leaf or split.pos >= end or + e.is_leaf = (e.is_leaf or split.pos >= e.end or (split.improvement + EPSILON < - min_impurity_decrease)) + e.min_impurity_decrease)) - node_id = tree._update_node(parent, is_left, is_leaf, split_ptr, - parent_record.impurity, - n_node_samples, weighted_n_node_samples, - split.missing_go_to_left) + e.node_id = e.add_or_update_node( + tree, e.parent, e.is_left, e.is_leaf, e.split_ptr, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + split.missing_go_to_left + ) - if node_id == INTPTR_MAX: - rc = -1 + if e.node_id == INTPTR_MAX: + e.rc = -1 break # Store value for all nodes, to facilitate tree/model # inspection and interpretation - splitter.node_value(tree.value + node_id * tree.value_stride) + splitter.node_value(tree.value + e.node_id * tree.value_stride) if splitter.with_monotonic_cst: splitter.clip_node_value( - tree.value + node_id * tree.value_stride, - parent_record.lower_bound, - parent_record.upper_bound + tree.value + e.node_id * tree.value_stride, + e.parent_record.lower_bound, + e.parent_record.upper_bound ) - if not is_leaf: + if not e.is_leaf: if ( not splitter.with_monotonic_cst or splitter.monotonic_cst[split.feature] == 0 @@ -451,126 +457,130 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Current bounds must always be propagated to both children. # If a monotonic constraint is active, bounds are used in # node value clipping. - left_child_min = right_child_min = parent_record.lower_bound - left_child_max = right_child_max = parent_record.upper_bound + e.left_child_min = e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.right_child_max = e.parent_record.upper_bound elif splitter.monotonic_cst[split.feature] == 1: # Split on a feature with monotonic increase constraint - left_child_min = parent_record.lower_bound - right_child_max = parent_record.upper_bound + e.left_child_min = e.parent_record.lower_bound + e.right_child_max = e.parent_record.upper_bound # Lower bound for right child and upper bound for left child # are set to the same value. - middle_value = splitter.criterion.middle_value() - right_child_min = middle_value - left_child_max = middle_value + e.middle_value = splitter.criterion.middle_value() + e.right_child_min = e.middle_value + e.left_child_max = e.middle_value else: # i.e. splitter.monotonic_cst[split.feature] == -1 # Split on a feature with monotonic decrease constraint - right_child_min = parent_record.lower_bound - left_child_max = parent_record.upper_bound + e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.parent_record.upper_bound # Lower bound for left child and upper bound for right child # are set to the same value. 
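# A compact view of the interval bookkeeping in these branches, assuming a
# criterion whose middle_value() lies between the two child node values
# (an illustrative summary of the surrounding logic, not new behavior):
#
#     mid = splitter.criterion.middle_value()
#     monotonic_cst[feature] == +1: left child gets  [lower_bound, mid]
#                                   right child gets [mid, upper_bound]
#     monotonic_cst[feature] == -1: left child gets  [mid, upper_bound]
#                                   right child gets [lower_bound, mid]
#
# Clipping each child's value into its interval is what keeps the fitted
# values monotone in the constrained feature across the whole subtree.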
- middle_value = splitter.criterion.middle_value() - left_child_min = middle_value - right_child_max = middle_value + e.middle_value = splitter.criterion.middle_value() + e.left_child_min = e.middle_value + e.right_child_max = e.middle_value # Push right child on stack - builder_stack.push({ + e.builder_stack.push({ "start": split.pos, - "end": end, - "depth": depth + 1, - "parent": node_id, + "end": e.end, + "depth": e.depth + 1, + "parent": e.node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": parent_record.n_constant_features, - "lower_bound": right_child_min, - "upper_bound": right_child_max, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.right_child_min, + "upper_bound": e.right_child_max, }) # Push left child on stack - builder_stack.push({ - "start": start, + e.builder_stack.push({ + "start": e.start, "end": split.pos, - "depth": depth + 1, - "parent": node_id, + "depth": e.depth + 1, + "parent": e.node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": parent_record.n_constant_features, - "lower_bound": left_child_min, - "upper_bound": left_child_max, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.left_child_min, + "upper_bound": e.left_child_max, }) - elif store_leaf_values and is_leaf: + elif e.store_leaf_values and e.is_leaf: # copy leaf values to leaf_values array - splitter.node_samples(tree.value_samples[node_id]) - - if depth > max_depth_seen: - max_depth_seen = depth - - while not builder_stack.empty(): - stack_record = builder_stack.top() - builder_stack.pop() - - start = stack_record.start - end = stack_record.end - depth = stack_record.depth - parent = stack_record.parent - is_left = stack_record.is_left - parent_record.impurity = stack_record.impurity - parent_record.n_constant_features = stack_record.n_constant_features - parent_record.lower_bound = stack_record.lower_bound - parent_record.upper_bound = stack_record.upper_bound - - n_node_samples = end - start - splitter.node_reset(start, end, &weighted_n_node_samples) - - is_leaf = (depth >= max_depth or - n_node_samples < min_samples_split or - n_node_samples < 2 * min_samples_leaf or - weighted_n_node_samples < 2 * min_weight_leaf) - - if first: - parent_record.impurity = splitter.node_impurity() - first=0 + splitter.node_samples(tree.value_samples[e.node_id]) + + if e.depth > e.max_depth_seen: + e.max_depth_seen = e.depth + + e.target_stack = &e.builder_stack + e.add_or_update_node = tree_add_node + while not e.target_stack.empty(): + e.stack_record = e.target_stack.top() + e.target_stack.pop() + + e.start = e.stack_record.start + e.end = e.stack_record.end + e.depth = e.stack_record.depth + e.parent = e.stack_record.parent + e.is_left = e.stack_record.is_left + e.parent_record.impurity = e.stack_record.impurity + e.parent_record.n_constant_features = e.stack_record.n_constant_features + e.parent_record.lower_bound = e.stack_record.lower_bound + e.parent_record.upper_bound = e.stack_record.upper_bound + + e.n_node_samples = e.end - e.start + splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) + + e.is_leaf = (e.depth >= e.max_depth or + e.n_node_samples < e.min_samples_split or + e.n_node_samples < 2 * e.min_samples_leaf or + e.weighted_n_node_samples < 2 * e.min_weight_leaf) + + if e.first: + e.parent_record.impurity = splitter.node_impurity() + e.first=0 # impurity == 0 with tolerance due to rounding errors - is_leaf = is_leaf or parent_record.impurity <= EPSILON + e.is_leaf = 
e.is_leaf or e.parent_record.impurity <= EPSILON - if not is_leaf: + if not e.is_leaf: splitter.node_split( - &parent_record, - split_ptr, + &e.parent_record, + e.split_ptr, ) # assign local copy of SplitRecord to assign # pos, improvement, and impurity scores - split = deref(split_ptr) + split = deref(e.split_ptr) # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 - is_leaf = (is_leaf or split.pos >= end or + e.is_leaf = (e.is_leaf or split.pos >= e.end or (split.improvement + EPSILON < - min_impurity_decrease)) + e.min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, - parent_record.impurity, n_node_samples, - weighted_n_node_samples, split.missing_go_to_left) + e.node_id = e.add_or_update_node( + tree, e.parent, e.is_left, e.is_leaf, e.split_ptr, + e.parent_record.impurity, e.n_node_samples, + e.weighted_n_node_samples, split.missing_go_to_left + ) - if node_id == INTPTR_MAX: - rc = -1 + if e.node_id == INTPTR_MAX: + e.rc = -1 break # Store value for all nodes, to facilitate tree/model # inspection and interpretation - splitter.node_value(tree.value + node_id * tree.value_stride) + splitter.node_value(tree.value + e.node_id * tree.value_stride) if splitter.with_monotonic_cst: splitter.clip_node_value( - tree.value + node_id * tree.value_stride, - parent_record.lower_bound, - parent_record.upper_bound + tree.value + e.node_id * tree.value_stride, + e.parent_record.lower_bound, + e.parent_record.upper_bound ) - if not is_leaf: + if not e.is_leaf: if ( not splitter.with_monotonic_cst or splitter.monotonic_cst[split.feature] == 0 @@ -580,71 +590,71 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Current bounds must always be propagated to both children. # If a monotonic constraint is active, bounds are used in # node value clipping. - left_child_min = right_child_min = parent_record.lower_bound - left_child_max = right_child_max = parent_record.upper_bound + e.left_child_min = e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.right_child_max = e.parent_record.upper_bound elif splitter.monotonic_cst[split.feature] == 1: # Split on a feature with monotonic increase constraint - left_child_min = parent_record.lower_bound - right_child_max = parent_record.upper_bound + e.left_child_min = e.parent_record.lower_bound + e.right_child_max = e.parent_record.upper_bound # Lower bound for right child and upper bound for left child # are set to the same value. - middle_value = splitter.criterion.middle_value() - right_child_min = middle_value - left_child_max = middle_value + e.middle_value = splitter.criterion.middle_value() + e.right_child_min = e.middle_value + e.left_child_max = e.middle_value else: # i.e. splitter.monotonic_cst[split.feature] == -1 # Split on a feature with monotonic decrease constraint - right_child_min = parent_record.lower_bound - left_child_max = parent_record.upper_bound + e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.parent_record.upper_bound # Lower bound for left child and upper bound for right child # are set to the same value. 
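# Both stack drains share this loop body; the only moving parts are the
# BuildEnv fields bound just before each `while` (taken from the code
# above):
#
#     e.target_stack = &e.update_stack        # first pass over prior leaves
#     e.add_or_update_node = tree_update_node
#
#     e.target_stack = &e.builder_stack       # growth pass for new nodes
#     e.add_or_update_node = tree_add_node
#
# tree_update_node and tree_add_node are thin nogil wrappers with the
# AddOrUpdateNodeFunc signature that forward to Tree._update_node and
# Tree._add_node, so the add-vs-update choice is made through a function
# pointer instead of by editing the loop body.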
- middle_value = splitter.criterion.middle_value() - left_child_min = middle_value - right_child_max = middle_value + e.middle_value = splitter.criterion.middle_value() + e.left_child_min = e.middle_value + e.right_child_max = e.middle_value # Push right child on stack - builder_stack.push({ + e.builder_stack.push({ "start": split.pos, - "end": end, - "depth": depth + 1, - "parent": node_id, + "end": e.end, + "depth": e.depth + 1, + "parent": e.node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": parent_record.n_constant_features, - "lower_bound": right_child_min, - "upper_bound": right_child_max, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.right_child_min, + "upper_bound": e.right_child_max, }) # Push left child on stack - builder_stack.push({ - "start": start, + e.builder_stack.push({ + "start": e.start, "end": split.pos, - "depth": depth + 1, - "parent": node_id, + "depth": e.depth + 1, + "parent": e.node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": parent_record.n_constant_features, - "lower_bound": left_child_min, - "upper_bound": left_child_max, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.left_child_min, + "upper_bound": e.left_child_max, }) - elif store_leaf_values and is_leaf: + elif e.store_leaf_values and e.is_leaf: # copy leaf values to leaf_values array - splitter.node_samples(tree.value_samples[node_id]) + splitter.node_samples(tree.value_samples[e.node_id]) - if depth > max_depth_seen: - max_depth_seen = depth + if e.depth > e.max_depth_seen: + e.max_depth_seen = e.depth - if rc >= 0: - rc = tree._resize_c(tree.node_count) + if e.rc >= 0: + e.rc = tree._resize_c(tree.node_count) - if rc >= 0: - tree.max_depth = max_depth_seen + if e.rc >= 0: + tree.max_depth = e.max_depth_seen # free the memory created for the SplitRecord pointer - free(split_ptr) + free(e.split_ptr) - if rc == -1: + if e.rc == -1: raise MemoryError() # Best first builder ---------------------------------------------------------- From 51da5864a6b3a6f95c4293fc3ed7f57ed124d328 Mon Sep 17 00:00:00 2001 From: scarliles Date: Tue, 28 May 2024 15:08:57 -0400 Subject: [PATCH 22/29] some renaming to make closure pattern more obvious --- sklearn/tree/_splitter.pxd | 14 ++++---- sklearn/tree/_splitter.pyx | 68 +++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 0aeb07c9606d4..66c83283f677d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -30,7 +30,7 @@ from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, ui # SO WHERE DOES THAT LEAVE US # - we can transform these into cpp vectors of structs # and with some minor casting irritations everything else works ok -ctypedef void* SplitConditionParameters +ctypedef void* SplitConditionEnv ctypedef bint (*SplitConditionFunction)( Splitter splitter, SplitRecord* current_split, @@ -38,15 +38,15 @@ ctypedef bint (*SplitConditionFunction)( bint missing_go_to_left, float64_t lower_bound, float64_t upper_bound, - SplitConditionParameters split_condition_parameters + SplitConditionEnv split_condition_env ) noexcept nogil -cdef struct SplitConditionTuple: +cdef struct SplitConditionClosure: SplitConditionFunction f - SplitConditionParameters p + SplitConditionEnv e cdef class SplitCondition: - cdef SplitConditionTuple t + cdef SplitConditionClosure c cdef class 
MinSamplesLeafCondition(SplitCondition): pass @@ -150,8 +150,8 @@ cdef class Splitter(BaseSplitter): cdef SplitCondition min_weight_leaf_condition cdef SplitCondition monotonic_constraint_condition - cdef vector[SplitConditionTuple] presplit_conditions - cdef vector[SplitConditionTuple] postsplit_conditions + cdef vector[SplitConditionClosure] presplit_conditions + cdef vector[SplitConditionClosure] postsplit_conditions cdef int init( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ff707817d3d60..c2f092bc18954 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -51,7 +51,7 @@ cdef bint min_sample_leaf_condition( bint missing_go_to_left, float64_t lower_bound, float64_t upper_bound, - SplitConditionParameters split_condition_parameters + SplitConditionEnv split_condition_env ) noexcept nogil: cdef intp_t min_samples_leaf = splitter.min_samples_leaf cdef intp_t end_non_missing = splitter.end - n_missing @@ -72,8 +72,8 @@ cdef bint min_sample_leaf_condition( cdef class MinSamplesLeafCondition(SplitCondition): def __cinit__(self): - self.t.f = min_sample_leaf_condition - self.t.p = NULL # min_samples is stored in splitter, which is already passed to f + self.c.f = min_sample_leaf_condition + self.c.e = NULL # min_samples is stored in splitter, which is already passed to f cdef bint min_weight_leaf_condition( Splitter splitter, @@ -82,7 +82,7 @@ cdef bint min_weight_leaf_condition( bint missing_go_to_left, float64_t lower_bound, float64_t upper_bound, - SplitConditionParameters split_condition_parameters + SplitConditionEnv split_condition_env ) noexcept nogil: cdef float64_t min_weight_leaf = splitter.min_weight_leaf @@ -95,8 +95,8 @@ cdef bint min_weight_leaf_condition( cdef class MinWeightLeafCondition(SplitCondition): def __cinit__(self): - self.t.f = min_weight_leaf_condition - self.t.p = NULL # min_weight_leaf is stored in splitter, which is already passed to f + self.c.f = min_weight_leaf_condition + self.c.e = NULL # min_weight_leaf is stored in splitter, which is already passed to f cdef bint monotonic_constraint_condition( Splitter splitter, @@ -105,7 +105,7 @@ cdef bint monotonic_constraint_condition( bint missing_go_to_left, float64_t lower_bound, float64_t upper_bound, - SplitConditionParameters split_condition_parameters + SplitConditionEnv split_condition_env ) noexcept nogil: if ( splitter.with_monotonic_cst and @@ -122,10 +122,10 @@ cdef bint monotonic_constraint_condition( cdef class MonotonicConstraintCondition(SplitCondition): def __cinit__(self): - self.t.f = monotonic_constraint_condition - self.t.p = NULL + self.c.f = monotonic_constraint_condition + self.c.e = NULL -# cdef struct HasDataParameters: +# cdef struct HasDataEnv: # int min_samples # cdef bint has_data_condition( @@ -135,24 +135,24 @@ cdef class MonotonicConstraintCondition(SplitCondition): # bint missing_go_to_left, # float64_t lower_bound, # float64_t upper_bound, -# SplitConditionParameters split_condition_parameters +# SplitConditionEnv split_condition_env # ) noexcept nogil: -# cdef HasDataParameters* p = split_condition_parameters -# return splitter.n_samples >= p.min_samples +# cdef HasDataEnv* e = split_condition_env +# return splitter.n_samples >= e.min_samples # cdef class HasDataCondition(SplitCondition): # def __cinit__(self, int min_samples): -# self.t.f = has_data_condition -# self.t.p = malloc(sizeof(HasDataParameters)) -# (self.t.p).min_samples = min_samples +# self.c.f = has_data_condition +# self.c.e = malloc(sizeof(HasDataEnv)) +# 
 
 #     def __dealloc__(self):
-#         if self.t.p is not NULL:
-#             free(self.t.p)
+#         if self.c.e is not NULL:
+#             free(self.c.e)
 
 #         super.__dealloc__(self)
 
-# cdef struct AlphaRegularityParameters:
+# cdef struct AlphaRegularityEnv:
 #     float64_t alpha
 
 # cdef bint alpha_regularity_condition(
@@ -162,21 +162,21 @@ cdef class MonotonicConstraintCondition(SplitCondition):
 #     bint missing_go_to_left,
 #     float64_t lower_bound,
 #     float64_t upper_bound,
-#     SplitConditionParameters split_condition_parameters
+#     SplitConditionEnv split_condition_env
 # ) noexcept nogil:
-#     cdef AlphaRegularityParameters* p = <AlphaRegularityParameters*>split_condition_parameters
+#     cdef AlphaRegularityEnv* e = <AlphaRegularityEnv*>split_condition_env
 
 #     return True
 
 # cdef class AlphaRegularityCondition(SplitCondition):
 #     def __cinit__(self, float64_t alpha):
-#         self.t.f = alpha_regularity_condition
-#         self.t.p = malloc(sizeof(AlphaRegularityParameters))
-#         (<AlphaRegularityParameters*>self.t.p).alpha = alpha
+#         self.c.f = alpha_regularity_condition
+#         self.c.e = malloc(sizeof(AlphaRegularityEnv))
+#         (<AlphaRegularityEnv*>self.c.e).alpha = alpha
 
 #     def __dealloc__(self):
-#         if self.t.p is not NULL:
-#             free(self.t.p)
+#         if self.c.e is not NULL:
+#             free(self.c.e)
 
 #         super.__dealloc__(self)
 
@@ -353,23 +353,23 @@ cdef class Splitter(BaseSplitter):
         )
 
         offset = 0
-        self.presplit_conditions[offset] = self.min_samples_leaf_condition.t
-        self.postsplit_conditions[offset] = self.min_weight_leaf_condition.t
+        self.presplit_conditions[offset] = self.min_samples_leaf_condition.c
+        self.postsplit_conditions[offset] = self.min_weight_leaf_condition.c
         offset += 1
 
         if(self.with_monotonic_cst):
             self.monotonic_constraint_condition = MonotonicConstraintCondition()
-            self.presplit_conditions[offset] = self.monotonic_constraint_condition.t
-            self.postsplit_conditions[offset] = self.monotonic_constraint_condition.t
+            self.presplit_conditions[offset] = self.monotonic_constraint_condition.c
+            self.postsplit_conditions[offset] = self.monotonic_constraint_condition.c
             offset += 1
 
         if presplit_conditions is not None:
             for i in range(len(presplit_conditions)):
-                self.presplit_conditions[i + offset] = presplit_conditions[i].t
+                self.presplit_conditions[i + offset] = presplit_conditions[i].c
 
         if postsplit_conditions is not None:
             for i in range(len(postsplit_conditions)):
-                self.postsplit_conditions[i + offset] = postsplit_conditions[i].t
+                self.postsplit_conditions[i + offset] = postsplit_conditions[i].c
 
 
     def __reduce__(self):
@@ -789,7 +789,7 @@ cdef inline intp_t node_split_best(
             for condition in splitter.presplit_conditions:
                 if not condition.f(
                     splitter, &current_split, n_missing, missing_go_to_left,
-                    lower_bound, upper_bound, condition.p
+                    lower_bound, upper_bound, condition.e
                 ):
                     conditions_hold = False
                     break
@@ -818,7 +818,7 @@ cdef inline intp_t node_split_best(
             for condition in splitter.postsplit_conditions:
                 if not condition.f(
                     splitter, &current_split, n_missing, missing_go_to_left,
-                    lower_bound, upper_bound, condition.p
+                    lower_bound, upper_bound, condition.e
                 ):
                     conditions_hold = False
                     break

From 6c117a22efbe0caf90a856c51a8cacbbe122b721 Mon Sep 17 00:00:00 2001
From: scarliles
Date: Tue, 28 May 2024 15:52:33 -0400
Subject: [PATCH 23/29] added SplitRecordFactory

---
 sklearn/tree/_splitter.pxd | 10 ++++++++++
 sklearn/tree/_splitter.pyx | 14 ++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 66c83283f677d..0f16f10538a62 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -71,6 +71,13 @@ cdef struct SplitRecord:
     unsigned char missing_go_to_left  # Controls if missing values go to the left node.
     intp_t n_missing                  # Number of missing values for the feature being split on
 
+ctypedef void* SplitRecordFactoryEnv
+ctypedef SplitRecord* (*SplitRecordFactory)(SplitRecordFactoryEnv env) except NULL nogil
+
+cdef struct SplitRecordFactoryClosure:
+    SplitRecordFactory f
+    SplitRecordFactoryEnv e
+
 cdef class BaseSplitter:
     """Abstract interface for splitter."""
 
@@ -100,6 +107,8 @@ cdef class BaseSplitter:
 
     cdef const float64_t[:] sample_weight
 
+    cdef SplitRecordFactoryClosure split_record_factory
+
     # The samples vector `samples` is maintained by the Splitter object such
     # that the samples contained in a node are contiguous. With this setting,
     # `node_split` reorganizes the node samples `samples[start:end]` in two
@@ -131,6 +140,7 @@ cdef class BaseSplitter:
     cdef void node_value(self, float64_t* dest) noexcept nogil
     cdef float64_t node_impurity(self) noexcept nogil
     cdef intp_t pointer_size(self) noexcept nogil
+    cdef SplitRecord* create_split_record(self) except NULL nogil
 
 cdef class Splitter(BaseSplitter):
     """Base class for supervised splitters."""
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index c2f092bc18954..66776e8bc5b38 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -20,7 +20,7 @@ from cython cimport final
 
 from libc.math cimport isnan
 from libc.stdint cimport uintptr_t
-from libc.stdlib cimport qsort, free
+from libc.stdlib cimport qsort, free, malloc
 from libc.string cimport memcpy
 
 from ._criterion cimport Criterion
@@ -202,6 +202,9 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil
     self.missing_go_to_left = False
     self.n_missing = 0
 
+cdef SplitRecord* _base_split_record_factory(SplitRecordFactoryEnv env) except NULL nogil:
+    return <SplitRecord*>malloc(sizeof(SplitRecord));
+
 cdef class BaseSplitter:
     """This is an abstract interface for splitters.
 
@@ -286,6 +289,9 @@ cdef class BaseSplitter:
         `SplitRecord`.
""" return sizeof(SplitRecord) + + cdef SplitRecord* create_split_record(self) except NULL nogil: + return self.split_record_factory.f(self.split_record_factory.e) cdef class Splitter(BaseSplitter): """Abstract interface for supervised splitters.""" @@ -352,7 +358,7 @@ cdef class Splitter(BaseSplitter): + (2 if self.with_monotonic_cst else 1) ) - offset = 0 + cdef int offset = 0 self.presplit_conditions[offset] = self.min_samples_leaf_condition.c self.postsplit_conditions[offset] = self.min_weight_leaf_condition.c offset += 1 @@ -363,6 +369,7 @@ cdef class Splitter(BaseSplitter): self.postsplit_conditions[offset] = self.monotonic_constraint_condition.c offset += 1 + cdef int i if presplit_conditions is not None: for i in range(len(presplit_conditions)): self.presplit_conditions[i + offset] = presplit_conditions[i].c @@ -370,6 +377,9 @@ cdef class Splitter(BaseSplitter): if postsplit_conditions is not None: for i in range(len(postsplit_conditions)): self.postsplit_conditions[i + offset] = postsplit_conditions[i].c + + self.split_record_factory.f = _base_split_record_factory + self.split_record_factory.e = NULL def __reduce__(self): From 9e7b1313bd8656ab0d3dddcd507fd468b8bccc62 Mon Sep 17 00:00:00 2001 From: scarliles Date: Tue, 28 May 2024 16:10:42 -0400 Subject: [PATCH 24/29] SplitRecordFactory progress --- sklearn/tree/_tree.pxd | 2 +- sklearn/tree/_tree.pyx | 61 ++++++++++++++++++------------------------ 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 635d3c5fece07..dd0ebcd0aa251 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -100,7 +100,7 @@ cdef struct BuildEnv: intp_t node_id float64_t right_child_min, left_child_min, right_child_max, left_child_max - SplitRecord* split_ptr + SplitRecord* split float64_t middle_value bint is_leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 4efb0db5f09c6..2dfad80df4204 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -303,7 +303,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): X, y, sample_weight = self._check_input(X, y, sample_weight) cdef Splitter splitter = self.splitter - cdef SplitRecord split cdef cnp.ndarray initial_roots = self.initial_roots cdef BuildEnv e @@ -340,7 +339,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): e.start = 0 e.end = 0 e.n_node_samples = splitter.n_samples - e.split_ptr = malloc(splitter.pointer_size()) + e.split = self.splitter.create_split_record() e.max_depth_seen = -1 if e.first else tree.max_depth @@ -413,24 +412,20 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not e.is_leaf: splitter.node_split( &e.parent_record, - e.split_ptr, + e.split, ) - # assign local copy of SplitRecord to assign - # pos, improvement, and impurity scores - split = deref(e.split_ptr) - # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 - e.is_leaf = (e.is_leaf or split.pos >= e.end or - (split.improvement + EPSILON < + e.is_leaf = (e.is_leaf or e.split.pos >= e.end or + (e.split.improvement + EPSILON < e.min_impurity_decrease)) e.node_id = e.add_or_update_node( - tree, e.parent, e.is_left, e.is_leaf, e.split_ptr, + tree, e.parent, e.is_left, e.is_leaf, e.split, e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, - split.missing_go_to_left + e.split.missing_go_to_left ) if e.node_id == INTPTR_MAX: @@ -450,7 +445,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not e.is_leaf: if ( not 
splitter.with_monotonic_cst or - splitter.monotonic_cst[split.feature] == 0 + splitter.monotonic_cst[e.split.feature] == 0 ): # Split on a feature with no monotonicity constraint @@ -459,7 +454,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # node value clipping. e.left_child_min = e.right_child_min = e.parent_record.lower_bound e.left_child_max = e.right_child_max = e.parent_record.upper_bound - elif splitter.monotonic_cst[split.feature] == 1: + elif splitter.monotonic_cst[e.split.feature] == 1: # Split on a feature with monotonic increase constraint e.left_child_min = e.parent_record.lower_bound e.right_child_max = e.parent_record.upper_bound @@ -469,7 +464,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): e.middle_value = splitter.criterion.middle_value() e.right_child_min = e.middle_value e.left_child_max = e.middle_value - else: # i.e. splitter.monotonic_cst[split.feature] == -1 + else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 # Split on a feature with monotonic decrease constraint e.right_child_min = e.parent_record.lower_bound e.left_child_max = e.parent_record.upper_bound @@ -482,12 +477,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Push right child on stack e.builder_stack.push({ - "start": split.pos, + "start": e.split.pos, "end": e.end, "depth": e.depth + 1, "parent": e.node_id, "is_left": 0, - "impurity": split.impurity_right, + "impurity": e.split.impurity_right, "n_constant_features": e.parent_record.n_constant_features, "lower_bound": e.right_child_min, "upper_bound": e.right_child_max, @@ -496,11 +491,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Push left child on stack e.builder_stack.push({ "start": e.start, - "end": split.pos, + "end": e.split.pos, "depth": e.depth + 1, "parent": e.node_id, "is_left": 1, - "impurity": split.impurity_left, + "impurity": e.split.impurity_left, "n_constant_features": e.parent_record.n_constant_features, "lower_bound": e.left_child_min, "upper_bound": e.left_child_max, @@ -546,24 +541,20 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not e.is_leaf: splitter.node_split( &e.parent_record, - e.split_ptr, + e.split, ) - # assign local copy of SplitRecord to assign - # pos, improvement, and impurity scores - split = deref(e.split_ptr) - # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 - e.is_leaf = (e.is_leaf or split.pos >= e.end or - (split.improvement + EPSILON < + e.is_leaf = (e.is_leaf or e.split.pos >= e.end or + (e.split.improvement + EPSILON < e.min_impurity_decrease)) e.node_id = e.add_or_update_node( - tree, e.parent, e.is_left, e.is_leaf, e.split_ptr, + tree, e.parent, e.is_left, e.is_leaf, e.split, e.parent_record.impurity, e.n_node_samples, - e.weighted_n_node_samples, split.missing_go_to_left + e.weighted_n_node_samples, e.split.missing_go_to_left ) if e.node_id == INTPTR_MAX: @@ -583,7 +574,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not e.is_leaf: if ( not splitter.with_monotonic_cst or - splitter.monotonic_cst[split.feature] == 0 + splitter.monotonic_cst[e.split.feature] == 0 ): # Split on a feature with no monotonicity constraint @@ -592,7 +583,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # node value clipping. 
e.left_child_min = e.right_child_min = e.parent_record.lower_bound e.left_child_max = e.right_child_max = e.parent_record.upper_bound - elif splitter.monotonic_cst[split.feature] == 1: + elif splitter.monotonic_cst[e.split.feature] == 1: # Split on a feature with monotonic increase constraint e.left_child_min = e.parent_record.lower_bound e.right_child_max = e.parent_record.upper_bound @@ -602,7 +593,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): e.middle_value = splitter.criterion.middle_value() e.right_child_min = e.middle_value e.left_child_max = e.middle_value - else: # i.e. splitter.monotonic_cst[split.feature] == -1 + else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 # Split on a feature with monotonic decrease constraint e.right_child_min = e.parent_record.lower_bound e.left_child_max = e.parent_record.upper_bound @@ -615,12 +606,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Push right child on stack e.builder_stack.push({ - "start": split.pos, + "start": e.split.pos, "end": e.end, "depth": e.depth + 1, "parent": e.node_id, "is_left": 0, - "impurity": split.impurity_right, + "impurity": e.split.impurity_right, "n_constant_features": e.parent_record.n_constant_features, "lower_bound": e.right_child_min, "upper_bound": e.right_child_max, @@ -629,11 +620,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Push left child on stack e.builder_stack.push({ "start": e.start, - "end": split.pos, + "end": e.split.pos, "depth": e.depth + 1, "parent": e.node_id, "is_left": 1, - "impurity": split.impurity_left, + "impurity": e.split.impurity_left, "n_constant_features": e.parent_record.n_constant_features, "lower_bound": e.left_child_min, "upper_bound": e.left_child_max, @@ -652,7 +643,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): tree.max_depth = e.max_depth_seen # free the memory created for the SplitRecord pointer - free(e.split_ptr) + free(e.split) if e.rc == -1: raise MemoryError() From a0176696d929268ee68db33f1a5a75016494b01d Mon Sep 17 00:00:00 2001 From: scarliles Date: Wed, 29 May 2024 13:04:23 -0400 Subject: [PATCH 25/29] build loop refactor --- sklearn/tree/_tree.pxd | 2 +- sklearn/tree/_tree.pyx | 431 +++++++++++++---------------------------- 2 files changed, 140 insertions(+), 293 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index dd0ebcd0aa251..e7627f0a9ab79 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -115,7 +115,7 @@ cdef struct BuildEnv: ParentInfo parent_record - AddOrUpdateNodeFunc add_or_update_node + bint add_or_update cdef class BaseTree: diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2dfad80df4204..18c7e06b4e6fe 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -153,44 +153,6 @@ cdef class TreeBuilder: # Depth first builder --------------------------------------------------------- - - -cdef intp_t tree_add_node( - Tree tree, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left -) except -1 nogil: - return tree._add_node( - parent, is_left, is_leaf, - split_node, impurity, - n_node_samples, weighted_n_node_samples, - missing_go_to_left - ) - -cdef intp_t tree_update_node( - Tree tree, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left -) except -1 nogil: - return 
tree._update_node( - parent, is_left, is_leaf, - split_node, impurity, - n_node_samples, weighted_n_node_samples, - missing_go_to_left - ) - cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -289,6 +251,141 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # convert dict to numpy array and store value self.initial_roots = np.array(list(false_roots.items())) + cdef intp_t _build_body(self, Tree tree, Splitter splitter, BuildEnv* e) except -1 nogil: + while not e.target_stack.empty(): + e.stack_record = e.target_stack.top() + e.target_stack.pop() + + e.start = e.stack_record.start + e.end = e.stack_record.end + e.depth = e.stack_record.depth + e.parent = e.stack_record.parent + e.is_left = e.stack_record.is_left + e.parent_record.impurity = e.stack_record.impurity + e.parent_record.n_constant_features = e.stack_record.n_constant_features + e.parent_record.lower_bound = e.stack_record.lower_bound + e.parent_record.upper_bound = e.stack_record.upper_bound + + e.n_node_samples = e.end - e.start + splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) + + e.is_leaf = (e.depth >= e.max_depth or + e.n_node_samples < e.min_samples_split or + e.n_node_samples < 2 * e.min_samples_leaf or + e.weighted_n_node_samples < 2 * e.min_weight_leaf) + + if e.first: + e.parent_record.impurity = splitter.node_impurity() + e.first = 0 + + # impurity == 0 with tolerance due to rounding errors + e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON + + if not e.is_leaf: + splitter.node_split( + &e.parent_record, + e.split, + ) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + e.is_leaf = (e.is_leaf or e.split.pos >= e.end or + (e.split.improvement + EPSILON < + e.min_impurity_decrease)) + + e.node_id = tree._add_node( + e.parent, e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + e.split.missing_go_to_left + ) if e.add_or_update else tree._update_node( + e.parent, e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + e.split.missing_go_to_left + ) + + if e.node_id == INTPTR_MAX: + e.rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + e.node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value( + tree.value + e.node_id * tree.value_stride, + e.parent_record.lower_bound, + e.parent_record.upper_bound + ) + + if not e.is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[e.split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + e.left_child_min = e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.right_child_max = e.parent_record.upper_bound + elif splitter.monotonic_cst[e.split.feature] == 1: + # Split on a feature with monotonic increase constraint + e.left_child_min = e.parent_record.lower_bound + e.right_child_max = e.parent_record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + e.middle_value = splitter.criterion.middle_value() + e.right_child_min = e.middle_value + e.left_child_max = e.middle_value + else: # i.e. 
splitter.monotonic_cst[e.split.feature] == -1 + # Split on a feature with monotonic decrease constraint + e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.parent_record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. + e.middle_value = splitter.criterion.middle_value() + e.left_child_min = e.middle_value + e.right_child_max = e.middle_value + + # Push right child on stack + e.builder_stack.push({ + "start": e.split.pos, + "end": e.end, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 0, + "impurity": e.split.impurity_right, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.right_child_min, + "upper_bound": e.right_child_max, + }) + + # Push left child on stack + e.builder_stack.push({ + "start": e.start, + "end": e.split.pos, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 1, + "impurity": e.split.impurity_left, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.left_child_min, + "upper_bound": e.left_child_max, + }) + elif e.store_leaf_values and e.is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[e.node_id]) + + if e.depth > e.max_depth_seen: + e.max_depth_seen = e.depth + + return 0 + + cpdef build( self, Tree tree, @@ -379,262 +476,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): with nogil: e.target_stack = &e.update_stack - e.add_or_update_node = tree_update_node - while not e.target_stack.empty(): - e.stack_record = e.target_stack.top() - e.target_stack.pop() - - e.start = e.stack_record.start - e.end = e.stack_record.end - e.depth = e.stack_record.depth - e.parent = e.stack_record.parent - e.is_left = e.stack_record.is_left - e.parent_record.impurity = e.stack_record.impurity - e.parent_record.n_constant_features = e.stack_record.n_constant_features - e.parent_record.lower_bound = e.stack_record.lower_bound - e.parent_record.upper_bound = e.stack_record.upper_bound - - e.n_node_samples = e.end - e.start - splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) - - e.is_leaf = (e.depth >= e.max_depth or - e.n_node_samples < e.min_samples_split or - e.n_node_samples < 2 * e.min_samples_leaf or - e.weighted_n_node_samples < 2 * e.min_weight_leaf) - - if e.first: - e.parent_record.impurity = splitter.node_impurity() - e.first = 0 - - # impurity == 0 with tolerance due to rounding errors - e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON - - if not e.is_leaf: - splitter.node_split( - &e.parent_record, - e.split, - ) - - # If EPSILON=0 in the below comparison, float precision - # issues stop splitting, producing trees that are - # dissimilar to v0.18 - e.is_leaf = (e.is_leaf or e.split.pos >= e.end or - (e.split.improvement + EPSILON < - e.min_impurity_decrease)) - - e.node_id = e.add_or_update_node( - tree, e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, - e.split.missing_go_to_left - ) - - if e.node_id == INTPTR_MAX: - e.rc = -1 - break - - # Store value for all nodes, to facilitate tree/model - # inspection and interpretation - splitter.node_value(tree.value + e.node_id * tree.value_stride) - if splitter.with_monotonic_cst: - splitter.clip_node_value( - tree.value + e.node_id * tree.value_stride, - e.parent_record.lower_bound, - e.parent_record.upper_bound - ) - - if not e.is_leaf: - if ( - not splitter.with_monotonic_cst or - splitter.monotonic_cst[e.split.feature] == 0 - ): - # Split on a 
feature with no monotonicity constraint - - # Current bounds must always be propagated to both children. - # If a monotonic constraint is active, bounds are used in - # node value clipping. - e.left_child_min = e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.right_child_max = e.parent_record.upper_bound - elif splitter.monotonic_cst[e.split.feature] == 1: - # Split on a feature with monotonic increase constraint - e.left_child_min = e.parent_record.lower_bound - e.right_child_max = e.parent_record.upper_bound - - # Lower bound for right child and upper bound for left child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.right_child_min = e.middle_value - e.left_child_max = e.middle_value - else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 - # Split on a feature with monotonic decrease constraint - e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.parent_record.upper_bound - - # Lower bound for left child and upper bound for right child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.left_child_min = e.middle_value - e.right_child_max = e.middle_value - - # Push right child on stack - e.builder_stack.push({ - "start": e.split.pos, - "end": e.end, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 0, - "impurity": e.split.impurity_right, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.right_child_min, - "upper_bound": e.right_child_max, - }) - - # Push left child on stack - e.builder_stack.push({ - "start": e.start, - "end": e.split.pos, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 1, - "impurity": e.split.impurity_left, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.left_child_min, - "upper_bound": e.left_child_max, - }) - elif e.store_leaf_values and e.is_leaf: - # copy leaf values to leaf_values array - splitter.node_samples(tree.value_samples[e.node_id]) - - if e.depth > e.max_depth_seen: - e.max_depth_seen = e.depth + e.add_or_update = 0 + self._build_body(tree, splitter, &e) e.target_stack = &e.builder_stack - e.add_or_update_node = tree_add_node - while not e.target_stack.empty(): - e.stack_record = e.target_stack.top() - e.target_stack.pop() - - e.start = e.stack_record.start - e.end = e.stack_record.end - e.depth = e.stack_record.depth - e.parent = e.stack_record.parent - e.is_left = e.stack_record.is_left - e.parent_record.impurity = e.stack_record.impurity - e.parent_record.n_constant_features = e.stack_record.n_constant_features - e.parent_record.lower_bound = e.stack_record.lower_bound - e.parent_record.upper_bound = e.stack_record.upper_bound - - e.n_node_samples = e.end - e.start - splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) - - e.is_leaf = (e.depth >= e.max_depth or - e.n_node_samples < e.min_samples_split or - e.n_node_samples < 2 * e.min_samples_leaf or - e.weighted_n_node_samples < 2 * e.min_weight_leaf) - - if e.first: - e.parent_record.impurity = splitter.node_impurity() - e.first=0 - - # impurity == 0 with tolerance due to rounding errors - e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON - - if not e.is_leaf: - splitter.node_split( - &e.parent_record, - e.split, - ) - - # If EPSILON=0 in the below comparison, float precision - # issues stop splitting, producing trees that are - # dissimilar to v0.18 - e.is_leaf = (e.is_leaf or e.split.pos >= e.end or - (e.split.improvement + EPSILON < - e.min_impurity_decrease)) - - 
e.node_id = e.add_or_update_node( - tree, e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, - e.weighted_n_node_samples, e.split.missing_go_to_left - ) - - if e.node_id == INTPTR_MAX: - e.rc = -1 - break - - # Store value for all nodes, to facilitate tree/model - # inspection and interpretation - splitter.node_value(tree.value + e.node_id * tree.value_stride) - if splitter.with_monotonic_cst: - splitter.clip_node_value( - tree.value + e.node_id * tree.value_stride, - e.parent_record.lower_bound, - e.parent_record.upper_bound - ) - - if not e.is_leaf: - if ( - not splitter.with_monotonic_cst or - splitter.monotonic_cst[e.split.feature] == 0 - ): - # Split on a feature with no monotonicity constraint - - # Current bounds must always be propagated to both children. - # If a monotonic constraint is active, bounds are used in - # node value clipping. - e.left_child_min = e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.right_child_max = e.parent_record.upper_bound - elif splitter.monotonic_cst[e.split.feature] == 1: - # Split on a feature with monotonic increase constraint - e.left_child_min = e.parent_record.lower_bound - e.right_child_max = e.parent_record.upper_bound - - # Lower bound for right child and upper bound for left child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.right_child_min = e.middle_value - e.left_child_max = e.middle_value - else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 - # Split on a feature with monotonic decrease constraint - e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.parent_record.upper_bound - - # Lower bound for left child and upper bound for right child - # are set to the same value. 
- e.middle_value = splitter.criterion.middle_value() - e.left_child_min = e.middle_value - e.right_child_max = e.middle_value - - # Push right child on stack - e.builder_stack.push({ - "start": e.split.pos, - "end": e.end, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 0, - "impurity": e.split.impurity_right, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.right_child_min, - "upper_bound": e.right_child_max, - }) - - # Push left child on stack - e.builder_stack.push({ - "start": e.start, - "end": e.split.pos, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 1, - "impurity": e.split.impurity_left, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.left_child_min, - "upper_bound": e.left_child_max, - }) - elif e.store_leaf_values and e.is_leaf: - # copy leaf values to leaf_values array - splitter.node_samples(tree.value_samples[e.node_id]) - - if e.depth > e.max_depth_seen: - e.max_depth_seen = e.depth + e.add_or_update = 1 + self._build_body(tree, splitter, &e) if e.rc >= 0: e.rc = tree._resize_c(tree.node_count) From 4325b0a101ea34c8193e21d003ee381fa9695b70 Mon Sep 17 00:00:00 2001 From: scarliles Date: Wed, 29 May 2024 13:43:46 -0400 Subject: [PATCH 26/29] add_or_update tweak --- sklearn/tree/_tree.pyx | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 18c7e06b4e6fe..ee0d979aad858 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -294,15 +294,18 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (e.split.improvement + EPSILON < e.min_impurity_decrease)) - e.node_id = tree._add_node( - e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, - e.split.missing_go_to_left - ) if e.add_or_update else tree._update_node( - e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, - e.split.missing_go_to_left - ) + if e.add_or_update: + e.node_id = tree._add_node( + e.parent, e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + e.split.missing_go_to_left + ) + else: + e.node_id = tree._update_node( + e.parent, e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + e.split.missing_go_to_left + ) if e.node_id == INTPTR_MAX: e.rc = -1 From 78c3a1b8352ab901cb07dcba0e6795103b3ced67 Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 30 May 2024 10:18:12 -0400 Subject: [PATCH 27/29] reverted to back out build body refactor --- sklearn/tree/_tree.pxd | 2 +- sklearn/tree/_tree.pyx | 434 +++++++++++++++++++++++++++-------------- 2 files changed, 293 insertions(+), 143 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index e7627f0a9ab79..dd0ebcd0aa251 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -115,7 +115,7 @@ cdef struct BuildEnv: ParentInfo parent_record - bint add_or_update + AddOrUpdateNodeFunc add_or_update_node cdef class BaseTree: diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ee0d979aad858..2dfad80df4204 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -153,6 +153,44 @@ cdef class TreeBuilder: # Depth first builder --------------------------------------------------------- + + +cdef intp_t tree_add_node( + Tree tree, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t 
impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left +) except -1 nogil: + return tree._add_node( + parent, is_left, is_leaf, + split_node, impurity, + n_node_samples, weighted_n_node_samples, + missing_go_to_left + ) + +cdef intp_t tree_update_node( + Tree tree, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left +) except -1 nogil: + return tree._update_node( + parent, is_left, is_leaf, + split_node, impurity, + n_node_samples, weighted_n_node_samples, + missing_go_to_left + ) + cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -251,144 +289,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # convert dict to numpy array and store value self.initial_roots = np.array(list(false_roots.items())) - cdef intp_t _build_body(self, Tree tree, Splitter splitter, BuildEnv* e) except -1 nogil: - while not e.target_stack.empty(): - e.stack_record = e.target_stack.top() - e.target_stack.pop() - - e.start = e.stack_record.start - e.end = e.stack_record.end - e.depth = e.stack_record.depth - e.parent = e.stack_record.parent - e.is_left = e.stack_record.is_left - e.parent_record.impurity = e.stack_record.impurity - e.parent_record.n_constant_features = e.stack_record.n_constant_features - e.parent_record.lower_bound = e.stack_record.lower_bound - e.parent_record.upper_bound = e.stack_record.upper_bound - - e.n_node_samples = e.end - e.start - splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) - - e.is_leaf = (e.depth >= e.max_depth or - e.n_node_samples < e.min_samples_split or - e.n_node_samples < 2 * e.min_samples_leaf or - e.weighted_n_node_samples < 2 * e.min_weight_leaf) - - if e.first: - e.parent_record.impurity = splitter.node_impurity() - e.first = 0 - - # impurity == 0 with tolerance due to rounding errors - e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON - - if not e.is_leaf: - splitter.node_split( - &e.parent_record, - e.split, - ) - - # If EPSILON=0 in the below comparison, float precision - # issues stop splitting, producing trees that are - # dissimilar to v0.18 - e.is_leaf = (e.is_leaf or e.split.pos >= e.end or - (e.split.improvement + EPSILON < - e.min_impurity_decrease)) - - if e.add_or_update: - e.node_id = tree._add_node( - e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, - e.split.missing_go_to_left - ) - else: - e.node_id = tree._update_node( - e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, - e.split.missing_go_to_left - ) - - if e.node_id == INTPTR_MAX: - e.rc = -1 - break - - # Store value for all nodes, to facilitate tree/model - # inspection and interpretation - splitter.node_value(tree.value + e.node_id * tree.value_stride) - if splitter.with_monotonic_cst: - splitter.clip_node_value( - tree.value + e.node_id * tree.value_stride, - e.parent_record.lower_bound, - e.parent_record.upper_bound - ) - - if not e.is_leaf: - if ( - not splitter.with_monotonic_cst or - splitter.monotonic_cst[e.split.feature] == 0 - ): - # Split on a feature with no monotonicity constraint - - # Current bounds must always be propagated to both children. - # If a monotonic constraint is active, bounds are used in - # node value clipping. 
- e.left_child_min = e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.right_child_max = e.parent_record.upper_bound - elif splitter.monotonic_cst[e.split.feature] == 1: - # Split on a feature with monotonic increase constraint - e.left_child_min = e.parent_record.lower_bound - e.right_child_max = e.parent_record.upper_bound - - # Lower bound for right child and upper bound for left child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.right_child_min = e.middle_value - e.left_child_max = e.middle_value - else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 - # Split on a feature with monotonic decrease constraint - e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.parent_record.upper_bound - - # Lower bound for left child and upper bound for right child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.left_child_min = e.middle_value - e.right_child_max = e.middle_value - - # Push right child on stack - e.builder_stack.push({ - "start": e.split.pos, - "end": e.end, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 0, - "impurity": e.split.impurity_right, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.right_child_min, - "upper_bound": e.right_child_max, - }) - - # Push left child on stack - e.builder_stack.push({ - "start": e.start, - "end": e.split.pos, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 1, - "impurity": e.split.impurity_left, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.left_child_min, - "upper_bound": e.left_child_max, - }) - elif e.store_leaf_values and e.is_leaf: - # copy leaf values to leaf_values array - splitter.node_samples(tree.value_samples[e.node_id]) - - if e.depth > e.max_depth_seen: - e.max_depth_seen = e.depth - - return 0 - - cpdef build( self, Tree tree, @@ -479,12 +379,262 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): with nogil: e.target_stack = &e.update_stack - e.add_or_update = 0 - self._build_body(tree, splitter, &e) + e.add_or_update_node = tree_update_node + while not e.target_stack.empty(): + e.stack_record = e.target_stack.top() + e.target_stack.pop() + + e.start = e.stack_record.start + e.end = e.stack_record.end + e.depth = e.stack_record.depth + e.parent = e.stack_record.parent + e.is_left = e.stack_record.is_left + e.parent_record.impurity = e.stack_record.impurity + e.parent_record.n_constant_features = e.stack_record.n_constant_features + e.parent_record.lower_bound = e.stack_record.lower_bound + e.parent_record.upper_bound = e.stack_record.upper_bound + + e.n_node_samples = e.end - e.start + splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) + + e.is_leaf = (e.depth >= e.max_depth or + e.n_node_samples < e.min_samples_split or + e.n_node_samples < 2 * e.min_samples_leaf or + e.weighted_n_node_samples < 2 * e.min_weight_leaf) + + if e.first: + e.parent_record.impurity = splitter.node_impurity() + e.first = 0 + + # impurity == 0 with tolerance due to rounding errors + e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON + + if not e.is_leaf: + splitter.node_split( + &e.parent_record, + e.split, + ) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + e.is_leaf = (e.is_leaf or e.split.pos >= e.end or + (e.split.improvement + EPSILON < + e.min_impurity_decrease)) + + e.node_id = e.add_or_update_node( + tree, e.parent, 
e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + e.split.missing_go_to_left + ) + + if e.node_id == INTPTR_MAX: + e.rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + e.node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value( + tree.value + e.node_id * tree.value_stride, + e.parent_record.lower_bound, + e.parent_record.upper_bound + ) + + if not e.is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[e.split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + e.left_child_min = e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.right_child_max = e.parent_record.upper_bound + elif splitter.monotonic_cst[e.split.feature] == 1: + # Split on a feature with monotonic increase constraint + e.left_child_min = e.parent_record.lower_bound + e.right_child_max = e.parent_record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + e.middle_value = splitter.criterion.middle_value() + e.right_child_min = e.middle_value + e.left_child_max = e.middle_value + else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 + # Split on a feature with monotonic decrease constraint + e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.parent_record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. + e.middle_value = splitter.criterion.middle_value() + e.left_child_min = e.middle_value + e.right_child_max = e.middle_value + + # Push right child on stack + e.builder_stack.push({ + "start": e.split.pos, + "end": e.end, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 0, + "impurity": e.split.impurity_right, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.right_child_min, + "upper_bound": e.right_child_max, + }) + + # Push left child on stack + e.builder_stack.push({ + "start": e.start, + "end": e.split.pos, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 1, + "impurity": e.split.impurity_left, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.left_child_min, + "upper_bound": e.left_child_max, + }) + elif e.store_leaf_values and e.is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[e.node_id]) + + if e.depth > e.max_depth_seen: + e.max_depth_seen = e.depth e.target_stack = &e.builder_stack - e.add_or_update = 1 - self._build_body(tree, splitter, &e) + e.add_or_update_node = tree_add_node + while not e.target_stack.empty(): + e.stack_record = e.target_stack.top() + e.target_stack.pop() + + e.start = e.stack_record.start + e.end = e.stack_record.end + e.depth = e.stack_record.depth + e.parent = e.stack_record.parent + e.is_left = e.stack_record.is_left + e.parent_record.impurity = e.stack_record.impurity + e.parent_record.n_constant_features = e.stack_record.n_constant_features + e.parent_record.lower_bound = e.stack_record.lower_bound + e.parent_record.upper_bound = e.stack_record.upper_bound + + e.n_node_samples = e.end - e.start + splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) + + e.is_leaf = (e.depth >= e.max_depth or + e.n_node_samples < 
e.min_samples_split or + e.n_node_samples < 2 * e.min_samples_leaf or + e.weighted_n_node_samples < 2 * e.min_weight_leaf) + + if e.first: + e.parent_record.impurity = splitter.node_impurity() + e.first=0 + + # impurity == 0 with tolerance due to rounding errors + e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON + + if not e.is_leaf: + splitter.node_split( + &e.parent_record, + e.split, + ) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + e.is_leaf = (e.is_leaf or e.split.pos >= e.end or + (e.split.improvement + EPSILON < + e.min_impurity_decrease)) + + e.node_id = e.add_or_update_node( + tree, e.parent, e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, + e.weighted_n_node_samples, e.split.missing_go_to_left + ) + + if e.node_id == INTPTR_MAX: + e.rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + e.node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value( + tree.value + e.node_id * tree.value_stride, + e.parent_record.lower_bound, + e.parent_record.upper_bound + ) + + if not e.is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[e.split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + e.left_child_min = e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.right_child_max = e.parent_record.upper_bound + elif splitter.monotonic_cst[e.split.feature] == 1: + # Split on a feature with monotonic increase constraint + e.left_child_min = e.parent_record.lower_bound + e.right_child_max = e.parent_record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + e.middle_value = splitter.criterion.middle_value() + e.right_child_min = e.middle_value + e.left_child_max = e.middle_value + else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 + # Split on a feature with monotonic decrease constraint + e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.parent_record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
+ e.middle_value = splitter.criterion.middle_value() + e.left_child_min = e.middle_value + e.right_child_max = e.middle_value + + # Push right child on stack + e.builder_stack.push({ + "start": e.split.pos, + "end": e.end, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 0, + "impurity": e.split.impurity_right, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.right_child_min, + "upper_bound": e.right_child_max, + }) + + # Push left child on stack + e.builder_stack.push({ + "start": e.start, + "end": e.split.pos, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 1, + "impurity": e.split.impurity_left, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.left_child_min, + "upper_bound": e.left_child_max, + }) + elif e.store_leaf_values and e.is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[e.node_id]) + + if e.depth > e.max_depth_seen: + e.max_depth_seen = e.depth if e.rc >= 0: e.rc = tree._resize_c(tree.node_count) From b8cc636565f14dcbcf4ad912cc1336db25638e30 Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 30 May 2024 11:22:37 -0400 Subject: [PATCH 28/29] refactor baby step --- sklearn/tree/_tree.pxd | 14 -- sklearn/tree/_tree.pyx | 306 +++++++++++++++++++---------------------- 2 files changed, 138 insertions(+), 182 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index dd0ebcd0aa251..930a21ad05783 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -43,18 +43,6 @@ cdef struct ParentInfo: float64_t impurity # the impurity of the parent intp_t n_constant_features # the number of constant features found in parent -ctypedef intp_t (*AddOrUpdateNodeFunc)( - Tree tree, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left -) except -1 nogil - # A record on the stack for depth-first tree growing cdef struct StackRecord: intp_t start @@ -114,8 +102,6 @@ cdef struct BuildEnv: StackRecord stack_record ParentInfo parent_record - - AddOrUpdateNodeFunc add_or_update_node cdef class BaseTree: diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2dfad80df4204..5dff8ed049921 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -153,44 +153,6 @@ cdef class TreeBuilder: # Depth first builder --------------------------------------------------------- - - -cdef intp_t tree_add_node( - Tree tree, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left -) except -1 nogil: - return tree._add_node( - parent, is_left, is_leaf, - split_node, impurity, - n_node_samples, weighted_n_node_samples, - missing_go_to_left - ) - -cdef intp_t tree_update_node( - Tree tree, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left -) except -1 nogil: - return tree._update_node( - parent, is_left, is_leaf, - split_node, impurity, - n_node_samples, weighted_n_node_samples, - missing_go_to_left - ) - cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -289,6 +251,141 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # convert dict to numpy array and store value 
self.initial_roots = np.array(list(false_roots.items())) + cdef void _build_body(self, Tree tree, Splitter splitter, BuildEnv* e, bint update) noexcept nogil: + while not e.target_stack.empty(): + e.stack_record = e.target_stack.top() + e.target_stack.pop() + + e.start = e.stack_record.start + e.end = e.stack_record.end + e.depth = e.stack_record.depth + e.parent = e.stack_record.parent + e.is_left = e.stack_record.is_left + e.parent_record.impurity = e.stack_record.impurity + e.parent_record.n_constant_features = e.stack_record.n_constant_features + e.parent_record.lower_bound = e.stack_record.lower_bound + e.parent_record.upper_bound = e.stack_record.upper_bound + + e.n_node_samples = e.end - e.start + splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) + + e.is_leaf = (e.depth >= e.max_depth or + e.n_node_samples < e.min_samples_split or + e.n_node_samples < 2 * e.min_samples_leaf or + e.weighted_n_node_samples < 2 * e.min_weight_leaf) + + if e.first: + e.parent_record.impurity = splitter.node_impurity() + e.first = 0 + + # impurity == 0 with tolerance due to rounding errors + e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON + + if not e.is_leaf: + splitter.node_split( + &e.parent_record, + e.split, + ) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + e.is_leaf = (e.is_leaf or e.split.pos >= e.end or + (e.split.improvement + EPSILON < + e.min_impurity_decrease)) + + if update == 1: + e.node_id = tree._update_node( + e.parent, e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + e.split.missing_go_to_left + ) + else: + e.node_id = tree._add_node( + e.parent, e.is_left, e.is_leaf, e.split, + e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, + e.split.missing_go_to_left + ) + + if e.node_id == INTPTR_MAX: + e.rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + e.node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value( + tree.value + e.node_id * tree.value_stride, + e.parent_record.lower_bound, + e.parent_record.upper_bound + ) + + if not e.is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[e.split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + e.left_child_min = e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.right_child_max = e.parent_record.upper_bound + elif splitter.monotonic_cst[e.split.feature] == 1: + # Split on a feature with monotonic increase constraint + e.left_child_min = e.parent_record.lower_bound + e.right_child_max = e.parent_record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + e.middle_value = splitter.criterion.middle_value() + e.right_child_min = e.middle_value + e.left_child_max = e.middle_value + else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 + # Split on a feature with monotonic decrease constraint + e.right_child_min = e.parent_record.lower_bound + e.left_child_max = e.parent_record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
+ e.middle_value = splitter.criterion.middle_value() + e.left_child_min = e.middle_value + e.right_child_max = e.middle_value + + # Push right child on stack + e.builder_stack.push({ + "start": e.split.pos, + "end": e.end, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 0, + "impurity": e.split.impurity_right, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.right_child_min, + "upper_bound": e.right_child_max, + }) + + # Push left child on stack + e.builder_stack.push({ + "start": e.start, + "end": e.split.pos, + "depth": e.depth + 1, + "parent": e.node_id, + "is_left": 1, + "impurity": e.split.impurity_left, + "n_constant_features": e.parent_record.n_constant_features, + "lower_bound": e.left_child_min, + "upper_bound": e.left_child_max, + }) + elif e.store_leaf_values and e.is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[e.node_id]) + + if e.depth > e.max_depth_seen: + e.max_depth_seen = e.depth + cpdef build( self, Tree tree, @@ -379,136 +476,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): with nogil: e.target_stack = &e.update_stack - e.add_or_update_node = tree_update_node - while not e.target_stack.empty(): - e.stack_record = e.target_stack.top() - e.target_stack.pop() - - e.start = e.stack_record.start - e.end = e.stack_record.end - e.depth = e.stack_record.depth - e.parent = e.stack_record.parent - e.is_left = e.stack_record.is_left - e.parent_record.impurity = e.stack_record.impurity - e.parent_record.n_constant_features = e.stack_record.n_constant_features - e.parent_record.lower_bound = e.stack_record.lower_bound - e.parent_record.upper_bound = e.stack_record.upper_bound - - e.n_node_samples = e.end - e.start - splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) - - e.is_leaf = (e.depth >= e.max_depth or - e.n_node_samples < e.min_samples_split or - e.n_node_samples < 2 * e.min_samples_leaf or - e.weighted_n_node_samples < 2 * e.min_weight_leaf) - - if e.first: - e.parent_record.impurity = splitter.node_impurity() - e.first = 0 - - # impurity == 0 with tolerance due to rounding errors - e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON - - if not e.is_leaf: - splitter.node_split( - &e.parent_record, - e.split, - ) - - # If EPSILON=0 in the below comparison, float precision - # issues stop splitting, producing trees that are - # dissimilar to v0.18 - e.is_leaf = (e.is_leaf or e.split.pos >= e.end or - (e.split.improvement + EPSILON < - e.min_impurity_decrease)) - - e.node_id = e.add_or_update_node( - tree, e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, - e.split.missing_go_to_left - ) - - if e.node_id == INTPTR_MAX: - e.rc = -1 - break - - # Store value for all nodes, to facilitate tree/model - # inspection and interpretation - splitter.node_value(tree.value + e.node_id * tree.value_stride) - if splitter.with_monotonic_cst: - splitter.clip_node_value( - tree.value + e.node_id * tree.value_stride, - e.parent_record.lower_bound, - e.parent_record.upper_bound - ) - - if not e.is_leaf: - if ( - not splitter.with_monotonic_cst or - splitter.monotonic_cst[e.split.feature] == 0 - ): - # Split on a feature with no monotonicity constraint - - # Current bounds must always be propagated to both children. - # If a monotonic constraint is active, bounds are used in - # node value clipping. 
- e.left_child_min = e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.right_child_max = e.parent_record.upper_bound - elif splitter.monotonic_cst[e.split.feature] == 1: - # Split on a feature with monotonic increase constraint - e.left_child_min = e.parent_record.lower_bound - e.right_child_max = e.parent_record.upper_bound - - # Lower bound for right child and upper bound for left child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.right_child_min = e.middle_value - e.left_child_max = e.middle_value - else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 - # Split on a feature with monotonic decrease constraint - e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.parent_record.upper_bound - - # Lower bound for left child and upper bound for right child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.left_child_min = e.middle_value - e.right_child_max = e.middle_value - - # Push right child on stack - e.builder_stack.push({ - "start": e.split.pos, - "end": e.end, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 0, - "impurity": e.split.impurity_right, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.right_child_min, - "upper_bound": e.right_child_max, - }) - - # Push left child on stack - e.builder_stack.push({ - "start": e.start, - "end": e.split.pos, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 1, - "impurity": e.split.impurity_left, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.left_child_min, - "upper_bound": e.left_child_max, - }) - elif e.store_leaf_values and e.is_leaf: - # copy leaf values to leaf_values array - splitter.node_samples(tree.value_samples[e.node_id]) - - if e.depth > e.max_depth_seen: - e.max_depth_seen = e.depth + self._build_body(tree, splitter, &e, 1) e.target_stack = &e.builder_stack - e.add_or_update_node = tree_add_node while not e.target_stack.empty(): e.stack_record = e.target_stack.top() e.target_stack.pop() @@ -551,8 +521,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (e.split.improvement + EPSILON < e.min_impurity_decrease)) - e.node_id = e.add_or_update_node( - tree, e.parent, e.is_left, e.is_leaf, e.split, + e.node_id = tree._add_node( + e.parent, e.is_left, e.is_leaf, e.split, e.parent_record.impurity, e.n_node_samples, e.weighted_n_node_samples, e.split.missing_go_to_left ) From f2256580d2482e607f40a938f3569f20cec95e95 Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 30 May 2024 11:53:46 -0400 Subject: [PATCH 29/29] update node refactor more baby steps --- sklearn/tree/_tree.pyx | 127 +---------------------------------------- 1 file changed, 1 insertion(+), 126 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 5dff8ed049921..6e5ad54848b3c 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -479,132 +479,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): self._build_body(tree, splitter, &e, 1) e.target_stack = &e.builder_stack - while not e.target_stack.empty(): - e.stack_record = e.target_stack.top() - e.target_stack.pop() - - e.start = e.stack_record.start - e.end = e.stack_record.end - e.depth = e.stack_record.depth - e.parent = e.stack_record.parent - e.is_left = e.stack_record.is_left - e.parent_record.impurity = e.stack_record.impurity - e.parent_record.n_constant_features = e.stack_record.n_constant_features - e.parent_record.lower_bound = e.stack_record.lower_bound - 
e.parent_record.upper_bound = e.stack_record.upper_bound - - e.n_node_samples = e.end - e.start - splitter.node_reset(e.start, e.end, &e.weighted_n_node_samples) - - e.is_leaf = (e.depth >= e.max_depth or - e.n_node_samples < e.min_samples_split or - e.n_node_samples < 2 * e.min_samples_leaf or - e.weighted_n_node_samples < 2 * e.min_weight_leaf) - - if e.first: - e.parent_record.impurity = splitter.node_impurity() - e.first=0 - - # impurity == 0 with tolerance due to rounding errors - e.is_leaf = e.is_leaf or e.parent_record.impurity <= EPSILON - - if not e.is_leaf: - splitter.node_split( - &e.parent_record, - e.split, - ) - - # If EPSILON=0 in the below comparison, float precision - # issues stop splitting, producing trees that are - # dissimilar to v0.18 - e.is_leaf = (e.is_leaf or e.split.pos >= e.end or - (e.split.improvement + EPSILON < - e.min_impurity_decrease)) - - e.node_id = tree._add_node( - e.parent, e.is_left, e.is_leaf, e.split, - e.parent_record.impurity, e.n_node_samples, - e.weighted_n_node_samples, e.split.missing_go_to_left - ) - - if e.node_id == INTPTR_MAX: - e.rc = -1 - break - - # Store value for all nodes, to facilitate tree/model - # inspection and interpretation - splitter.node_value(tree.value + e.node_id * tree.value_stride) - if splitter.with_monotonic_cst: - splitter.clip_node_value( - tree.value + e.node_id * tree.value_stride, - e.parent_record.lower_bound, - e.parent_record.upper_bound - ) - - if not e.is_leaf: - if ( - not splitter.with_monotonic_cst or - splitter.monotonic_cst[e.split.feature] == 0 - ): - # Split on a feature with no monotonicity constraint - - # Current bounds must always be propagated to both children. - # If a monotonic constraint is active, bounds are used in - # node value clipping. - e.left_child_min = e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.right_child_max = e.parent_record.upper_bound - elif splitter.monotonic_cst[e.split.feature] == 1: - # Split on a feature with monotonic increase constraint - e.left_child_min = e.parent_record.lower_bound - e.right_child_max = e.parent_record.upper_bound - - # Lower bound for right child and upper bound for left child - # are set to the same value. - e.middle_value = splitter.criterion.middle_value() - e.right_child_min = e.middle_value - e.left_child_max = e.middle_value - else: # i.e. splitter.monotonic_cst[e.split.feature] == -1 - # Split on a feature with monotonic decrease constraint - e.right_child_min = e.parent_record.lower_bound - e.left_child_max = e.parent_record.upper_bound - - # Lower bound for left child and upper bound for right child - # are set to the same value. 
- e.middle_value = splitter.criterion.middle_value() - e.left_child_min = e.middle_value - e.right_child_max = e.middle_value - - # Push right child on stack - e.builder_stack.push({ - "start": e.split.pos, - "end": e.end, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 0, - "impurity": e.split.impurity_right, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.right_child_min, - "upper_bound": e.right_child_max, - }) - - # Push left child on stack - e.builder_stack.push({ - "start": e.start, - "end": e.split.pos, - "depth": e.depth + 1, - "parent": e.node_id, - "is_left": 1, - "impurity": e.split.impurity_left, - "n_constant_features": e.parent_record.n_constant_features, - "lower_bound": e.left_child_min, - "upper_bound": e.left_child_max, - }) - elif e.store_leaf_values and e.is_leaf: - # copy leaf values to leaf_values array - splitter.node_samples(tree.value_samples[e.node_id]) - - if e.depth > e.max_depth_seen: - e.max_depth_seen = e.depth + self._build_body(tree, splitter, &e, 0) if e.rc >= 0: e.rc = tree._resize_c(tree.node_count)
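Aside from the add/update call itself, the bulk of each deleted loop is the monotonicity bookkeeping, which appears three times per loop (no constraint, increasing constraint, decreasing constraint). Its branch logic reduces to a small decision table. The sketch below is plain Python for illustration only, not the Cython in this patch: child_bounds is a hypothetical helper, and the middle argument stands in for the splitter.criterion.middle_value() call used above.

    def child_bounds(cst, lower, upper, middle):
        """Return the (left, right) child bounds for one split.

        cst is the monotonicity constraint on the split feature:
        0 = unconstrained, +1 = monotonic increase, -1 = monotonic decrease.
        lower/upper are the parent's bounds; middle stands in for
        criterion.middle_value() in the patch.
        """
        if cst == 0:
            # No constraint: parent bounds propagate to both children,
            # since they may still be used for node value clipping.
            return (lower, upper), (lower, upper)
        if cst == 1:
            # Increasing: left child capped at middle, right child floored at it.
            return (lower, middle), (middle, upper)
        # Decreasing (cst == -1): mirror image of the increasing case.
        return (middle, upper), (lower, middle)

    left, right = child_bounds(+1, 0.0, 1.0, middle=0.4)
    # left == (0.0, 0.4), right == (0.4, 1.0)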
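The two _build_body call sites also make the overall control flow easier to see: pass 1 drains e.update_stack with the flag set to 1, pass 2 drains e.builder_stack with the flag set to 0, and in both passes child records are pushed onto the builder stack, so subtrees reached while updating are grown fresh in the second pass. Below is a runnable Python sketch of that control flow only; the tuple records, stopping rule, and split position are toy stand-ins for the real splitter calls, EPSILON tolerance, and min_samples_*/min_weight_* checks.

    def _build_body(target_stack, builder_stack, nodes, update):
        # Shared loop body: drain target_stack, recording one node per entry.
        # As in the patch, children always go onto builder_stack, so nodes
        # discovered during the update pass are added during the build pass.
        while target_stack:
            start, end, depth = target_stack.pop()
            # Toy stopping rule; the real code checks max_depth,
            # min_samples_split/leaf, min_weight_leaf, and impurity <= EPSILON.
            is_leaf = (end - start) <= 1 or depth >= 3
            nodes.append(("update" if update else "add", start, end, depth))
            if not is_leaf:
                pos = (start + end) // 2                   # toy split position
                builder_stack.append((pos, end, depth + 1))    # right child
                builder_stack.append((start, pos, depth + 1))  # left child

    nodes, update_stack, builder_stack = [], [(0, 8, 0)], []
    _build_body(update_stack, builder_stack, nodes, update=True)    # the `1` call
    _build_body(builder_stack, builder_stack, nodes, update=False)  # the `0` call
    print(nodes)

This is the same add-versus-update switch that the now-removed e.add_or_update_node function pointer (tree_update_node vs. tree_add_node) used to carry; this patch finishes the deduplication by folding both ~120-line loops into _build_body and passing the choice as a flag.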