Skip to content

Commit

Permalink
preprocessing fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
perib committed Nov 16, 2023
1 parent f298771 commit a0095e5
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 32 deletions.
44 changes: 28 additions & 16 deletions tpot2/tpot_estimator/estimator.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -619,23 +619,35 @@ def fit(self, X, y):
if self.preprocessing:
#X = pd.DataFrame(X)

#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
if isinstance(X, pd.DataFrame): #pandas dataframe
if self.categorical_features is not None:
X[self.categorical_features] = X[self.categorical_features].astype(object)
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'), #impute categorical columns
tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'), #impute numeric columns
tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001)) #one hot encode categorical columns
X = self._preprocessing_pipeline.fit_transform(X)
else:
if self.categorical_features is not None: #numpy array and categorical columns specified
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent'), #impute categorical columns
tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns
tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns
else: #numpy array and no categorical columns specified, just do imputation
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'))

if not isinstance(self.preprocessing, bool) and isinstance(self.preprocessing, sklearn.base.BaseEstimator):
self._preprocessing_pipeline = self.preprocessing

#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
else: #if self.preprocessing is True or not a sklearn estimator

pipeline_steps = []

if self.categorical_features is not None: #if categorical features are specified, use those
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))

else:
if isinstance(X, pd.DataFrame):
categorical_columns = X.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
else:
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
else:
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))

self._preprocessing_pipeline = sklearn.pipeline.Pipeline(pipeline_steps)

X = self._preprocessing_pipeline.fit_transform(X, y)

else:
self._preprocessing_pipeline = None

Expand Down
44 changes: 28 additions & 16 deletions tpot2/tpot_estimator/steady_state_estimator.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -605,23 +605,35 @@ def fit(self, X, y):
if self.preprocessing:
#X = pd.DataFrame(X)

#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
if isinstance(X, pd.DataFrame): #pandas dataframe
if self.categorical_features is not None:
X[self.categorical_features] = X[self.categorical_features].astype(object)
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'), #impute categorical columns
tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'), #impute numeric columns
tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001)) #one hot encode categorical columns
X = self._preprocessing_pipeline.fit_transform(X)
else:
if self.categorical_features is not None: #numpy array and categorical columns specified
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent'), #impute categorical columns
tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns
tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns
else: #numpy array and no categorical columns specified, just do imputation
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'))

if not isinstance(self.preprocessing, bool) and isinstance(self.preprocessing, sklearn.base.BaseEstimator):
self._preprocessing_pipeline = self.preprocessing

#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
else: #if self.preprocessing is True or not a sklearn estimator

pipeline_steps = []

if self.categorical_features is not None: #if categorical features are specified, use those
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))

else:
if isinstance(X, pd.DataFrame):
categorical_columns = X.select_dtypes(include=['object']).columns
if len(categorical_columns) > 0:
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
else:
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
else:
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))

self._preprocessing_pipeline = sklearn.pipeline.Pipeline(pipeline_steps)

X = self._preprocessing_pipeline.fit_transform(X, y)

else:
self._preprocessing_pipeline = None

Expand Down

0 comments on commit a0095e5

Please sign in to comment.