-
Notifications
You must be signed in to change notification settings - Fork 0
/
param_opt.py
152 lines (133 loc) · 6.41 KB
/
param_opt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
import pandas as pd
import argparse
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
# Load dataset
def load_data(file_path):
data = pd.read_csv(file_path)
X = data.iloc[:, :-1] # Features (all columns except the last)
y = data.iloc[:, -1] # Labels (last column)
return X, y
# Define the classifiers and their parameter grids
def get_classifiers_and_param_grids():
classifiers = {
'DT': (DecisionTreeClassifier(random_state=42, class_weight='balanced'), {
'criterion': ['gini', 'entropy'],
'max_depth': [10, 20, 50, 100],
'min_samples_split': [10, 20, 30, 40, 50],
'max_features': ['auto', 'sqrt', 'log2']
}),
'RF': (RandomForestClassifier(random_state=42, class_weight='balanced'), {
'n_estimators': [10, 50, 100, 200],
'criterion': ['gini', 'entropy'],
'max_depth': [10, 20, 50, 100],
'max_features': ['auto', 'sqrt', 'log2']
}),
'LR': (LogisticRegression(random_state=42, class_weight='balanced'), {
'penalty': ['l1', 'l2'],
'C': np.logspace(-4, 4, 20),
'solver': ['liblinear']
}),
'XGB': (XGBClassifier(random_state=42), {
'min_child_weight': [1, 5, 10],
'gamma': [0.5, 1, 1.5, 2, 5],
'subsample': [0.6, 0.8, 1.0],
'colsample_bytree': [0.6, 0.8, 1.0],
'max_depth': [3, 4, 5]
}),
'GNB': (GaussianNB(), {
'var_smoothing': np.logspace(0, -9, num=100)
}),
'ET': (ExtraTreesClassifier(random_state=42, class_weight='balanced'), {
'n_estimators': [10, 50, 100, 200],
'criterion': ['gini', 'entropy'],
'max_depth': [10, 20, 50, 100],
'max_features': ['auto', 'sqrt', 'log2']
}),
'KNN': (KNeighborsClassifier(), {
'n_neighbors': np.arange(50, 100),
'metric': ['minkowski', 'euclidean', 'manhattan'],
'weights': ['uniform', 'distance']
}),
'SVC': (SVC(random_state=42, class_weight='balanced', probability=True), {
'kernel': ['rbf', 'linear'],
'gamma': [0.0005, 0.001, 0.05, 0.1],
'C': [0.001, 0.01, 0.1, 1, 10]
})
}
return classifiers
# Function to perform grid search and calculate AUROC
def perform_grid_search(X, y, classifier_name, classifier, param_grid):
best_params_results = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Dynamically adjust all KNN parameters based on the number of samples in each fold
if classifier_name == 'KNN':
# Get the smallest training fold size from cross-validation
smallest_fold_size = min(len(train_idx) for train_idx, _ in cv.split(X, y))
# Adjust n_neighbors to be <= smallest fold size
max_n_neighbors = min(smallest_fold_size, param_grid['n_neighbors'].max())
param_grid['n_neighbors'] = np.arange(1, max_n_neighbors + 1)
# Print the adjusted n_neighbors range for debugging
print(f"Adjusted n_neighbors range for KNN: {param_grid['n_neighbors']}")
try:
print(f"Running Grid Search for {classifier_name}...")
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
y_proba = grid_search.predict_proba(X)[:, 1]
auroc = roc_auc_score(y, y_proba)
best_params_results.append({
'Classifier': classifier_name,
'Best Parameters': grid_search.best_params_,
'AUROC': auroc
})
print(f"Best parameters for {classifier_name}: {grid_search.best_params_} with AUROC: {auroc}")
except Exception as e:
print(f"Error with {classifier_name}: {e}")
return best_params_results
# Save the best parameters and AUROC to a txt file
def save_best_params(best_params_results, output_file):
with open(output_file, 'w') as f:
f.write("Classifier\tBest Parameters\tAUROC\n")
for result in best_params_results:
f.write(f"{result['Classifier']}\t{result['Best Parameters']}\t{result['AUROC']:.4f}\n")
if __name__ == "__main__":
# Setup argparse to get input arguments
parser = argparse.ArgumentParser(description='Hypertune classifiers with GridSearchCV.')
parser.add_argument('--file', type=str, required=True, help='Path to the input CSV file containing features and labels.')
parser.add_argument('--classifier', type=str, choices=['DT', 'RF', 'LR', 'XGB', 'GNB', 'ET', 'KNN', 'SVC', 'ALL'],
default='DT', help='Classifier to use (default: DT). Use "ALL" to run all classifiers.')
parser.add_argument('--output', type=str, default='best_params.txt', help='Output file to save best parameters and AUROC (default: best_params.txt).')
args = parser.parse_args()
# Load data
file_path = args.file
X, y = load_data(file_path)
# Get classifiers and their corresponding parameter grids
classifiers = get_classifiers_and_param_grids()
# If user chooses "ALL", run for all classifiers, otherwise run for the specified classifier
if args.classifier == "ALL":
selected_classifiers = classifiers
else:
selected_classifiers = {args.classifier: classifiers.get(args.classifier)}
if selected_classifiers[args.classifier] is None:
print(f"Error: Classifier {args.classifier} is not valid. Please choose from {list(classifiers.keys())} or 'ALL'.")
exit(1)
# Perform grid search and calculate AUROC
all_results = []
for clf_name, (clf, param_grid) in selected_classifiers.items():
results = perform_grid_search(X, y, clf_name, clf, param_grid)
all_results.extend(results)
# Save best parameters and AUROC
output_file = args.output
save_best_params(all_results, output_file)
print(f"Best parameters and AUROC saved to {output_file}")