forked from LeoGrin/tabular-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
158 lines (132 loc) · 6.76 KB
/
train.py
File metadata and controls
158 lines (132 loc) · 6.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import numpy as np
from create_models import create_model
import os
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
def skorch_evaluation(model, x_train, x_val, x_test, y_train, y_val, y_test, config, model_id, return_r2):
    """
    Evaluate a fitted skorch model on the train / validation / test splits.

    Regression scores are R2 when ``return_r2`` is True, otherwise RMSE;
    classification scores are accuracy.  When ``x_val`` is None the
    validation score is returned as None.  If ``model__use_checkpoints`` is
    set in ``config``, the checkpointed parameters are loaded back into
    ``model`` (side effect for the caller) and the checkpoint file is
    removed afterwards unless ``return_r2`` is set.

    Returns (train_score, val_score, test_score).
    """
    is_regression = "regression" in config.keys() and config["regression"]

    def _score(y_true, y_hat):
        # Score one split: R2 / RMSE for regression, accuracy otherwise.
        if is_regression:
            if return_r2:
                return r2_score(y_true.reshape(-1), y_hat.reshape(-1))
            return np.sqrt(np.mean((y_hat.reshape(-1) - y_true.reshape(-1)) ** 2))
        return np.sum(y_hat == y_true) / len(y_true)

    # All predictions are made up front, before any checkpoint reload.
    y_hat_train = model.predict(x_train)
    y_hat_val = model.predict(x_val) if x_val is not None else None
    y_hat_test = model.predict(x_test)

    train_score = _score(y_train, y_hat_train)

    # NOTE(review): the checkpoint is loaded *after* predicting, so it only
    # mutates `model` for the caller; the scores below still come from the
    # pre-reload predictions (same ordering as the original code).
    if "model__use_checkpoints" in config.keys() and config["model__use_checkpoints"]:
        checkpoint_path = r"skorch_cp/params_{}.pt".format(model_id)
        if not config["regression"]:
            print("Using checkpoint")
            model.load_params(checkpoint_path)
        else:
            # A TransformedTargetRegressor wraps the skorch net, so the
            # checkpoint must be loaded into the inner fitted regressor.
            if config["transformed_target"]:
                model.regressor_.load_params(checkpoint_path)
            else:
                model.load_params(checkpoint_path)

    val_score = _score(y_val, y_hat_val) if x_val is not None else None
    test_score = _score(y_test, y_hat_test)

    if "model__use_checkpoints" in config.keys() and config["model__use_checkpoints"] and not return_r2:
        try:
            os.remove(r"skorch_cp/params_{}.pt".format(model_id))
        except OSError:  # was a bare except; only file-removal errors can occur here
            print("could not remove params file")
    return train_score, val_score, test_score
def sklearn_evaluation(fitted_model, x_train, x_val, x_test, y_train, y_val, y_test, config, return_r2):
    """
    Evaluate a fitted sklearn-style model on the train / validation / test splits.

    Regression scores are R2 when ``return_r2`` is True, otherwise RMSE;
    classification scores are accuracy.  The regression/classification
    choice is driven by ``config["regression"]``.

    Returns (train_score, val_score, test_score).
    """
    is_regression = "regression" in config.keys() and config["regression"]

    def _score(y_true, y_hat):
        # Score one split: R2 / RMSE for regression, accuracy otherwise.
        # (Was copy-pasted three times in the original.)
        if is_regression:
            if return_r2:
                return r2_score(y_true.reshape(-1), y_hat.reshape(-1))
            return np.sqrt(np.mean((y_hat.reshape(-1) - y_true.reshape(-1)) ** 2))
        return np.sum(y_hat == y_true) / len(y_true)

    train_score = _score(y_train, fitted_model.predict(x_train))
    val_score = _score(y_val, fitted_model.predict(x_val))
    test_score = _score(y_test, fitted_model.predict(x_test))
    return train_score, val_score, test_score
def evaluate_model(fitted_model, x_train, y_train, x_val, y_val, x_test, y_test, config, model_id=None, return_r2=False):
    """
    Dispatch evaluation to the scorer matching ``config["model_type"]``.

    "sklearn" and "tab_survey" models share the sklearn scorer; "skorch"
    models use the skorch scorer, which additionally needs ``model_id`` for
    checkpoint handling.  Returns (train_score, val_score, test_score).
    """
    model_type = config["model_type"]
    if model_type == "skorch":
        scores = skorch_evaluation(fitted_model, x_train, x_val, x_test,
                                   y_train, y_val, y_test, config, model_id,
                                   return_r2=return_r2)
    elif model_type in ("sklearn", "tab_survey"):
        scores = sklearn_evaluation(fitted_model, x_train, x_val, x_test,
                                    y_train, y_val, y_test, config,
                                    return_r2=return_r2)
    return scores
def train_model(iter, x_train, y_train, categorical_indicator, config):
    """
    Build and fit the model described by ``config``.

    Parameters
    ----------
    iter : iteration number of the run; mixed into the run id used for
        checkpoint file names.  (Parameter name kept for caller
        compatibility even though it shadows the ``iter`` builtin.)
    x_train, y_train : training features and target (numpy arrays).
    categorical_indicator : boolean mask over columns of ``x_train``,
        True for categorical features.
    config : run configuration; keys read here include ``model_type``,
        ``regression``, ``transformed_target``, ``data__categorical``
        and ``one_hot_encoder``.

    Returns
    -------
    (model, run_id) : the fitted model and the run id (None for plain
        sklearn models, which do not checkpoint).
    """
    print("Training")
    # Quasi-unique run id for checkpoint files.  NOTE(review): Python's
    # str hash is salted per process, so this id is not stable across runs.
    run_id = hash(".".join(list(config.keys())) + "." + str(iter))
    if config["model_type"] == "skorch":
        model_raw = create_model(config, categorical_indicator, id=run_id)  # TODO rng ??
    elif config["model_type"] == "sklearn":
        run_id = None  # no checkpointing for plain sklearn models
        model_raw = create_model(config, categorical_indicator)
    elif config["model_type"] == "tab_survey":
        # Cardinality of each categorical feature (max category code + 1).
        cat_dims = list((x_train[:, categorical_indicator].max(0) + 1).astype(int))
        model_raw = create_model(config, categorical_indicator, num_features=x_train.shape[1],
                                 id=run_id, cat_dims=cat_dims)
    if config["regression"] and config["transformed_target"]:
        # Fit on a quantile-gaussianized target; predictions are
        # automatically mapped back to the original target scale.
        model = TransformedTargetRegressor(model_raw, transformer=QuantileTransformer(output_distribution="normal"))
    else:
        model = model_raw
    if config["data__categorical"] and "one_hot_encoder" in config.keys() and config["one_hot_encoder"]:
        categorical_cols = [i for i in range(x_train.shape[1]) if categorical_indicator[i]]
        numerical_cols = [i for i in range(x_train.shape[1]) if not categorical_indicator[i]]
        preprocessor = ColumnTransformer(
            [("one_hot", OneHotEncoder(categories="auto", handle_unknown="ignore"), categorical_cols),
             ("numerical", "passthrough", numerical_cols)])
        model = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    if config["model_type"] == "tab_survey":
        # tab_survey models take an explicit validation set in fit();
        # carve the last 20% of the training data off for it.
        split = int(len(x_train) * 0.8)
        x_val, y_val = x_train[split:], y_train[split:]
        x_train, y_train = x_train[:split], y_train[:split]
        model.fit(x_train, y_train, x_val, y_val)
    else:
        model.fit(x_train, y_train)
    return model, run_id