-
Notifications
You must be signed in to change notification settings - Fork 0
/
reg_dml.py
300 lines (268 loc) · 13.3 KB
/
reg_dml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#################################################################
# Code written by Gang Xu ([email protected])
# For bug issues, please contact author using the email address
#################################################################
import numpy as np
import pandas as pd
from math import sqrt
import pickle
import time
import math
import statsmodels.api as sm
from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
from scipy import stats
from doubleml.datasets import make_plr_CCDDHNR2018
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import multiprocessing as mp
from sklearn.preprocessing import StandardScaler
import copy
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
Nested, Autoregressive)
# estimate the variance of dml estimator
def var_est_dml(dml_model):
#dml_model: a result object generated by doubleml.DoubleMLPLR function
(x, y, d) = (dml_model._dml_data.x,dml_model._dml_data.y,dml_model._dml_data.d)
theta_hat=dml_model.coef[0]
psi_a=dml_model.psi_elements['psi_a']
psi=dml_model.psi
model_g_predict=((dml_model.predictions)["ml_g"]).reshape(-1,1)
model_m_predict=((dml_model.predictions)["ml_m"]).reshape(-1,1)
if dml_model.draw_sample_splitting:
_var_scaling_factor = dml_model._dml_data.n_obs
y_test=np.array(y)
d_test=np.array(d)
x_test=np.array(x)
else:
smpls = dml_model._smpls
test_index =smpls[0][0][1]
_var_scaling_factor = len(test_index)
psi_a=psi_a[test_index]
psi = psi[test_index]
model_g_predict=model_g_predict[test_index]
model_m_predict=model_m_predict[test_index]
y_test=np.array(y)[test_index]
d_test=np.array(d)[test_index]
x_test=np.array(x)[test_index,:]
w1_x=np.concatenate((d_test.reshape(-1,1),x_test),axis=1)
w1_x_add_cons=np.array(sm.add_constant(w1_x))
J = np.mean(psi_a)
if dml_model.score=="IV-type":
sigma2_hat = 1 / _var_scaling_factor * np.mean(np.power(psi, 2)) / np.power(J, 2)
else:
sigma2_hat = np.linalg.inv(np.dot(w1_x_add_cons.T,w1_x_add_cons))[1,1]*np.sum(np.power(psi.reshape(-1)/d_test,2))/(_var_scaling_factor-x_test.shape[1]-2)
return sqrt(sigma2_hat)
#estimate the variance of DML-RC estimator
def var_est_gee(dml_model,using_lasso,w1_x_pred_per_add_cons, xw_nm_m,covmatrix_all,gee_params):
#dml_model: a result object generated by doubleml.DoubleMLPLR function
#using_lasso: Boolean value, indicate the main model is linear (True) or not (False)
#w1_x_pred_per_add_cons: matrix, corrected measurement of exposure and covariate in main study
#xw_nm_m: matrix, nearest monitor measurement of exposure and covariate in main study
#covmatrix_all: matrix,the covariance matrix estimated parameter from measurement error model using GEE
#gee_params:vector, estimated parameter from measurement error model using GEE
(x, y, d) = (dml_model._dml_data.x,dml_model._dml_data.y,dml_model._dml_data.d)
theta_hat=dml_model.coef[0]
psi_a=dml_model.psi_elements['psi_a']
psi=dml_model.psi
model_g_predict=((dml_model.predictions)["ml_g"]).reshape(-1,1)
model_m_predict=((dml_model.predictions)["ml_m"]).reshape(-1,1)
if dml_model.draw_sample_splitting:
_var_scaling_factor = dml_model._dml_data.n_obs
y_test=np.array(y)
d_test=np.array(d)
x_test=np.array(x)
nm_xd_test=xw_nm_m
else:
smpls = dml_model._smpls
test_index =smpls[0][0][1]
_var_scaling_factor = len(test_index)
psi_a=psi_a[test_index]
psi = psi[test_index]
model_g_predict=model_g_predict[test_index]
model_m_predict=model_m_predict[test_index]
y_test=np.array(y)[test_index]
d_test=np.array(d)[test_index]
x_test=np.array(x)[test_index,:]
nm_xd_test=xw_nm_m[test_index,:]
w1_x=np.concatenate((d_test.reshape(-1,1),x_test),axis=1)
w1_x_add_cons=np.array(sm.add_constant(w1_x))
J = np.mean(psi_a)
if dml_model.score=="IV-type":
sigma2_hat = 1 / _var_scaling_factor * np.mean(np.power(psi, 2)) / np.power(J, 2)
else:
sigma2_hat = np.linalg.inv(np.dot(w1_x_add_cons.T,w1_x_add_cons))[1,1]*np.sum(np.power(psi.reshape(-1)/d_test,2))/(_var_scaling_factor-x_test.shape[1]-2)
devpsi=[]
devpsi0=np.zeros(covmatrix_all.shape[0])
ele_total=int(len(gee_params)/xw_nm_m.shape[1])
ele_num=0
for ele_num in range(ele_total):
index=np.array(range(xw_nm_m.shape[1]))+(ele_num)*(xw_nm_m.shape[1])
x1=xw_nm_m
sum_all=np.zeros(x1.shape[1])
if(using_lasso==True):
for k in range(x1.shape[1]):
params=copy.deepcopy(gee_params[index])
params[k]=params[k]+0.0001
dml_model_copy=copy.deepcopy(dml_model)
if(ele_num==0):
pred_d=np.dot(x1,params)
for num in range(len(dml_model_copy._dml_data.d)):
dml_model_copy._dml_data.d[num]=pred_d[num]
else:
dml_model_copy._dml_data.x[:,ele_num-1]=np.dot(x1,params)
dml_model_copy.fit(store_predictions=True,store_models=True)
g_pred=((dml_model_copy.predictions)["ml_g"]).reshape(-1)
m_pred=((dml_model_copy.predictions)["ml_m"]).reshape(-1)
d_new=dml_model_copy._dml_data.d
dml_model_new_psi = dml_model_copy.psi
if dml_model.draw_sample_splitting:
d_new_test=np.array(d_new)
else:
d_new_test=np.array(d_new)[test_index]
g_pred=g_pred[test_index]
m_pred=m_pred[test_index]
dml_model_new_psi = dml_model_copy.psi[test_index]
if dml_model.score=="IV-type":
new_psi=(d_new_test-m_pred)*(y_test-(d_new_test)*theta_hat-g_pred)
else:
new_psi=(d_new_test)*(y_test-(d_new_test)*theta_hat-g_pred)
for kn in range(len(y_test)):
sum_all[k]=sum_all[k]+(new_psi)[kn]-psi[kn]
sum_all=sum_all*10000
devpsi=np.concatenate((devpsi,sum_all), axis=None)
var_nmall=np.dot(devpsi,np.dot(covmatrix_all,devpsi))/nm_xd_test.shape[0]
var_nmD=np.dot(devpsi0,np.dot(covmatrix_all,devpsi0))/nm_xd_test.shape[0]
var_total2=sigma2_hat+ var_nmall / np.power(np.sum(psi_a), 2)
return sqrt(sigma2_hat+1 / _var_scaling_factor * var_nmD / np.power(J, 2)),sqrt(var_total2)
#calculate estimator and covariance of estimator in measurement error model using GEE
def data_me_gee(x_per_v,x_nm_v,w1_per_v,w1_nm_v,w2_v):
#x_per_v: personal measurement of exposure of interest in validation study
#x_nm_v: nearest monitor measurement of exposure of interest in validation study
#x1_per_v: personal measurement of other exposure in validation study
#x1_nm_v: nearest monitor measurement of other exposure in validation study
#w2_v: other covariate without measurement error in validation study
x_per_v=np.array(x_per_v).reshape(-1)
w1_per_v=np.array(w1_per_v)
x_nm_v=np.array(x_nm_v).reshape(-1,1)
w1_nm_v=np.array(w1_nm_v)
xw_nm_v=np.concatenate((x_nm_v,w1_nm_v,w2_v),axis=1)
I=np.identity(1+w1_per_v.shape[1])
xw_nm_v =sm.add_constant(xw_nm_v)
x_all=np.kron(I,xw_nm_v)
y_all=x_per_v
id_all=range(len(y_all))
for i in range(w1_per_v.shape[1]):
y_all=np.concatenate((y_all,w1_per_v[:,i]),axis=0)
id=range(len(w1_per_v[:,i]))
id_all=np.concatenate((id_all,id),axis=0)
cov_struct = Independence()
gee_model = sm.GEE(y_all, x_all, groups=id_all, cov_struct=cov_struct)
gee_results = gee_model.fit()
sigma=np.zeros((w1_per_v.shape[1]+1,w1_per_v.shape[1]+1))
unique_id=np.unique(id_all)
for j in unique_id:
yj=y_all[id_all==j]
xj=x_all[id_all==j,:]
res_1=(yj-xj@gee_results.params).reshape(-1,1)
sigma=sigma+res_1@res_1.T
sigma=sigma/(len(unique_id)-1)
sigma_inv=np.linalg.inv(sigma)
I22=np.zeros((x_all.shape[1],x_all.shape[1]))
C22=np.zeros((x_all.shape[1],x_all.shape[1]))
for j in unique_id:
yj=y_all[id_all==j]
xj=x_all[id_all==j,:]
I22=I22+xj.T@sigma_inv@xj
res_vec=xj.T@sigma_inv@(yj-xj@gee_results.params).reshape(-1,1)
C22=C22+res_vec@res_vec.T
I22_inv=np.linalg.inv(I22)
var22=I22_inv@C22@I22_inv
data = list()
data.append(gee_results.params)
data.append(var22)
return data
#using estimated parameter in measurement erro model to estimate exposure in main study
def gee_predict(x_nm_m,w1_nm_m,w2_m,gee_params):
#x_nm_m: nearest monitor measurement of exposure of interest in the main study
#x1_nm_m: nearest monitor measurement of other exposure in the main study
#w2_m: other covariate without measurement error in the main study
#gee_params:vector, estimated parameter from measurement error model using GEE
x_nm_m=np.array(x_nm_m).reshape(-1,1)
w1_nm_m=np.array(w1_nm_m)
xw_nm_m=np.concatenate((x_nm_m,w1_nm_m,w2_m),axis=1)
xw_nm_m=sm.add_constant(xw_nm_m)
x_pred_per=xw_nm_m@gee_params[0:xw_nm_m.shape[1]]
w1_pred_per=np.zeros(w1_nm_m.shape)
for n in range(w1_pred_per.shape[1]):
index=np.array(range(xw_nm_m.shape[1]))+(n+1)*(xw_nm_m.shape[1])
w1_pred_per[:,n]=xw_nm_m@gee_params[index]
return (x_pred_per,w1_pred_per,xw_nm_m)
#estimate the parameter of interest and standard deviation of estimator by DML-RC
def dml(data,ml_l,ml_m,ml_g,true_para, ME,using_lasso, w1_x_pred_per_add_cons, xw_nm_m,gee_var,gee_params):
#data: object, (x,y,d) where x is the exposure of interest,y is the outcome, d is the covariates
#ml_l, ml_m, ml_g: prespecified machine learning functions
#true_para: true parameter of interest
#ME: Boolean value, indicate using measurement error model (True) or not (False)
#using_lassso: oolean value, indicate using lasso as machine learning function (True) or not (False)
#w1_x_pred_per_add_cons: matrix, corrected measurement of exposure and covariate in main study
#xw_nm_m: matrix, nearest monitor measurement of exposure and covariate in main study
#covmatrix_all: matrix,the covariance matrix estimated parameter from measurement error model using GEE
#gee_params:vector, estimated parameter from measurement error model using GEE
(x, y, d) = data
obj_dml_data = DoubleMLData.from_arrays(x, y, d)
obj_dml_plr = DoubleMLPLR(obj_dml_data,
ml_l, ml_m, ml_g,
n_folds=2,
score='IV-type')
obj_dml_plr.fit(store_predictions=True,store_models=True)
this_theta_dml_plr = obj_dml_plr.coef[0]
se_orth=obj_dml_plr.se[0]
se_orth_v2=var_est_dml(obj_dml_plr)
if ME:
se_orth1,se_orth2=var_est_gee(obj_dml_plr,using_lasso,w1_x_pred_per_add_cons, xw_nm_m,gee_var,gee_params)
results=[this_theta_dml_plr-true_para,se_orth2]
else:
results=[this_theta_dml_plr-true_para,se_orth_v2]
return results
#estimate the parameter of interest and standard deviation of estimator by DML-RC using LASSO
def dml_lasso(data,true_para, ME,using_lasso, w1_x_pred_per_add_cons, xw_nm_m,gee_var,gee_params):
#data: object, (x,y,d) where x is the exposure of interest,y is the outcome, d is the covariates
#true_para: true parameter of interest
#ME: Boolean value, indicate using measurement error model (True) or not (False)
#using_lassso: oolean value, indicate using lasso as machine learning function (True) or not (False)
#w1_x_pred_per_add_cons: matrix, corrected measurement of exposure and covariate in main study
#xw_nm_m: matrix, nearest monitor measurement of exposure and covariate in main study
#covmatrix_all: matrix,the covariance matrix estimated parameter from measurement error model using GEE
#gee_params:vector, estimated parameter from measurement error model using GEE
ml_l = linear_model.Lasso(max_iter=100000, warm_start=True)
ml_m = linear_model.Lasso(max_iter=100000, warm_start=True)
ml_g = linear_model.Lasso(max_iter=100000, warm_start=True)
(x, y, d) = data
obj_dml_data = DoubleMLData.from_arrays(x, y, d)
par_grids = {'ml_l': {'alpha': np.arange(0.001, 0.1, 0.005)},
'ml_m': {'alpha': np.arange(0.001, 0.1, 0.005)},
'ml_g': {'alpha': np.arange(0.001, 0.1, 0.005)}}
obj_dml_data = DoubleMLData.from_arrays(x, y, d)
obj_dml_plr = DoubleMLPLR(obj_dml_data,
ml_l, ml_m, ml_g,
n_folds=2,
score='IV-type')
obj_dml_plr.tune(par_grids, search_mode='grid_search');
print("ML_full_sample",obj_dml_plr.params)
obj_dml_plr.fit(store_predictions=True,store_models=True)
this_theta_dml_plr = obj_dml_plr.coef[0]
se_orth=obj_dml_plr.se[0]
se_orth_v2=var_est_dml(obj_dml_plr)
if ME:
se_orth1,se_orth2=var_est_gee(obj_dml_plr,using_lasso,w1_x_pred_per_add_cons, xw_nm_m,gee_var,gee_params)
results=[this_theta_dml_plr-true_para,se_orth2]
else:
results=[this_theta_dml_plr-true_para,se_orth_v2]
return results