From 3d9154906330d8c5ae0be525dd9a535ed913d426 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Fri, 14 Oct 2022 14:27:06 +0200 Subject: [PATCH] package structure reorg --- __init__.py | 0 development_notebook.Rmd | 256 ++++++++++++++++++ readme.md | 33 ++- setup.py | 34 +++ wildboottest/__init__.py | 1 + .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 145 bytes .../__pycache__/wildboottest.cpython-36.pyc | Bin 0 -> 6063 bytes {src => wildboottest}/benchmarks.py | 0 .../wildboottest.py | 8 +- 9 files changed, 311 insertions(+), 21 deletions(-) delete mode 100644 __init__.py create mode 100644 development_notebook.Rmd create mode 100644 setup.py create mode 100644 wildboottest/__init__.py create mode 100644 wildboottest/__pycache__/__init__.cpython-36.pyc create mode 100644 wildboottest/__pycache__/wildboottest.cpython-36.pyc rename {src => wildboottest}/benchmarks.py (100%) rename src/Wildboottest-method.py => wildboottest/wildboottest.py (97%) diff --git a/__init__.py b/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/development_notebook.Rmd b/development_notebook.Rmd new file mode 100644 index 0000000..d9408c8 --- /dev/null +++ b/development_notebook.Rmd @@ -0,0 +1,256 @@ +--- +title: "development notebook" +author: "Alexander Fischer" +date: "2022-09-16" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## in R + +... create all required input parameters for `boot_algo()` functions + +```{r, warning=FALSE, message = FALSE} +library(fixest) +library(reticulate) + +path_to_fwildclusterboot <- "C:/Users/alexa/Dropbox/fwildclusterboot" +devtools::load_all(path_to_fwildclusterboot) + + +# create the data set +N_G1 <- 50 +data2 <- fwildclusterboot:::create_data(N = 1000, + N_G1 = N_G1, + icc1 = 0.8, + N_G2 = N_G1, + icc2 = 0.8, + numb_fe1 = 10, + numb_fe2 = 5, + seed = 41224, + #seed = 123, + weights = 1:N / N) + +data2$group_id1 <- as.factor(data2$group_id1) + +ssc <- ssc(adj = FALSE, cluster.adj = FALSE, cluster.df = "min", fixef.K = "none") +boot_ssc <- boot_ssc(adj = FALSE, cluster.adj = FALSE, cluster.df = "min", fixef.K = "none") + +boot_algo <- "R" +clustid <- cluster <- c("group_id2") +param = "log_income" +R <- NULL +r <- 0 +beta0 <- NULL +B <- boot_iter <- 9999 +bootcluster = "min" +fe = NULL +sign_level = NULL +conf_int = NULL +seed = NULL +# beta0 = 0.1 +type = "rademacher" +impose_null = TRUE +p_val_type = NULL +tol = 1e-6 +maxiter = 10 +na_omit = TRUE +nthreads = 1 +sign_level = 0.05 +# full_enumeration = FAL +floattype = "Float64" +p_val_type = "two-tailed" +getauxweights = FALSE +turbo = FALSE +bootstrapc = FALSE +maxmatsize = NULL + +engine = "R" +bootstrap_type = "11" + +object <- feols(proposition_vote ~ treatment + log_income + as.factor(group_id1), cluster = ~group_id1, data = data2) + +etable(object) +system.time(boot_res <- boottest(object, param, B = 99999, clustid = ~group_id2)) +#ssc +pval(boot_res) +``` + +run the first part of `boottest.fixest()` + +```{r, warning = FALSE, message = FALSE} + if (!is.null(beta0)) { + stop( + "The function argument 'beta0' is deprecated. Please use the + function argument 'r' instead, by which it is replaced." + ) + } + + if (inherits(clustid, "formula")) { + clustid <- attr(terms(clustid), "term.labels") + } + + if (inherits(bootcluster, "formula")) { + bootcluster <- attr(terms(bootcluster), "term.labels") + } + + if (inherits(param, "formula")) { + param <- attr(terms(param), "term.labels") + } + + if (inherits(fe, "formula")) { + fe <- attr(terms(fe), "term.labels") + } + + internal_seed <- set_seed( + seed = seed, + engine = engine, + type = type + ) + + if (!is.null(object$fixef_removed)) { + stop( + paste( + "feols() removes fixed effects with the following values: ", + object$fixef_removed, + ". Currently, boottest()'s internal pre-processing does not + account for this deletion. Therefore, please exclude such fixed + effects prior to estimation with feols(). You can find them listed + under '$fixef_removed' of your fixest object." + ) + ) + } + + # -------------------------------------------- + + # check appropriateness of nthreads + nthreads <- check_set_nthreads(nthreads) + + if (is.null(clustid)) { + heteroskedastic <- TRUE + if (engine == "R") { + # heteroskedastic models should always be run through R-lean + engine <- "R-lean" + } + } else { + heteroskedastic <- FALSE + } + + + R_long <- process_R( + R = R, + param = param + ) + + + if (engine != "WildBootTests.jl") { + r_algo_checks( + R = R_long, + p_val_type = p_val_type, + conf_int = conf_int, + B = B + ) + } + + # check_params_in_model(object = object, param = param) + + check_boottest_args_plus( + object = object, + R = R_long, + param = param, + sign_level = sign_level, + B = B, + fe = fe + ) + + # preprocess the data: Y, X, weights, fixed_effect + preprocess <- preprocess2.fixest( + object = object, + clustid = clustid, + R = R_long, + param = param, + bootcluster = bootcluster, + fe = fe, + engine = engine + ) + + enumerate <- + check_set_full_enumeration( + preprocess = preprocess, + heteroskedastic = heteroskedastic, + B = B, + type = type, + engine = engine + ) + full_enumeration <- enumerate$full_enumeration + B <- enumerate$B + + N <- preprocess$N + k <- preprocess$k + G <- + vapply(preprocess$clustid, function(x) { + length(unique(x)) + }, numeric(1)) + vcov_sign <- preprocess$vcov_sign + + small_sample_correction <- + get_ssc( + boot_ssc_object = ssc, + N = N, + k = k, + G = G, + vcov_sign = vcov_sign, + heteroskedastic = heteroskedastic + ) + + # clustermin, clusteradj + + + clustid_dims <- preprocess$clustid_dims + # R*beta; + point_estimate <- + as.vector(object$coefficients[param] %*% preprocess$R0[param]) + + boot_vcov <- boot_coef <- NULL + +``` + +make all objects from `preprocess()` available in the global namespace + +```{r, warning = FALSE, message = FALSE} +res <- lapply(names(preprocess), function(x) assign(x, preprocess[[x]], envir = .GlobalEnv)) +bootcluster <- as.vector(bootcluster)[[1]] +``` + +## pass all values to python + +via the `reticulate` package + +```{python} +import numpy as np + +X = np.array(r.X) +y = np.array(r.Y) +#clustid_df = r.clustid_df +bootstrap_type = r.bootstrap_type +N_G_bootcluster = r.N_G_bootcluster +bootcluster = np.array(r.bootcluster) +cluster = np.array(bootcluster) +R = np.array(r.R0) +impose_null = True +B = int(r.boot_iter) +ssc = int(r.small_sample_correction) + +pval_type = r.p_val_type + +type(X) +X[0:10, 0:10] + +``` +develop ... + + + diff --git a/readme.md b/readme.md index 589fb0a..3f5037c 100644 --- a/readme.md +++ b/readme.md @@ -15,7 +15,8 @@ If you'd like to cooperate, either send us an Note: everything is still very much work in progress, and there are multiple errors in the code that I am aware of. Still, I believe that the implementation of the WCR11 is more or less correct. ``` -import wildboottest +import wildboottest.wildboottest as wb +import numpy as np import timeit import time @@ -26,7 +27,7 @@ X = np.random.normal(0, 1, N * k).reshape((N,k)) beta = np.random.normal(0,1,k) beta[0] = 0.005 u = np.random.normal(0,1,N) -y = 1 + X @ beta + u +Y = 1 + X @ beta + u cluster = np.random.choice(list(range(0,G)), N) bootcluster = cluster R = np.zeros(k) @@ -34,21 +35,19 @@ R[0] = 1 B = 99999 start_time = timeit.default_timer() -wb = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341) -wb.get_scores(bootstrap_type = "11", impose_null = True) -wb.get_numer() -wb.get_denom() -wb.numer -wb.denom -wb.get_tboot() -wb.t_boot -wb.get_vcov() -wb.get_tstat() -wb.get_pvalue(pval_type = "two-tailed") +wcr = wb.Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341) +wcr.get_scores(bootstrap_type = "11", impose_null = True) +wcr.get_numer() +wcr.get_denom() +wcr.numer +wcr.denom +wcr.get_tboot() +wcr.t_boot +wcr.get_vcov() +wcr.get_tstat() +wcr.get_pvalue(pval_type = "two-tailed") print("estimation time:", timeit.default_timer() - start_time) -# >>> 0.1981981981981982 -print("p value:", wb.pvalue) +# >>> 0.9225496 seconds +print("p value:", wcr.pvalue) # >>> p value: 0.15258152581525816 - - ``` diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..1e7aa32 --- /dev/null +++ b/setup.py @@ -0,0 +1,34 @@ +import pathlib +from setuptools import setup, find_packages + +HERE = pathlib.Path(__file__).parent + +VERSION = '0.1.0' +PACKAGE_NAME = 'wildboottest' +AUTHOR = ['Alexander Fischer', 'Aleksandr Michuda'] +AUTHOR_EMAIL = ['alexander-fischer1801@t-online.de', 'amichuda@gmail.com'] +URL = 'https://github.com/s3alfisc/wildboottest' + +LICENSE = 'MIT' +DESCRIPTION = 'Wild Cluster Bootstrap Inference for Linear Models in Python' +LONG_DESCRIPTION = (HERE / "readme.md").read_text() +LONG_DESC_TYPE = "text/markdown" + +INSTALL_REQUIRES = [ + 'numpy', + 'pandas', + 'numba' +] + +setup(name=PACKAGE_NAME, + version=VERSION, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, + long_description_content_type=LONG_DESC_TYPE, + author=AUTHOR, + license=LICENSE, + author_email=AUTHOR_EMAIL, + url=URL, + install_requires=INSTALL_REQUIRES, + packages=find_packages() + ) diff --git a/wildboottest/__init__.py b/wildboottest/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/wildboottest/__init__.py @@ -0,0 +1 @@ + diff --git a/wildboottest/__pycache__/__init__.cpython-36.pyc b/wildboottest/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5e813334ebcdf697d6d0aae630fad305bed6362 GIT binary patch literal 145 zcmXr!<>e9!_Dp7CU|@I*#Bjg}WH|tFF$<7LVF+f>Wb|9fPy`Z25Wno4tztrpQ;UjY z5_3{35@THQlS^|`^Gb?i$}@9PlJfIQN>YnUP&x7OnR%Hd@$q^EmA5!-a`RJ4b5iX< J<`x4n0{~GCB+LK+ literal 0 HcmV?d00001 diff --git a/wildboottest/__pycache__/wildboottest.cpython-36.pyc b/wildboottest/__pycache__/wildboottest.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28a4096418d2752495eaeb5c95246d02fb62643c GIT binary patch literal 6063 zcmZ`-O>o=B6~-<=kc3EyqNpGLM1f@|5mVcZ>Q0)vjwi7_iPN~6)QMvmbq0Y5NFq&< z)B^hREIXZ{)9Inp@u9~~554u)L#K!K)LV}|q=)o-497u2+h!X zWQ5j0dUhy2HtM-2m}9KhMQXO1NrO70-+0+iz=?QCPlf_Z)_57=Sz#`;9~oXQ%!dWk zc32D@)cJ5KETJxh=tcG)sgf|sRJbWQ7-EFqR&2BeIqBse@5{BWp z?sXLfN#q6{x4YSj8j0K8awo?@61v+_;`S~a1nvFE-8^>V{YE2-+ky@Q#gi)uvJIP3B*-6GhqkAyf$1{Ux2K0D?R_kmfw7i9lRqkQr zHn6)Y>a~ML#CDEtB+Cw)$!h_^l5K{Dm?<~pPG^ek*xLi<(h z=F)&He@!oeu7)Nd*~2p*bec&sXg425?Cf;fMh6Uz8^o~!z!*=6tUvh*zj{4&nsKv( z1KnvvsqBSb@kWpY@8R%9shyqPRBWW;UTW`mntS_EnjaqEbc)sj@qL1vxC3I?B~cKglC>P^ z2r8vvZ43UB(_%p^i3;_kC9=P1;aHAY6;&~fKg{F5f|!=WS=n67C_(;kelogU9BI>_ z08GCn6dQ$+@IJU&gd&t*$PkaB9>B_K2|Zz zA*WcvidoI*<3N%hap1pGN54(r{Vkv;km&J!AYQ+$?Uti+7&(=cbU8yLMvr`p=0mPa zCk!AF*A=vYJB8m_zf5>#6^-2w`Eq0Qxr3d)-v%jj;@_z8F4E{ zm|sGaTQd6ddY&zz=bVR3gtg2Pto!nC_9;ZZ1&#edBv=@YhNX)~-h`#MGD{OzSCcaX z`PfvedJ!jQXd7Y$_EOJQc;|)DS$a-7mcaV|@|!M0hdf0pe%~vhfjBb1t}Hm)6J#7f zp|#X%sDrgsT=J-Ki6clVjz00syVq|Mxs&G5x^zUQcRlb=3Dj>Vn&- zhP}5k=J%;On-;JPzZ8Nq+$1O-*FTUgg<+@Q-*W zJPEb(8*8kQmmliq*_l#QlRoMlTJv3yR5p*)1zNle3w^{f6vRTW%-X@U1bIT_m9lQu zOD*c9;zypt>}|3l74HtaJVRZhMW=?AkXam(GV*PDvh(zdAVfb_Acm|Uv>;r_ibP6W>J}Ap{}lw0IRO5T61O zxTYm)2rkQVSw38RCcHe=sX>Vl{ivqn&H;vw09=ZSU}u*M1W1u7O>SA-%dxA7^}3x! zeR|#M21qKeZ*zoCb9^Xq+WGBLu%6hNB5A<`{sKWv4riD(@o?d5s`8+*rg@B`4+Pfq zk7$sl2uo7^fztY+jtEi+HR32t5ExrI4mFtmHAN{uheM?3yk|&pThjxCL4St# z&01HNpnHR?q2CM&S@khW{3nnEd#@307irb}(eDz8L_h0G3Y+*qp%MInf`EjO&Nh0m z$LZtSx?ehaW6V7VZn=-(h_o3WD`W>Wbaa_oH41{Iy;}$dhISyyLBeBd_{2akFKP$2 zz6rUFXX1kkmK8&DfG1)^UKb!;hBOc8gkAI|86(viS~wnK$VziaUAH6kHuy+w_jl7= zuX`A&`i%M^vBb>`y*PoKqh&ZzJVmk`e+s8!GnF1?s7KqGN6Uko)Z0nL&hx%kZgh8h z`$^>U)W2huxCUaF)3SzatO_To3Rk!$9OS~+P90CWc%|KK1nv0h#b?&4SJY)J^wcuN z+LB$F1q{{f5$A|UZlTD%*v>3L9-F3q3_(4gImt&fh;CchI(3IypAxxC5%x_%hg22^+*lrgp+#J&1t}0Uix}kr8~Y@lQxI9FWx);Q$XBt4O{4p~n7+yQFd(qJj^ps$7shDpcsorDAD@Jq=rFt>`-h<> z+?-*bT_61c3@3O6*Vc4OB<>DqDbC-K)D{|>9YZ@gdoqX_<5r_itwG+e=t3!{KAmu;N34c`aJAQCpqk1NP=cN3LA9% z=|}pFjsFcgbm$RLR%sJZcoPO9p&E$$fX|Dc>+~>AV%!#uBjZU<(mrj+Qx2gtE^ZPV zkps9%+lkrV!E6h&A3Za6w07+oE>;*jUT*#2gb@!*(cXU09-73M`=z)FJRvwp>F9J) zGuVvP=jbc)K+n$Z1XFXj*{NIECKrj9C7-g_#y!Z_;FMFDZ07&4N?ZjokifV|X6QS& zihAKisYW)Zj#NRYP0Xp6=zF_^`xW0$9pA^b;eMOiCEq7ghLXJRhuwznt2Z!5eL&tDWh%|{j1r}~m{-x+x6H2hQg{mc;4-0n-cHwLxS11*1me=!~sq~I{%dk7R kz6oMQq1mGpW;4jJo7Q^KL(;;P%!I3yMe!Ld(A|OjKLCKK00000 literal 0 HcmV?d00001 diff --git a/src/benchmarks.py b/wildboottest/benchmarks.py similarity index 100% rename from src/benchmarks.py rename to wildboottest/benchmarks.py diff --git a/src/Wildboottest-method.py b/wildboottest/wildboottest.py similarity index 97% rename from src/Wildboottest-method.py rename to wildboottest/wildboottest.py index d208ecd..681d49b 100644 --- a/src/Wildboottest-method.py +++ b/wildboottest/wildboottest.py @@ -30,8 +30,8 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None): if isinstance(X, pd.DataFrame): self.X = X.values - if isinstance(y, pd.DataFrame): - self.y = y.values + if isinstance(Y, pd.DataFrame): + self.Y = Y.values if isinstance(cluster, pd.DataFrame): clustid = cluster.unique() self.cluster = cluster.values @@ -74,7 +74,7 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None): # split X and Y by (boot)cluster X_g = X[np.where(bootcluster == g)] - Y_g = y[np.where(bootcluster == g)] + Y_g = Y[np.where(bootcluster == g)] tXgXg = np.transpose(X_g) @ X_g tXgyg = np.transpose(X_g) @ Y_g X_list.append(X_g) @@ -228,7 +228,7 @@ def compute_denom(Cg, H, bootclustid, B, G, v, ssc): def get_tboot(self): t_boot = self.numer / np.sqrt(self.denom) - self.t_boot = t_boot[1:(B+1)] # drop first element - might be useful for comp. of + self.t_boot = t_boot[1:(self.B+1)] # drop first element - might be useful for comp. of def get_vcov(self):