package structure reorg

py-econometrics · Oct 14, 2022 · 3d91549 · 3d91549
1 parent e752f03
commit 3d91549
Show file tree

Hide file tree

Showing 9 changed files with 311 additions and 21 deletions.
diff --git a/__init__.py b/__init__.py
diff --git a/development_notebook.Rmd b/development_notebook.Rmd
@@ -0,0 +1,256 @@
+---
+title: "development notebook"
+author: "Alexander Fischer"
+date: "2022-09-16"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## in R
+
+Create all input parameters required by the `boot_algo()` functions.
+
+```{r, warning=FALSE, message = FALSE}
+# Setup: load the packages, simulate a data set and define every argument
+# that `boottest()` would normally receive, so that the body of
+# `boottest.fixest()` can be stepped through interactively further below.
+library(fixest)
+library(reticulate)
+
+# Location of the local fwildclusterboot checkout. Set the environment
+# variable FWILDCLUSTERBOOT_PATH to override it; otherwise the original
+# hard-coded path is used.
+path_to_fwildclusterboot <- Sys.getenv(
+  "FWILDCLUSTERBOOT_PATH",
+  unset = "C:/Users/alexa/Dropbox/fwildclusterboot"
+)
+devtools::load_all(path_to_fwildclusterboot)
+
+
+# create the data set
+# NOTE(review): `N` is not defined anywhere in this notebook before this call,
+# so `weights = 1:N / N` can only work if create_data() never evaluates its
+# `weights` argument -- confirm.
+N_G1 <- 50
+data2 <- fwildclusterboot:::create_data(N = 1000,
+                                         N_G1 = N_G1,
+                                         icc1 = 0.8,
+                                         N_G2 = N_G1,
+                                         icc2 = 0.8,
+                                         numb_fe1 = 10,
+                                         numb_fe2 = 5,
+                                         seed = 41224,
+                                         #seed = 123,
+                                         weights = 1:N / N)
+
+data2$group_id1 <- as.factor(data2$group_id1)
+
+# Small-sample-correction settings. These assignments shadow the functions
+# `ssc()` and `boot_ssc()` with their return values.
+ssc <- ssc(adj = FALSE, cluster.adj = FALSE, cluster.df = "min", fixef.K = "none")
+boot_ssc <- boot_ssc(adj = FALSE, cluster.adj = FALSE, cluster.df = "min", fixef.K = "none")
+
+# Arguments of `boottest()`, defined as global variables.
+boot_algo <- "R"
+clustid <- cluster <- c("group_id2")
+param <- "log_income"
+R <- NULL
+r <- 0
+# `beta0` is deprecated in favour of `r`; a non-NULL value (e.g. 0.1) makes
+# the next chunk stop with an error.
+beta0 <- NULL
+B <- boot_iter <- 9999
+bootcluster <- "min"
+fe <- NULL
+conf_int <- NULL
+seed <- NULL
+type <- "rademacher"
+impose_null <- TRUE
+tol <- 1e-6
+maxiter <- 10
+na_omit <- TRUE
+nthreads <- 1
+sign_level <- 0.05
+# full_enumeration <- FALSE
+floattype <- "Float64"
+p_val_type <- "two-tailed"
+getauxweights <- FALSE
+turbo <- FALSE
+bootstrapc <- FALSE
+maxmatsize <- NULL
+
+engine <- "R"
+bootstrap_type <- "11"
+
+# Reference model and reference bootstrap result computed by the R package.
+object <- feols(
+  proposition_vote ~ treatment + log_income + as.factor(group_id1),
+  cluster = ~group_id1,
+  data = data2
+)
+
+etable(object)
+# NOTE: this call uses B = 99999, not the `B` (9999) defined above.
+system.time(boot_res <- boottest(object, param, B = 99999, clustid = ~group_id2))
+#ssc
+pval(boot_res)
+```
+
+Run the first part of `boottest.fixest()`.
+
+```{r, warning = FALSE, message = FALSE}
+  # Interactive replay of the first part of `boottest.fixest()`. It reads the
+  # global variables defined in the setup chunk (object, param, clustid,
+  # bootcluster, fe, seed, engine, type, nthreads, R, B, ...) and calls
+  # helper functions that are expected to be in scope after
+  # `devtools::load_all()`.
+
+  # `beta0` has been replaced by `r`; refuse to continue if it is still set.
+  if (!is.null(beta0)) {
+    stop(
+      "The function argument 'beta0' is deprecated. Please use the
+      function argument 'r' instead, by which it is replaced."
+    )
+  }
+  
+  # Arguments supplied as formulas are reduced to character vectors holding
+  # the formula's term labels.
+  if (inherits(clustid, "formula")) {
+    clustid <- attr(terms(clustid), "term.labels")
+  }
+  
+  if (inherits(bootcluster, "formula")) {
+    bootcluster <- attr(terms(bootcluster), "term.labels")
+  }
+  
+  if (inherits(param, "formula")) {
+    param <- attr(terms(param), "term.labels")
+  }
+  
+  if (inherits(fe, "formula")) {
+    fe <- attr(terms(fe), "term.labels")
+  }
+  
+  # NOTE(review): presumably initialises the random number generator for the
+  # chosen engine / weight type -- confirm against set_seed().
+  internal_seed <- set_seed(
+    seed = seed,
+    engine = engine,
+    type = type
+  )
+  
+  # Abort when feols() has dropped fixed effects: the pre-processing below
+  # does not account for such deletions (see the message text).
+  if (!is.null(object$fixef_removed)) {
+    stop(
+      paste(
+        "feols() removes fixed effects with the following values: ",
+        object$fixef_removed,
+        ". Currently, boottest()'s internal pre-processing does not
+        account for this deletion. Therefore, please exclude such fixed
+        effects prior to estimation with feols(). You can find them listed
+        under '$fixef_removed' of your fixest object."
+      )
+    )
+  }
+  
+  # --------------------------------------------
+  
+  # check appropriateness of nthreads
+  nthreads <- check_set_nthreads(nthreads)
+  
+  # Without a cluster variable the bootstrap is treated as heteroskedastic,
+  # and the "R" engine is switched to "R-lean".
+  if (is.null(clustid)) {
+    heteroskedastic <- TRUE
+    if (engine == "R") {
+      # heteroskedastic models should always be run through R-lean
+      engine <- "R-lean"
+    }
+  } else {
+    heteroskedastic <- FALSE
+  }
+  
+  
+  # Build `R_long` from `R` and `param`; `R_long` is what is passed as `R`
+  # to all helpers below.
+  R_long <- process_R(
+    R = R,
+    param = param
+  )
+  
+  
+  # These checks are skipped when the engine is "WildBootTests.jl".
+  if (engine != "WildBootTests.jl") {
+    r_algo_checks(
+      R = R_long,
+      p_val_type = p_val_type,
+      conf_int = conf_int,
+      B = B
+    )
+  }
+  
+  # check_params_in_model(object = object, param = param)
+  
+  check_boottest_args_plus(
+    object = object,
+    R = R_long,
+    param = param,
+    sign_level = sign_level,
+    B = B,
+    fe = fe
+  )
+  
+  # preprocess the data: Y, X, weights, fixed_effect
+  preprocess <- preprocess2.fixest(
+    object = object,
+    clustid = clustid,
+    R = R_long,
+    param = param,
+    bootcluster = bootcluster,
+    fe = fe,
+    engine = engine
+  )
+  
+  # The enumeration check returns both a `full_enumeration` element and a
+  # `B` element; `B` is overwritten with the returned value.
+  enumerate <-
+    check_set_full_enumeration(
+      preprocess = preprocess,
+      heteroskedastic = heteroskedastic,
+      B = B,
+      type = type,
+      engine = engine
+    )
+  full_enumeration <- enumerate$full_enumeration
+  B <- enumerate$B
+  
+  N <- preprocess$N
+  k <- preprocess$k
+  # Number of distinct values in each element of `preprocess$clustid`.
+  G <-
+    vapply(preprocess$clustid, function(x) {
+      length(unique(x))
+    }, numeric(1))
+  vcov_sign <- preprocess$vcov_sign
+  
+  # NOTE(review): `ssc` is the object created with `ssc()` in the setup chunk,
+  # not the `boot_ssc` object created right after it -- confirm which one
+  # `boot_ssc_object` should receive.
+  small_sample_correction <-
+    get_ssc(
+      boot_ssc_object = ssc,
+      N = N,
+      k = k,
+      G = G,
+      vcov_sign = vcov_sign,
+      heteroskedastic = heteroskedastic
+    )
+  
+  # clustermin, clusteradj
+  
+  
+  clustid_dims <- preprocess$clustid_dims
+  # R*beta;
+  point_estimate <-
+    as.vector(object$coefficients[param] %*% preprocess$R0[param])
+  
+  # Placeholders; nothing in this chunk fills them.
+  boot_vcov <- boot_coef <- NULL
+
+```
+
+Make all objects returned by `preprocess2.fixest()` available in the global environment.
+
+```{r, warning = FALSE, message = FALSE}
+# Copy every element of `preprocess` into the global environment under its
+# own name; `res` collects the assigned values.
+res <- lapply(
+  names(preprocess),
+  function(element_name) {
+    assign(element_name, preprocess[[element_name]], envir = .GlobalEnv)
+  }
+)
+# Keep only the first element of `bootcluster`.
+bootcluster <- as.vector(bootcluster)[[1]]
+```
+
+## Pass all values to Python
+
+The R objects created above are accessed from Python through the `r` object provided by the `reticulate` package.
+
+```{python}
+import numpy as np
+
+X = np.array(r.X)
+y = np.array(r.Y)
+#clustid_df = r.clustid_df
+bootstrap_type = r.bootstrap_type
+N_G_bootcluster = r.N_G_bootcluster
+bootcluster = np.array(r.bootcluster)
+cluster = np.array(bootcluster)
+R = np.array(r.R0)
+impose_null = True
+B = int(r.boot_iter)
+ssc = int(r.small_sample_correction)
+
+pval_type = r.p_val_type
+
+type(X)
+X[0:10, 0:10]
+
+```
+develop ... 
+
+
+
diff --git a/readme.md b/readme.md
@@ -15,7 +15,8 @@ If you'd like to cooperate, either send us an
 Note: everything is still very much work in progress, and there are multiple errors in the code that I am aware of. Still, I believe that the implementation of the WCR11 is more or less correct.
 
 ```
-import wildboottest
+import wildboottest.wildboottest as wb
+import numpy as np
 import timeit 
 import time
 
@@ -26,29 +27,27 @@ X = np.random.normal(0, 1, N * k).reshape((N,k))
 beta = np.random.normal(0,1,k)
 beta[0] = 0.005
 u = np.random.normal(0,1,N)
-y = 1 + X @ beta + u
+Y = 1 + X @ beta + u
 cluster = np.random.choice(list(range(0,G)), N)
 bootcluster = cluster
 R = np.zeros(k)
 R[0] = 1
 B = 99999
 
 start_time = timeit.default_timer()
-wb = Wildboottest(X = X, Y = y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341)
-wb.get_scores(bootstrap_type = "11", impose_null = True)
-wb.get_numer()
-wb.get_denom()
-wb.numer
-wb.denom
-wb.get_tboot()
-wb.t_boot
-wb.get_vcov()
-wb.get_tstat()
-wb.get_pvalue(pval_type = "two-tailed")
+wcr = wb.Wildboottest(X = X, Y = Y, cluster = cluster, bootcluster = bootcluster, R = R, B = B, seed = 12341)
+wcr.get_scores(bootstrap_type = "11", impose_null = True)
+wcr.get_numer()
+wcr.get_denom()
+wcr.numer
+wcr.denom
+wcr.get_tboot()
+wcr.t_boot
+wcr.get_vcov()
+wcr.get_tstat()
+wcr.get_pvalue(pval_type = "two-tailed")
 print("estimation time:", timeit.default_timer() - start_time)
-# >>> 0.1981981981981982
-print("p value:", wb.pvalue)
+# >>> 0.9225496 seconds
+print("p value:", wcr.pvalue)
 # >>> p value: 0.15258152581525816
-
-
 ```
diff --git a/setup.py b/setup.py
@@ -0,0 +1,34 @@
+import pathlib
+from setuptools import setup, find_packages
+
+HERE = pathlib.Path(__file__).parent
+
+VERSION = '0.1.0'
+PACKAGE_NAME = 'wildboottest'
+AUTHOR = ['Alexander Fischer', 'Aleksandr Michuda']
+AUTHOR_EMAIL = ['[email protected]', '[email protected]']
+URL = 'https://github.com/s3alfisc/wildboottest'
+
+LICENSE = 'MIT'
+DESCRIPTION = 'Wild Cluster Bootstrap Inference for Linear Models in Python'
+LONG_DESCRIPTION = (HERE / "readme.md").read_text()
+LONG_DESC_TYPE = "text/markdown"
+
+INSTALL_REQUIRES = [
+      'numpy',
+      'pandas', 
+      'numba'
+]
+
+setup(name=PACKAGE_NAME,
+      version=VERSION,
+      description=DESCRIPTION,
+      long_description=LONG_DESCRIPTION,
+      long_description_content_type=LONG_DESC_TYPE,
+      author=AUTHOR,
+      license=LICENSE,
+      author_email=AUTHOR_EMAIL,
+      url=URL,
+      install_requires=INSTALL_REQUIRES,
+      packages=find_packages()
+      )
diff --git a/wildboottest/__init__.py b/wildboottest/__init__.py
@@ -0,0 +1 @@
+
diff --git a/wildboottest/__pycache__/__init__.cpython-36.pyc b/wildboottest/__pycache__/__init__.cpython-36.pyc
diff --git a/wildboottest/__pycache__/wildboottest.cpython-36.pyc b/wildboottest/__pycache__/wildboottest.cpython-36.pyc
diff --git a/src/benchmarks.py → wildboottest/benchmarks.py b/src/benchmarks.py → wildboottest/benchmarks.py
diff --git a/src/Wildboottest-method.py → wildboottest/wildboottest.py b/src/Wildboottest-method.py → wildboottest/wildboottest.py
@@ -30,8 +30,8 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None):
 
       if isinstance(X, pd.DataFrame):
         self.X = X.values
-      if isinstance(y, pd.DataFrame):
-        self.y = y.values
+      if isinstance(Y, pd.DataFrame):
+        self.Y = Y.values
       if isinstance(cluster, pd.DataFrame):
         clustid = cluster.unique()
         self.cluster = cluster.values
@@ -74,7 +74,7 @@ def __init__(self, X, Y, cluster, bootcluster, R, B, seed = None):
 
         # split X and Y by (boot)cluster
         X_g = X[np.where(bootcluster == g)]
-        Y_g = y[np.where(bootcluster == g)]
+        Y_g = Y[np.where(bootcluster == g)]
         tXgXg = np.transpose(X_g) @ X_g
         tXgyg = np.transpose(X_g) @ Y_g
         X_list.append(X_g)
@@ -228,7 +228,7 @@ def compute_denom(Cg, H, bootclustid, B, G, v, ssc):
   def get_tboot(self):
 
       t_boot = self.numer / np.sqrt(self.denom)
-      self.t_boot = t_boot[1:(B+1)] # drop first element - might be useful for comp. of
+      self.t_boot = t_boot[1:(self.B+1)] # drop first element - might be useful for comp. of
 
   def get_vcov(self):