Add some example actions for r/python/stata

opensafely · Mar 15, 2022 · b2bd262 · b2bd262
1 parent 4ff67ff
commit b2bd262
Show file tree

Hide file tree

Showing 6 changed files with 60 additions and 3 deletions.
diff --git a/analysis/example.do b/analysis/example.do
@@ -0,0 +1,13 @@
+// Stata cannot read compressed CSV files directly, so shell out to gunzip to get a plain CSV first.
+!gunzip output/input.csv.gz
+// NOTE(review): gunzip normally replaces the .gz with the plain file - confirm nothing later needs input.csv.gz.
+// Now import the decompressed CSV using `import delimited`.
+import delimited using output/input.csv
+
+
+// Your analysis code goes here.
+
+
+// Save all .dta outputs with `gzsave` and a .dta.gz extension;
+// in subsequent actions, load them with `gzuse`.
+gzsave output/stata.dta.gz
diff --git a/analysis/example.py b/analysis/example.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pyarrow.feather
+# Load the gzipped cohort CSV (presumably pandas infers gzip from the .gz suffix - confirm).
+df = pd.read_csv("output/input.csv.gz")
+
+
+# Feather files are compressed by default when written from Python.
+df.to_feather("output/python.feather")
+# Also write an explicitly uncompressed copy of the same data via pyarrow.
+pyarrow.feather.write_feather(df, "output/python.feather.raw", compression="uncompressed")
diff --git a/analysis/example.r b/analysis/example.r
@@ -0,0 +1,6 @@
+# Read the gzip-compressed .csv input file.
+df <- readr::read_csv("output/input.csv.gz")
+
+# Write two .feather outputs: one with arrow's default settings, one explicitly uncompressed.
+arrow::write_feather(df, "output/r.feather")
+arrow::write_feather(df, "output/r.feather.raw", compression = "uncompressed")
diff --git a/analysis/stata.do b/analysis/stata.do
diff --git a/analysis/study_definition.py b/analysis/study_definition.py
@@ -10,4 +10,11 @@
     population=patients.registered_with_one_practice_between(
         "2019-02-01", "2020-02-01"
     ),
+    age=patients.age_as_of(
+        "2019-09-01",
+        return_expectations={
+            "rate": "universal",
+            "int": {"distribution": "population_ages"},
+        },
+    ),
 )
diff --git a/project.yaml b/project.yaml
@@ -1,12 +1,33 @@
 version: '3.0'
 
 expectations:
-  population_size: 1000
+  population_size: 10000
 
 actions:
 
   generate_study_population:
-    run: cohortextractor:latest generate_cohort --study-definition study_definition
+    run: cohortextractor:latest generate_cohort --output-format csv.gz --study-definition study_definition
     outputs:
       highly_sensitive:
-        cohort: output/input.csv
+        cohort: output/input.csv.gz
+  # Python example: runs analysis/example.py after the cohort is generated; the glob matches both feather files it writes.
+  python_example:
+    run: python:latest analysis/example.py
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/python.feather*
+  # Stata example: runs analysis/example.do after the cohort is generated and keeps its gzipped .dta output.
+  stata_example:
+    run: stata-mp:latest analysis/example.do
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/stata.dta.gz
+  # R example: runs analysis/example.r after the cohort is generated; the glob matches both feather files it writes.
+  r_example:
+    run: r:latest analysis/example.r
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/r.feather*