Complete evaluation and add experiments
Jeff1995 committed Aug 24, 2023
1 parent 0e1a349 commit d318e41
Showing 17 changed files with 275 additions and 39 deletions.
6 changes: 6 additions & 0 deletions .flake8
@@ -0,0 +1,6 @@
[flake8]
exclude = .git
max-line-length = 90
per-file-ignores =
tests/test_*.py:F401,F403,F405,F811
__init__.py:F401,E402
7 changes: 6 additions & 1 deletion .gitignore
@@ -1,6 +1,11 @@
/conda/
/dist/
__pycache__/
.coverage
.ipynb_checkpoints/
.pytest_cache/
.snakemake/
.coverage*
.DS_Store
*.pyc
*.csv
*.ipynb
34 changes: 34 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,34 @@
{
"files.trimTrailingWhitespace": true,
"files.trimFinalNewlines": true,
"files.watcherExclude": {
"**/conda/**": true,
"**/__pycache__/**": true,
"**/.ipynb_checkpoints/**": true,
"**/.pytest_cache/**": true,
"**/.snakemake/**": true,
"**/.coverage*": true
},
"files.exclude": {
"**/conda/**": true,
"**/__pycache__/**": true,
"**/.ipynb_checkpoints/**": true,
"**/.pytest_cache/**": true,
"**/.snakemake/**": true,
"**/.coverage*": true
},
"editor.formatOnSave": true,
"python.defaultInterpreterPath": "./conda/bin/python",
"python.testing.pytestEnabled": true,
"python.analysis.typeCheckingMode": "off",
"python.analysis.diagnosticSeverityOverrides": {
"reportShadowedImports": "none"
},
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnPaste": false,
"editor.codeActionsOnSave": {
"source.organizeImports": true
}
}
}
1 change: 0 additions & 1 deletion LICENSE
@@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

50 changes: 43 additions & 7 deletions README.md
@@ -16,27 +16,35 @@ Then install necessary packages in this environment.

### Export environment to file

Exporting environment to `conda.yaml` enables git version tracking and syncing across machines:
Exporting environment to `conda.yaml` enables git version tracking and syncing
across machines:

```sh
mamba env export -p ./conda --no-build > conda.yaml
```

> **Note**
> We may also chooose to remove the `name:` and `prefix:` lines from the exported file to redact the trace of absolute paths:
> `mamba env export -p ./conda --no-build | grep -Ev '^(name|prefix):' > conda.yaml`
>
> We may also choose to remove the `name:` and `prefix:` lines from the exported
> file, since they record machine-specific absolute paths, as well as our own
> "mypackage" entry, which should instead be installed in development mode:
>
> `mamba env export -p ./conda --no-build | grep -Ev '^name|^prefix|mypackage' >
> conda.yaml`
### Clone the environment on another machine

If we wish to use computing resource on another machine, we may clone the repo and rebuild the environment with the `conda.yaml` file.
If we wish to use computing resources on another machine, we may clone the repo
and rebuild the environment with the `conda.yaml` file.

```sh
mamba env create -p ./conda -f conda.yaml
```

### Sync environment change across machines

If we changed the environment on one machine, we may update the `conda.yaml` file, sync it over git, and use it update the environment on another machine.
If we change the environment on one machine, we may update the `conda.yaml`
file, sync it over git, and use it to update the environment on another machine.

```sh
mamba env update -p ./conda -f conda.yaml --prune
@@ -47,6 +55,34 @@ mamba env update -p ./conda -f conda.yaml --prune
Use the `pyproject.toml` file to configure the build and install process.

- In this demo, we are using the build tool `flit`.
- Use `flit install -s` to install our own package in symlink mode, so changes in our package are immediately effective without reinstalling.
- To make a formal release of the package, use `flit build` and `twine upload`, or this [github action](https://github.com/pypa/gh-action-pypi-publish).
- Use `flit install -s` to install our own package in symlink mode, so changes
in our package are immediately effective without reinstalling.
- To make a formal release of the package, use `flit build` and `twine upload`,
or this [github action](https://github.com/pypa/gh-action-pypi-publish).


## Data preparation

Go to the `data/download` directory, download each dataset as described in its
README file, and run the preprocessing scripts. These produce a
`data/processed` directory containing standardized data files.

## Run evaluation

Go to the `evaluation` directory and run the following command (`-p` prints the
shell commands, `-r` reports why each job runs, `-k` keeps independent jobs going
after a failure, and `-j4` allows up to four parallel jobs):

```sh
snakemake -prk -j4
```

## Run experiments

Go to each of the `experiments` subdirectories and convert the percent scripts
to Jupyter notebooks with the following command:

```sh
jupytext --to notebook *.py
```

Then play with the notebooks in Jupyter Lab; they will stay in sync with the
percent scripts automatically.
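
A scripted alternative to the CLI conversion, in case it is useful (a minimal
sketch using jupytext's Python API; the glob pattern and file layout are
assumptions, not part of the repo):

```python
from pathlib import Path

import jupytext

# Convert every percent script in the current experiment directory into a notebook.
for script in Path(".").glob("*.py"):
    notebook = jupytext.read(script)                        # parse the py:percent script
    jupytext.write(notebook, script.with_suffix(".ipynb"))  # write the .ipynb next to it
```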
7 changes: 6 additions & 1 deletion conda.yaml
@@ -203,4 +203,9 @@ dependencies:
- zeromq=4.3.4
- zipp=3.16.2
- pip:
- mypackage==0.1.0
- cattrs==23.1.2
- esbonio==0.16.1
- lsprotocol==2023.0.0a2
- pygls==1.0.2
- pyspellchecker==0.7.2
- typeguard==3.0.2
27 changes: 19 additions & 8 deletions evaluation/workflow/Snakefile
@@ -1,16 +1,19 @@
configfile: "config/config.yaml"


rule all:
input:
"results/summary.pdf"
"results/summary.pdf",


rule plot:
input:
"results/summary.csv",
output:
"results/summary.pdf",
script:
"scripts/plot.py"
"scripts/plot.R"


rule summarize:
input:
@@ -24,6 +27,7 @@ rule summarize:
script:
"scripts/summarize.py"


rule compute_metrics:
input:
true="results/{dataset}/test_y.csv",
@@ -33,25 +37,31 @@ rule compute_metrics:
log:
"results/{dataset}/{method}/compute_metrics.log",
shell:
"python scripts/compute_metrics.py "
"python workflow/scripts/compute_metrics.py "
"--true {input.true} "
"--pred {input.pred} "
"--output {output} "
"&> {log}"


rule random_pred:
input:
"results/{dataset}/train_x.csv",
train_x="results/{dataset}/train_x.csv",
train_y="results/{dataset}/train_y.csv",
test_x="results/{dataset}/test_x.csv",
output:
"results/{dataset}/random_pred/test_y.csv",
log:
"results/{dataset}/random_pred/run.log",
shell:
"python scripts/random_pred.py "
"-i {input} "
"-o {output} "
"python workflow/scripts/random_pred.py "
"--train-x {input.train_x} "
"--train-y {input.train_y} "
"--test-x {input.test_x} "
"--test-y {output} "
"&> {log}"


rule mypackage:
input:
train_x="results/{dataset}/train_x.csv",
@@ -69,6 +79,7 @@ rule mypackage:
"--test-y {output} "
"&> {log}"


rule split_train_test:
input:
x="../data/processed/{dataset}/x.csv",
@@ -81,7 +92,7 @@ rule split_train_test:
log:
"results/{dataset}/split_train_test.log",
shell:
"python scripts/split_train_test.py "
"python workflow/scripts/split_train_test.py "
"--input-x {input.x} "
"--input-y {input.y} "
"--output-train-x {output.train_x} "
33 changes: 33 additions & 0 deletions evaluation/workflow/scripts/compute_metrics.py
@@ -0,0 +1,33 @@
from argparse import ArgumentParser, Namespace
from pathlib import Path

import pandas as pd
import yaml
from sklearn.metrics import mean_squared_error, r2_score


def parse_args() -> Namespace:
parser = ArgumentParser()
parser.add_argument("--true", type=Path, required=True)
parser.add_argument("--pred", type=Path, required=True)
parser.add_argument("--output", type=Path, required=True)
return parser.parse_args()


def main(args: Namespace) -> None:
true = pd.read_csv(args.true, index_col=0)
pred = pd.read_csv(args.pred, index_col=0)

args.output.parent.mkdir(parents=True, exist_ok=True)
with args.output.open("w") as f:
yaml.dump(
{
"r2": r2_score(true, pred).item(),
"mse": mean_squared_error(true, pred).item(),
},
f,
)


if __name__ == "__main__":
main(parse_args())
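
A side note on the `.item()` calls (my reading, not something stated in the
commit): the sklearn metrics here presumably return numpy scalars, and
converting them to plain Python floats keeps the YAML output clean, since
PyYAML has no representer for `numpy.float64`. A small illustration of the
difference:

```python
import numpy as np
import yaml
from yaml.representer import RepresenterError

value = np.float64(0.42)  # stand-in for what r2_score / mean_squared_error return

try:
    yaml.safe_dump({"r2": value})            # the safe dumper rejects numpy scalars outright
except RepresenterError as err:
    print("cannot dump numpy scalar:", err)

print(yaml.safe_dump({"r2": value.item()}))  # plain float -> "r2: 0.42"
```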
24 changes: 24 additions & 0 deletions evaluation/workflow/scripts/plot.R
@@ -0,0 +1,24 @@
suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(reshape2)
})


main <- function(snakemake) {
df <- read.csv(snakemake@input[[1]]) %>%
transmute(Method = method, Dataset = dataset, MSE = mse, R2 = r2) %>%
melt(
measure.vars = c("MSE", "R2"),
value.name = "Score",
variable.name = "Metric",
)
gp <- ggplot(df, aes(x = Method, y = Score, fill = Method)) +
geom_bar(stat = "identity") +
facet_grid(Metric ~ Dataset, labeller = label_both, scales = "free_y") +
theme_bw()
ggsave(snakemake@output[[1]], gp, width = 7, height = 7)
}


main(snakemake)
30 changes: 30 additions & 0 deletions evaluation/workflow/scripts/random_pred.py
@@ -0,0 +1,30 @@
from argparse import ArgumentParser, Namespace
from pathlib import Path

import numpy as np
import pandas as pd


def parse_args() -> Namespace:
parser = ArgumentParser()
parser.add_argument("--train-x", type=Path, required=True)
parser.add_argument("--train-y", type=Path, required=True)
parser.add_argument("--test-x", type=Path, required=True)
parser.add_argument("--test-y", type=Path, required=True)
return parser.parse_args()


def main(args: Namespace) -> None:
train_y = pd.read_csv(args.train_y, index_col=0)
test_x = pd.read_csv(args.test_x, index_col=0)
mean, std = train_y.mean().iloc[0], train_y.std().iloc[0]
test_y = pd.DataFrame(
np.random.randn(test_x.shape[0]) * std + mean,
index=test_x.index,
)
args.test_y.parent.mkdir(parents=True, exist_ok=True)
test_y.to_csv(args.test_y)


if __name__ == "__main__":
main(parse_args())
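
For intuition about where this baseline should land (an editorial aside, not
part of the commit): the predictions are independent of the targets but share
their mean and standard deviation, so the expected MSE is roughly twice the
target variance and the expected R² is roughly -1. A quick check with
synthetic numbers:

```python
import numpy as np

rng = np.random.default_rng(0)
y_true = rng.normal(loc=3.0, scale=2.0, size=100_000)  # stand-in for held-out targets
y_pred = rng.normal(loc=3.0, scale=2.0, size=100_000)  # random predictions, matching moments

mse = np.mean((y_true - y_pred) ** 2)  # ~ 2 * var(y_true) = 8
r2 = 1 - mse / np.var(y_true)          # ~ -1 for this kind of baseline
print(f"mse={mse:.2f}, r2={r2:.2f}")
```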
37 changes: 37 additions & 0 deletions evaluation/workflow/scripts/split_train_test.py
@@ -0,0 +1,37 @@
from argparse import ArgumentParser, Namespace
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split


def parse_args() -> Namespace:
parser = ArgumentParser()
parser.add_argument("--input-x", type=Path, required=True)
parser.add_argument("--input-y", type=Path, required=True)
parser.add_argument("--output-train-x", type=Path, required=True)
parser.add_argument("--output-train-y", type=Path, required=True)
parser.add_argument("--output-test-x", type=Path, required=True)
parser.add_argument("--output-test-y", type=Path, required=True)
return parser.parse_args()


def main(args: Namespace) -> None:
x = pd.read_csv(args.input_x, index_col=0)
y = pd.read_csv(args.input_y, index_col=0)

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)

args.output_train_x.parent.mkdir(parents=True, exist_ok=True)
args.output_train_y.parent.mkdir(parents=True, exist_ok=True)
args.output_test_x.parent.mkdir(parents=True, exist_ok=True)
args.output_test_y.parent.mkdir(parents=True, exist_ok=True)

train_x.to_csv(args.output_train_x)
train_y.to_csv(args.output_train_y)
test_x.to_csv(args.output_test_x)
test_y.to_csv(args.output_test_y)


if __name__ == "__main__":
main(parse_args())
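
One behaviour worth flagging (an observation, not a change in this commit):
`train_test_split` is called without `random_state`, so rerunning the
`split_train_test` rule re-draws the split. If reproducible splits are wanted,
a fixed seed would do it; a self-contained sketch:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy data standing in for the real x.csv / y.csv tables.
x = pd.DataFrame({"feature": range(10)})
y = pd.DataFrame({"target": range(10)})

# random_state pins the shuffle, so the same rows land in the test set every run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=0)
print(sorted(test_x.index))  # identical across reruns
```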
11 changes: 3 additions & 8 deletions evaluation/workflow/scripts/summarize.py
@@ -3,18 +3,13 @@
from parse import parse
from snakemake.script import Snakemake


PATTERN = "results/{dataset}/{method}/metrics.yaml"


def main(snakemake: Snakemake) -> None:
df = []
for item in set(snakemake.input):
entry = parse(PATTERN, item)
if entry:
conf = entry.named
else:
continue
for item in snakemake.input:
conf = parse(PATTERN, item).named
with open(item) as f:
content = yaml.load(f, Loader=yaml.Loader)
df.append({**conf, **content})
@@ -24,4 +19,4 @@ def main(snakemake: Snakemake) -> None:


if __name__ == "__main__":
main(snakemake)
main(snakemake) # noqa: F821 # pyright: ignore[reportUndefinedVariable]
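
Context for the simplified loop above (editorial note): `parse()` returns
`None` when a path does not match `PATTERN`, so dropping the old guard assumes
every input of the `summarize` rule follows the
`results/{dataset}/{method}/metrics.yaml` layout — presumably true here, since
those inputs are generated from that very pattern. A tiny illustration of the
two cases:

```python
from parse import parse

PATTERN = "results/{dataset}/{method}/metrics.yaml"

hit = parse(PATTERN, "results/some_dataset/random_pred/metrics.yaml")
print(hit.named)  # {'dataset': 'some_dataset', 'method': 'random_pred'}

miss = parse(PATTERN, "results/summary.csv")
print(miss)       # None -- calling .named on this would raise AttributeError
```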