From d318e415850133567d86ce87a6b65e04e3a00168 Mon Sep 17 00:00:00 2001
From: Zhi-Jie Cao <caozj@mail.cbi.pku.edu.cn>
Date: Thu, 24 Aug 2023 20:24:47 +0800
Subject: [PATCH] Complete evaluation and add experiments

---
 .flake8                                       |  6 +++
 .gitignore                                    |  7 ++-
 .vscode/settings.json                         | 34 +++++++++++++
 LICENSE                                       |  1 -
 README.md                                     | 50 ++++++++++++++++---
 conda.yaml                                    |  7 ++-
 evaluation/workflow/Snakefile                 | 27 +++++++---
 .../workflow/scripts/compute_metrics.py       | 33 ++++++++++++
 evaluation/workflow/scripts/plot.R            | 24 +++++++++
 evaluation/workflow/scripts/random_pred.py    | 30 +++++++++++
 .../workflow/scripts/split_train_test.py      | 37 ++++++++++++++
 evaluation/workflow/scripts/summarize.py      | 11 ++--
 experiments/visualize_data/visualize_data.py  | 24 +++++++++
 mypackage/__init__.py                         |  4 +-
 pyproject.toml                                | 14 ++----
 tests/test_main.py                            |  2 +
 tests/test_model.py                           |  3 +-
 17 files changed, 275 insertions(+), 39 deletions(-)
 create mode 100644 .flake8
 create mode 100644 .vscode/settings.json
 create mode 100644 evaluation/workflow/scripts/compute_metrics.py
 create mode 100644 evaluation/workflow/scripts/plot.R
 create mode 100644 evaluation/workflow/scripts/random_pred.py
 create mode 100644 evaluation/workflow/scripts/split_train_test.py
 create mode 100644 experiments/visualize_data/visualize_data.py

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..5660bec
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,6 @@
+[flake8]
+exclude = .git
+max-line-length = 90
+per-file-ignores =
+    tests/test_*.py:F401,F403,F405,F811
+    __init__.py:F401,E402
diff --git a/.gitignore b/.gitignore
index 7ff9d6b..f3f4117 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,11 @@
 /conda/
 /dist/
 __pycache__/
-.coverage
+.ipynb_checkpoints/
+.pytest_cache/
+.snakemake/
+.coverage*
+.DS_Store
 *.pyc
 *.csv
+*.ipynb
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..fd7e415
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,34 @@
+{
+    "files.trimTrailingWhitespace": true,
+    "files.trimFinalNewlines": true,
+    "files.watcherExclude": {
+        "**/conda/**": true,
+        "**/__pycache__/**": true,
+        "**/.ipynb_checkpoints/**": true,
+        "**/.pytest_cache/**": true,
+        "**/.snakemake/**": true,
+        "**/.coverage*": true
+    },
+    "files.exclude": {
+        "**/conda/**": true,
+        "**/__pycache__/**": true,
+        "**/.ipynb_checkpoints/**": true,
+        "**/.pytest_cache/**": true,
+        "**/.snakemake/**": true,
+        "**/.coverage*": true
+    },
+    "editor.formatOnSave": true,
+    "python.defaultInterpreterPath": "./conda/bin/python",
+    "python.testing.pytestEnabled": true,
+    "python.analysis.typeCheckingMode": "off",
+    "python.analysis.diagnosticSeverityOverrides": {
+        "reportShadowedImports": "none"
+    },
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter",
+        "editor.formatOnPaste": false,
+        "editor.codeActionsOnSave": {
+            "source.organizeImports": true
+        }
+    }
+}
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 3e2e370..1c06942 100644
--- a/LICENSE
+++ b/LICENSE
@@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
-
diff --git a/README.md b/README.md
index 9117cc6..1028f25 100644
--- a/README.md
+++ b/README.md
@@ -16,19 +16,26 @@ Then install necessary packages in this environment.
 
 ### Export environment to file
 
-Exporting environment to `conda.yaml` enables git version tracking and syncing across machines:
+Exporting environment to `conda.yaml` enables git version tracking and syncing
+across machines:
 
 ```sh
 mamba env export -p ./conda --no-build > conda.yaml
 ```
 
 > **Note**
-> We may also chooose to remove the `name:` and `prefix:` lines from the exported file to redact the trace of absolute paths:
-> `mamba env export -p ./conda --no-build | grep -Ev '^(name|prefix):' > conda.yaml`
+>
+> We may also choose to remove the `name:` and `prefix:` lines from the exported
+> file to redact the trace of absolute paths, as well as to remove our
+> "mypackage" entry which should be installed as a development version:
+>
+> `mamba env export -p ./conda --no-build | grep -Ev '^name|^prefix|mypackage' >
+> conda.yaml`
 
 ### Clone the environment on another machine
 
-If we wish to use computing resource on another machine, we may clone the repo and rebuild the environment with the `conda.yaml` file.
+If we wish to use computing resources on another machine, we may clone the repo
+and rebuild the environment with the `conda.yaml` file.
 
 ```sh
 mamba env create -p ./conda -f conda.yaml
@@ -36,7 +43,8 @@ mamba env create -p ./conda -f conda.yaml
 
 ### Sync environment change across machines
 
-If we changed the environment on one machine, we may update the `conda.yaml` file, sync it over git, and use it update the environment on another machine.
+If we changed the environment on one machine, we may update the `conda.yaml`
+file, sync it over git, and use it to update the environment on another machine.
 
 ```sh
 mamba env update -p ./conda -f conda.yaml --prune
@@ -47,6 +55,34 @@ mamba env update -p ./conda -f conda.yaml --prune
 Use the `pyproject.toml` file to instruct the build and install process.
 
 - In this demo, we are using the build tool `flit`.
-- Use `flit install -s` to install our own package in symlink mode, so changes in our package are immediately effective without reinstalling.
-- To make a formal release of the package, use `flit build` and `twine upload`, or this [github action](https://github.com/pypa/gh-action-pypi-publish).
+- Use `flit install -s` to install our own package in symlink mode, so changes
+  in our package are immediately effective without reinstalling.
+- To make a formal release of the package, use `flit build` and `twine upload`,
+  or this [github action](https://github.com/pypa/gh-action-pypi-publish).
 
+
+## Data preparation
+
+Go to the `data/download` directory, download each dataset as described in its
+README file, and run the preprocessing scripts. These will produce a
+`data/processed` directory containing standardized data files.
+
+## Run evaluation
+
+Go to the `evaluation` directory, and run the following command:
+
+```sh
+snakemake -prk -j4
+```
+
+## Run experiments
+
+Go to the `experiments` subdirectories, and convert percent scripts to Jupyter
+notebooks with the following command:
+
+```sh
+jupytext --to notebook *.py
+```
+
+Then play with the notebooks in Jupyter Lab; they will stay in sync with the
+percent script automatically.
diff --git a/conda.yaml b/conda.yaml
index 3cfa4f0..0aeb9b7 100644
--- a/conda.yaml
+++ b/conda.yaml
@@ -203,4 +203,9 @@ dependencies:
   - zeromq=4.3.4
   - zipp=3.16.2
   - pip:
-      - mypackage==0.1.0
+      - cattrs==23.1.2
+      - esbonio==0.16.1
+      - lsprotocol==2023.0.0a2
+      - pygls==1.0.2
+      - pyspellchecker==0.7.2
+      - typeguard==3.0.2
diff --git a/evaluation/workflow/Snakefile b/evaluation/workflow/Snakefile
index 354c930..8e16a60 100644
--- a/evaluation/workflow/Snakefile
+++ b/evaluation/workflow/Snakefile
@@ -1,8 +1,10 @@
 configfile: "config/config.yaml"
 
+
 rule all:
     input:
-        "results/summary.pdf"
+        "results/summary.pdf",
+
 
 rule plot:
     input:
@@ -10,7 +12,8 @@ rule plot:
     output:
         "results/summary.pdf",
     script:
-        "scripts/plot.py"
+        "scripts/plot.R"
+
 
 rule summarize:
     input:
@@ -24,6 +27,7 @@ rule summarize:
     script:
         "scripts/summarize.py"
 
+
 rule compute_metrics:
     input:
         true="results/{dataset}/test_y.csv",
@@ -33,25 +37,31 @@ rule compute_metrics:
     log:
         "results/{dataset}/{method}/compute_metrics.log",
     shell:
-        "python scripts/compute_metrics.py "
+        "python workflow/scripts/compute_metrics.py "
         "--true {input.true} "
         "--pred {input.pred} "
         "--output {output} "
         "&> {log}"
 
+
 rule random_pred:
     input:
-        "results/{dataset}/train_x.csv",
+        train_x="results/{dataset}/train_x.csv",
+        train_y="results/{dataset}/train_y.csv",
+        test_x="results/{dataset}/test_x.csv",
     output:
         "results/{dataset}/random_pred/test_y.csv",
     log:
         "results/{dataset}/random_pred/run.log",
     shell:
-        "python scripts/random_pred.py "
-        "-i {input} "
-        "-o {output} "
+        "python workflow/scripts/random_pred.py "
+        "--train-x {input.train_x} "
+        "--train-y {input.train_y} "
+        "--test-x {input.test_x} "
+        "--test-y {output} "
         "&> {log}"
 
+
 rule mypackage:
     input:
         train_x="results/{dataset}/train_x.csv",
@@ -69,6 +79,7 @@ rule mypackage:
         "--test-y {output} "
         "&> {log}"
 
+
 rule split_train_test:
     input:
         x="../data/processed/{dataset}/x.csv",
@@ -81,7 +92,7 @@ rule split_train_test:
     log:
         "results/{dataset}/split_train_test.log",
     shell:
-        "python scripts/split_train_test.py "
+        "python workflow/scripts/split_train_test.py "
         "--input-x {input.x} "
         "--input-y {input.y} "
         "--output-train-x {output.train_x} "
diff --git a/evaluation/workflow/scripts/compute_metrics.py b/evaluation/workflow/scripts/compute_metrics.py
new file mode 100644
index 0000000..46a8398
--- /dev/null
+++ b/evaluation/workflow/scripts/compute_metrics.py
@@ -0,0 +1,33 @@
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+
+import pandas as pd
+import yaml
+from sklearn.metrics import mean_squared_error, r2_score
+
+
+def parse_args() -> Namespace:
+    parser = ArgumentParser()
+    parser.add_argument("--true", type=Path, required=True)
+    parser.add_argument("--pred", type=Path, required=True)
+    parser.add_argument("--output", type=Path, required=True)
+    return parser.parse_args()
+
+
+def main(args: Namespace) -> None:
+    """Score predictions against ground truth and write R2/MSE as YAML."""
+    true = pd.read_csv(args.true, index_col=0)
+    pred = pd.read_csv(args.pred, index_col=0)
+    # float() accepts both NumPy scalars and plain Python floats, whereas
+    # .item() exists only on the former; yaml then emits plain numbers.
+    metrics = {
+        "r2": float(r2_score(true, pred)),
+        "mse": float(mean_squared_error(true, pred)),
+    }
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    with args.output.open("w") as f:
+        yaml.dump(metrics, f)
+
+
+if __name__ == "__main__":
+    main(parse_args())
diff --git a/evaluation/workflow/scripts/plot.R b/evaluation/workflow/scripts/plot.R
new file mode 100644
index 0000000..e8f528a
--- /dev/null
+++ b/evaluation/workflow/scripts/plot.R
@@ -0,0 +1,24 @@
+suppressPackageStartupMessages({
+    library(dplyr)
+    library(ggplot2)
+    library(reshape2)
+})
+
+
+main <- function(snakemake) {
+    df <- read.csv(snakemake@input[[1]]) %>%
+        transmute(Method = method, Dataset = dataset, MSE = mse, R2 = r2) %>%
+        melt(
+            value.vars = c("MSE", "R2"),
+            value.name="Score",
+            variable.name="Metric",
+        )
+    gp <- ggplot(df, aes(x = Method, y = Score, fill = Method)) +
+        geom_bar(stat = "identity") +
+        facet_grid(Metric ~ Dataset, labeller = label_both, scales = "free_y") +
+        theme_bw()
+    ggsave(snakemake@output[[1]], gp, width = 7, height = 7)
+}
+
+
+main(snakemake)
diff --git a/evaluation/workflow/scripts/random_pred.py b/evaluation/workflow/scripts/random_pred.py
new file mode 100644
index 0000000..d7a91c4
--- /dev/null
+++ b/evaluation/workflow/scripts/random_pred.py
@@ -0,0 +1,30 @@
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+
+def parse_args() -> Namespace:
+    parser = ArgumentParser()
+    parser.add_argument("--train-x", type=Path, required=True)
+    parser.add_argument("--train-y", type=Path, required=True)
+    parser.add_argument("--test-x", type=Path, required=True)
+    parser.add_argument("--test-y", type=Path, required=True)
+    return parser.parse_args()
+
+
+def main(args: Namespace) -> None:
+    train_y = pd.read_csv(args.train_y, index_col=0)
+    test_x = pd.read_csv(args.test_x, index_col=0)
+    mean, std = train_y.mean().iloc[0], train_y.std().iloc[0]
+    test_y = pd.DataFrame(
+        np.random.randn(test_x.shape[0]) * std + mean,
+        index=test_x.index,
+    )
+    args.test_y.parent.mkdir(parents=True, exist_ok=True)
+    test_y.to_csv(args.test_y)
+
+
+if __name__ == "__main__":
+    main(parse_args())
diff --git a/evaluation/workflow/scripts/split_train_test.py b/evaluation/workflow/scripts/split_train_test.py
new file mode 100644
index 0000000..e7d1115
--- /dev/null
+++ b/evaluation/workflow/scripts/split_train_test.py
@@ -0,0 +1,37 @@
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+def parse_args() -> Namespace:
+    parser = ArgumentParser()
+    parser.add_argument("--input-x", type=Path, required=True)
+    parser.add_argument("--input-y", type=Path, required=True)
+    parser.add_argument("--output-train-x", type=Path, required=True)
+    parser.add_argument("--output-train-y", type=Path, required=True)
+    parser.add_argument("--output-test-x", type=Path, required=True)
+    parser.add_argument("--output-test-y", type=Path, required=True)
+    return parser.parse_args()
+
+
+def main(args: Namespace) -> None:
+    x = pd.read_csv(args.input_x, index_col=0)
+    y = pd.read_csv(args.input_y, index_col=0)
+
+    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
+
+    args.output_train_x.parent.mkdir(parents=True, exist_ok=True)
+    args.output_train_y.parent.mkdir(parents=True, exist_ok=True)
+    args.output_test_x.parent.mkdir(parents=True, exist_ok=True)
+    args.output_test_y.parent.mkdir(parents=True, exist_ok=True)
+
+    train_x.to_csv(args.output_train_x)
+    train_y.to_csv(args.output_train_y)
+    test_x.to_csv(args.output_test_x)
+    test_y.to_csv(args.output_test_y)
+
+
+if __name__ == "__main__":
+    main(parse_args())
diff --git a/evaluation/workflow/scripts/summarize.py b/evaluation/workflow/scripts/summarize.py
index a04afd1..9364fb5 100644
--- a/evaluation/workflow/scripts/summarize.py
+++ b/evaluation/workflow/scripts/summarize.py
@@ -3,18 +3,13 @@
 from parse import parse
 from snakemake.script import Snakemake
 
-
 PATTERN = "results/{dataset}/{method}/metrics.yaml"
 
 
 def main(snakemake: Snakemake) -> None:
     df = []
-    for item in set(snakemake.input):
-        entry = parse(PATTERN, item)
-        if entry:
-            conf = entry.named
-        else:
-            continue
+    for item in snakemake.input:
+        conf = parse(PATTERN, item).named
         with open(item) as f:
             content = yaml.load(f, Loader=yaml.Loader)
         df.append({**conf, **content})
@@ -24,4 +19,4 @@ def main(snakemake: Snakemake) -> None:
 
 
 if __name__ == "__main__":
-    main(snakemake)
+    main(snakemake)  # noqa: F821  # pyright: ignore[reportUndefinedVariable]
diff --git a/experiments/visualize_data/visualize_data.py b/experiments/visualize_data/visualize_data.py
new file mode 100644
index 0000000..87583b0
--- /dev/null
+++ b/experiments/visualize_data/visualize_data.py
@@ -0,0 +1,24 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.15.0
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %%
+import pandas as pd
+
+# %%
+mtcars = pd.read_csv("../../data/processed/mtcars/x.csv")
+mtcars.head()
+
+# %%
+mtcars.boxplot()
diff --git a/mypackage/__init__.py b/mypackage/__init__.py
index b4bf34c..fdc8cda 100644
--- a/mypackage/__init__.py
+++ b/mypackage/__init__.py
@@ -7,7 +7,7 @@ def version(name):
         return get_distribution(name).version
 
 
+from .model import fit, predict
+
 name = "mypackage"
 version = version(name)
-
-from .model import fit, predict
diff --git a/pyproject.toml b/pyproject.toml
index aaa7fff..3c180bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ build-backend = "flit_core.buildapi"
 requires = ["flit_core >=3.2,<4"]
 
 [project]
-authors = [{ name = "Zhi-Jie Cao", email = "caozj@mail.cbi.pku.edu.cn"  }]
+authors = [{ name = "Zhi-Jie Cao", email = "caozj@mail.cbi.pku.edu.cn" }]
 classifiers = [
   "Intended Audience :: Science/Research",
   "License :: OSI Approved :: MIT License",
@@ -16,16 +16,10 @@ classifiers = [
   "Programming Language :: Python :: 3.11",
   "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]
-dependencies = [
-  "numpy",
-  "pandas",
-  "scikit-learn",
-]
+dependencies = ["numpy", "pandas", "scikit-learn"]
 description = "Project management workshop demo"
-keywords = [
-  "bioinformatics",
-]
-license = { file = "LICENSE"  }
+keywords = ["bioinformatics"]
+license = { file = "LICENSE" }
 name = "mypackage"
 readme = "README.md"
 requires-python = ">=3.9"
diff --git a/tests/test_main.py b/tests/test_main.py
index 9601c0c..8d750d6 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,6 +1,8 @@
 from subprocess import run
+
 from .fixtures import *
 
+
 def test_main(train_x, train_y, test_x, tmp_path):
     train_x.to_csv(tmp_path / "train_x.csv")
     train_y.to_csv(tmp_path / "train_y.csv")
diff --git a/tests/test_model.py b/tests/test_model.py
index 5870e76..ba88973 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -1,9 +1,10 @@
 from mypackage.model import *
+
 from .fixtures import *
 
 
 def test_fit(train_x, train_y):  # Smoke test
-    model = fit(train_x, train_y)
+    fit(train_x, train_y)
 
 
 def test_predict(train_x, train_y, test_x, test_y):