Complete evaluation and add experiments
Jeff1995 committed Aug 24, 2023
1 parent 0e1a349 commit d318e41
Showing 17 changed files with 275 additions and 39 deletions.
6 changes: 6 additions & 0 deletions .flake8
@@ -0,0 +1,6 @@
[flake8]
exclude = .git
max-line-length = 90
per-file-ignores =
tests/test_*.py:F401,F403,F405,F811
__init__.py:F401,E402
7 changes: 6 additions & 1 deletion .gitignore
@@ -1,6 +1,11 @@
/conda/
/dist/
__pycache__/
.coverage
.ipynb_checkpoints/
.pytest_cache/
.snakemake/
.coverage*
.DS_Store
*.pyc
*.csv
*.ipynb
34 changes: 34 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,34 @@
{
"files.trimTrailingWhitespace": true,
"files.trimFinalNewlines": true,
"files.watcherExclude": {
"**/conda/**": true,
"**/__pycache__/**": true,
"**/.ipynb_checkpoints/**": true,
"**/.pytest_cache/**": true,
"**/.snakemake/**": true,
"**/.coverage*": true
},
"files.exclude": {
"**/conda/**": true,
"**/__pycache__/**": true,
"**/.ipynb_checkpoints/**": true,
"**/.pytest_cache/**": true,
"**/.snakemake/**": true,
"**/.coverage*": true
},
"editor.formatOnSave": true,
"python.defaultInterpreterPath": "./conda/bin/python",
"python.testing.pytestEnabled": true,
"python.analysis.typeCheckingMode": "off",
"python.analysis.diagnosticSeverityOverrides": {
"reportShadowedImports": "none"
},
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnPaste": false,
"editor.codeActionsOnSave": {
"source.organizeImports": true
}
}
}
1 change: 0 additions & 1 deletion LICENSE
@@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

50 changes: 43 additions & 7 deletions README.md
@@ -16,27 +16,35 @@ Then install necessary packages in this environment.

### Export environment to file

Exporting environment to `conda.yaml` enables git version tracking and syncing across machines:
Exporting environment to `conda.yaml` enables git version tracking and syncing
across machines:

```sh
mamba env export -p ./conda --no-build > conda.yaml
```

> **Note**
> We may also chooose to remove the `name:` and `prefix:` lines from the exported file to redact the trace of absolute paths:
> `mamba env export -p ./conda --no-build | grep -Ev '^(name|prefix):' > conda.yaml`
>
> We may also choose to remove the `name:` and `prefix:` lines from the exported
> file, since they record machine-specific absolute paths, as well as our own
> "mypackage" entry, which should instead be installed in development mode:
>
> `mamba env export -p ./conda --no-build | grep -Ev '^name|^prefix|mypackage' >
> conda.yaml`
### Clone the environment on another machine

If we wish to use computing resource on another machine, we may clone the repo and rebuild the environment with the `conda.yaml` file.
If we wish to use computing resources on another machine, we may clone the repo
and rebuild the environment with the `conda.yaml` file.

```sh
mamba env create -p ./conda -f conda.yaml
```

### Sync environment change across machines

If we changed the environment on one machine, we may update the `conda.yaml` file, sync it over git, and use it update the environment on another machine.
If we change the environment on one machine, we may update the `conda.yaml`
file, sync it over git, and use it to update the environment on another machine.

```sh
mamba env update -p ./conda -f conda.yaml --prune
@@ -47,6 +55,34 @@ mamba env update -p ./conda -f conda.yaml --prune
Use the `pyproject.toml` file to configure the build and install process.

- In this demo, we are using the build tool `flit`.
- Use `flit install -s` to install our own package in symlink mode, so changes in our package are immediately effective without reinstalling.
- To make a formal release of the package, use `flit build` and `twine upload`, or this [github action](https://github.com/pypa/gh-action-pypi-publish).
- Use `flit install -s` to install our own package in symlink mode, so changes
in our package are immediately effective without reinstalling.
- To make a formal release of the package, use `flit build` and `twine upload`,
or this [github action](https://github.com/pypa/gh-action-pypi-publish).


## Data preparation

Go to the `data/download` directory, download each dataset as described in its
README file, and run the preprocessing scripts. These produce a
`data/processed` directory containing standardized data files.

## Run evaluation

Go to the `evaluation` directory and run the following command (`-p` prints the
shell commands, `-r` reports why each job runs, `-k` keeps independent jobs going
after a failure, and `-j4` allows up to four parallel jobs):

```sh
snakemake -prk -j4
```

## Run experiments

Go to each of the `experiments` subdirectories and convert the percent scripts
to Jupyter notebooks with the following command:

```sh
jupytext --to notebook *.py
```

Then play with the notebooks in Jupyter Lab; they will stay in sync with the
percent scripts automatically.
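
A scripted alternative to the CLI conversion, in case it is useful (a minimal
sketch using jupytext's Python API; the glob pattern and file layout are
assumptions, not part of the repo):

```python
from pathlib import Path

import jupytext

# Convert every percent script in the current experiment directory into a notebook.
for script in Path(".").glob("*.py"):
    notebook = jupytext.read(script)                        # parse the py:percent script
    jupytext.write(notebook, script.with_suffix(".ipynb"))  # write the .ipynb next to it
```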
7 changes: 6 additions & 1 deletion conda.yaml
@@ -203,4 +203,9 @@ dependencies:
- zeromq=4.3.4
- zipp=3.16.2
- pip:
- mypackage==0.1.0
- cattrs==23.1.2
- esbonio==0.16.1
- lsprotocol==2023.0.0a2
- pygls==1.0.2
- pyspellchecker==0.7.2
- typeguard==3.0.2
27 changes: 19 additions & 8 deletions evaluation/workflow/Snakefile
@@ -1,16 +1,19 @@
configfile: "config/config.yaml"


rule all:
input:
"results/summary.pdf"
"results/summary.pdf",


rule plot:
input:
"results/summary.csv",
output:
"results/summary.pdf",
script:
"scripts/plot.py"
"scripts/plot.R"


rule summarize:
input:
@@ -24,6 +27,7 @@ rule summarize:
script:
"scripts/summarize.py"


rule compute_metrics:
input:
true="results/{dataset}/test_y.csv",
@@ -33,25 +37,31 @@ rule compute_metrics:
log:
"results/{dataset}/{method}/compute_metrics.log",
shell:
"python scripts/compute_metrics.py "
"python workflow/scripts/compute_metrics.py "
"--true {input.true} "
"--pred {input.pred} "
"--output {output} "
"&> {log}"


rule random_pred:
input:
"results/{dataset}/train_x.csv",
train_x="results/{dataset}/train_x.csv",
train_y="results/{dataset}/train_y.csv",
test_x="results/{dataset}/test_x.csv",
output:
"results/{dataset}/random_pred/test_y.csv",
log:
"results/{dataset}/random_pred/run.log",
shell:
"python scripts/random_pred.py "
"-i {input} "
"-o {output} "
"python workflow/scripts/random_pred.py "
"--train-x {input.train_x} "
"--train-y {input.train_y} "
"--test-x {input.test_x} "
"--test-y {output} "
"&> {log}"


rule mypackage:
input:
train_x="results/{dataset}/train_x.csv",
@@ -69,6 +79,7 @@ rule mypackage:
"--test-y {output} "
"&> {log}"


rule split_train_test:
input:
x="../data/processed/{dataset}/x.csv",
@@ -81,7 +92,7 @@ rule split_train_test:
log:
"results/{dataset}/split_train_test.log",
shell:
"python scripts/split_train_test.py "
"python workflow/scripts/split_train_test.py "
"--input-x {input.x} "
"--input-y {input.y} "
"--output-train-x {output.train_x} "
33 changes: 33 additions & 0 deletions evaluation/workflow/scripts/compute_metrics.py
@@ -0,0 +1,33 @@
from argparse import ArgumentParser, Namespace
from pathlib import Path

import pandas as pd
import yaml
from sklearn.metrics import mean_squared_error, r2_score


def parse_args() -> Namespace:
parser = ArgumentParser()
parser.add_argument("--true", type=Path, required=True)
parser.add_argument("--pred", type=Path, required=True)
parser.add_argument("--output", type=Path, required=True)
return parser.parse_args()


def main(args: Namespace) -> None:
true = pd.read_csv(args.true, index_col=0)
pred = pd.read_csv(args.pred, index_col=0)

args.output.parent.mkdir(parents=True, exist_ok=True)
with args.output.open("w") as f:
yaml.dump(
{
"r2": r2_score(true, pred).item(),
"mse": mean_squared_error(true, pred).item(),
},
f,
)


if __name__ == "__main__":
main(parse_args())
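
A side note on the `.item()` calls (my reading, not something stated in the
commit): the sklearn metrics here presumably return numpy scalars, and
converting them to plain Python floats keeps the YAML output clean, since
PyYAML has no representer for `numpy.float64`. A small illustration of the
difference:

```python
import numpy as np
import yaml
from yaml.representer import RepresenterError

value = np.float64(0.42)  # stand-in for what r2_score / mean_squared_error return

try:
    yaml.safe_dump({"r2": value})            # the safe dumper rejects numpy scalars outright
except RepresenterError as err:
    print("cannot dump numpy scalar:", err)

print(yaml.safe_dump({"r2": value.item()}))  # plain float -> "r2: 0.42"
```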
24 changes: 24 additions & 0 deletions evaluation/workflow/scripts/plot.R
@@ -0,0 +1,24 @@
suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(reshape2)
})


main <- function(snakemake) {
df <- read.csv(snakemake@input[[1]]) %>%
transmute(Method = method, Dataset = dataset, MSE = mse, R2 = r2) %>%
melt(
measure.vars = c("MSE", "R2"),
value.name = "Score",
variable.name = "Metric",
)
gp <- ggplot(df, aes(x = Method, y = Score, fill = Method)) +
geom_bar(stat = "identity") +
facet_grid(Metric ~ Dataset, labeller = label_both, scales = "free_y") +
theme_bw()
ggsave(snakemake@output[[1]], gp, width = 7, height = 7)
}


main(snakemake)
30 changes: 30 additions & 0 deletions evaluation/workflow/scripts/random_pred.py
@@ -0,0 +1,30 @@
from argparse import ArgumentParser, Namespace
from pathlib import Path

import numpy as np
import pandas as pd


def parse_args() -> Namespace:
parser = ArgumentParser()
parser.add_argument("--train-x", type=Path, required=True)
parser.add_argument("--train-y", type=Path, required=True)
parser.add_argument("--test-x", type=Path, required=True)
parser.add_argument("--test-y", type=Path, required=True)
return parser.parse_args()


def main(args: Namespace) -> None:
train_y = pd.read_csv(args.train_y, index_col=0)
test_x = pd.read_csv(args.test_x, index_col=0)
mean, std = train_y.mean().iloc[0], train_y.std().iloc[0]
test_y = pd.DataFrame(
np.random.randn(test_x.shape[0]) * std + mean,
index=test_x.index,
)
args.test_y.parent.mkdir(parents=True, exist_ok=True)
test_y.to_csv(args.test_y)


if __name__ == "__main__":
main(parse_args())
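
For intuition about where this baseline should land (an editorial aside, not
part of the commit): the predictions are independent of the targets but share
their mean and standard deviation, so the expected MSE is roughly twice the
target variance and the expected R² is roughly -1. A quick check with
synthetic numbers:

```python
import numpy as np

rng = np.random.default_rng(0)
y_true = rng.normal(loc=3.0, scale=2.0, size=100_000)  # stand-in for held-out targets
y_pred = rng.normal(loc=3.0, scale=2.0, size=100_000)  # random predictions, matching moments

mse = np.mean((y_true - y_pred) ** 2)  # ~ 2 * var(y_true) = 8
r2 = 1 - mse / np.var(y_true)          # ~ -1 for this kind of baseline
print(f"mse={mse:.2f}, r2={r2:.2f}")
```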
37 changes: 37 additions & 0 deletions evaluation/workflow/scripts/split_train_test.py
@@ -0,0 +1,37 @@
from argparse import ArgumentParser, Namespace
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split


def parse_args() -> Namespace:
parser = ArgumentParser()
parser.add_argument("--input-x", type=Path, required=True)
parser.add_argument("--input-y", type=Path, required=True)
parser.add_argument("--output-train-x", type=Path, required=True)
parser.add_argument("--output-train-y", type=Path, required=True)
parser.add_argument("--output-test-x", type=Path, required=True)
parser.add_argument("--output-test-y", type=Path, required=True)
return parser.parse_args()


def main(args: Namespace) -> None:
x = pd.read_csv(args.input_x, index_col=0)
y = pd.read_csv(args.input_y, index_col=0)

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)

args.output_train_x.parent.mkdir(parents=True, exist_ok=True)
args.output_train_y.parent.mkdir(parents=True, exist_ok=True)
args.output_test_x.parent.mkdir(parents=True, exist_ok=True)
args.output_test_y.parent.mkdir(parents=True, exist_ok=True)

train_x.to_csv(args.output_train_x)
train_y.to_csv(args.output_train_y)
test_x.to_csv(args.output_test_x)
test_y.to_csv(args.output_test_y)


if __name__ == "__main__":
main(parse_args())
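
One behaviour worth flagging (an observation, not a change in this commit):
`train_test_split` is called without `random_state`, so rerunning the
`split_train_test` rule re-draws the split. If reproducible splits are wanted,
a fixed seed would do it; a self-contained sketch:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy data standing in for the real x.csv / y.csv tables.
x = pd.DataFrame({"feature": range(10)})
y = pd.DataFrame({"target": range(10)})

# random_state pins the shuffle, so the same rows land in the test set every run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=0)
print(sorted(test_x.index))  # identical across reruns
```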
11 changes: 3 additions & 8 deletions evaluation/workflow/scripts/summarize.py
@@ -3,18 +3,13 @@
from parse import parse
from snakemake.script import Snakemake


PATTERN = "results/{dataset}/{method}/metrics.yaml"


def main(snakemake: Snakemake) -> None:
df = []
for item in set(snakemake.input):
entry = parse(PATTERN, item)
if entry:
conf = entry.named
else:
continue
for item in snakemake.input:
conf = parse(PATTERN, item).named
with open(item) as f:
content = yaml.load(f, Loader=yaml.Loader)
df.append({**conf, **content})
@@ -24,4 +19,4 @@ def main(snakemake: Snakemake) -> None:


if __name__ == "__main__":
main(snakemake)
main(snakemake) # noqa: F821 # pyright: ignore[reportUndefinedVariable]
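
Context for the simplified loop above (editorial note): `parse()` returns
`None` when a path does not match `PATTERN`, so dropping the old guard assumes
every input of the `summarize` rule follows the
`results/{dataset}/{method}/metrics.yaml` layout — presumably true here, since
those inputs are generated from that very pattern. A tiny illustration of the
two cases:

```python
from parse import parse

PATTERN = "results/{dataset}/{method}/metrics.yaml"

hit = parse(PATTERN, "results/some_dataset/random_pred/metrics.yaml")
print(hit.named)  # {'dataset': 'some_dataset', 'method': 'random_pred'}

miss = parse(PATTERN, "results/summary.csv")
print(miss)       # None -- calling .named on this would raise AttributeError
```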