Merge pull request #86 from Arcadia-Science/release/0.5.0

[Sync] Reorganize output directories + deduplicate snakefiles + bugfixes
Arcadia-Science · May 10, 2024 · b6dfc76 · b6dfc76
2 parents 3af6f09 + 128945c
commit b6dfc76
Show file tree

Hide file tree

Showing 88 changed files with 71,840 additions and 1,182 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -12,7 +12,7 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Install Python
         uses: actions/setup-python@v4

diff --git a/.github/workflows/open-public-sync-pr.yml b/.github/workflows/open-public-sync-pr.yml
@@ -0,0 +1,74 @@
+name: open-public-sync-pr
+
+on:
+  # allow this workflow to be run manually from the Actions tab in the private repo
+  workflow_dispatch:
+
+env:
+  PUBLIC_REPO_URL: https://github.com/Arcadia-Science/ProteinCartography
+  PUBLIC_REPO_REMOTE: public
+  # the name of the branch to use for syncing the private repo with the public repo
+  SYNC_BRANCH: sync-from-public
+
+jobs:
+    open-public-sync-pr:
+      runs-on: ubuntu-latest
+      # only run this job in the private repo
+      if: github.repository == 'Arcadia-Science/ProteinCartography-private'
+      steps:
+        - name: Checkout the repo
+          uses: actions/checkout@v4
+          with:
+            # this token should be a fine-grained PAT with write permissions
+            # for repository "contents", "pull-requests", and "workflows"
+            # note: a PAT is needed to allow the action to push the sync branch to the private repo;
+            # it is not sufficient to grant `permissions: write-all` to the default GITHUB_TOKEN
+            # (see https://github.com/orgs/community/discussions/35410)
+            token: ${{ secrets.OPEN_PUBLIC_SYNC_PR_PAT }}
+
+            # fetch the full history so that we can check for new commits
+            fetch-depth: 0
+
+        - name: Add and fetch the public repo
+          run: |
+            git remote add ${{ env.PUBLIC_REPO_REMOTE }} ${{ env.PUBLIC_REPO_URL }}
+            git fetch --all
+
+        - name: Check for new commits in the public repo and exit if there are none
+          run: |
+            NEW_COMMITS=$(git log --oneline --no-merges origin/main..${{ env.PUBLIC_REPO_REMOTE }}/main)
+            if [ -z "$NEW_COMMITS" ]; then
+                echo "There are no new commits in the public repo, so no PR will be opened."
+                exit 1
+            fi
+
+        - name: Delete the sync branch if it exists
+          # note: if the sync branch already exists and there is an open PR associated with it,
+          # this will automatically close the PR
+          run: |
+            git branch -D ${{ env.SYNC_BRANCH }} || true
+            git push --force --delete origin ${{ env.SYNC_BRANCH }} || true
+
+        - name: Create and push the sync branch to the private repo
+          # we use `git branch` here instead of `git checkout` followed by `git pull public main`
+          # because the latter will fail if there are merge conflicts
+          run: |
+            git branch ${{ env.SYNC_BRANCH }} ${{ env.PUBLIC_REPO_REMOTE }}/main
+            git branch --unset-upstream ${{ env.SYNC_BRANCH }}
+            git push origin ${{ env.SYNC_BRANCH }}
+
+        - name: Open a PR to merge the sync branch into the private repo's main branch
+          env:
+            GH_TOKEN: ${{ secrets.OPEN_PUBLIC_SYNC_PR_PAT }}
+            PR_BODY: >
+                This PR adds new contributions made to the public repo
+                in order to keep the private repo up to date with the public repo.
+                It was automatically generated by the "open-public-sync-pr"
+                GitHub Actions workflow.
+          run: |
+            gh pr create \
+              --repo "${{ github.repository }}" \
+              --base main \
+              --head "${{ env.SYNC_BRANCH }}" \
+              --title "Sync with the public repo" \
+              --body "${{ env.PR_BODY }}"
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -9,6 +9,8 @@ on:
   pull_request:
     branches:
       - main
+    # 'synchronize' means the PR was updated with new commits
+    types: [opened, reopened, labeled, synchronize]
 
 jobs:
   test:
@@ -17,7 +19,7 @@ jobs:
       run:
         shell: bash -el {0}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Setup conda
         uses: conda-incubator/setup-miniconda@v2
@@ -57,9 +59,14 @@ jobs:
 
       - name: Create the snakemake conda envs
         run: |
-          snakemake --configfile demo/config_actin.yml --use-conda --conda-create-envs-only --cores 1
+          snakemake --configfile demo/search-mode/config_actin.yml --use-conda --conda-create-envs-only --cores 1
         if: steps.cache-snakemake-conda-envs.outputs.cache-hit != 'true'
 
       - name: Run the tests
         run: |
-          pytest -vv -s .
+          make test
+
+      - name: Run the tests without mocks
+        if: contains(github.event.pull_request.labels.*.name, 'run-slow-tests')
+        run: |
+          make test-without-mocks
diff --git a/.github/workflows/verify-no-new-public-commits.yml b/.github/workflows/verify-no-new-public-commits.yml
@@ -0,0 +1,44 @@
+name: verify-no-new-public-commits
+
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+
+env:
+  PUBLIC_REPO_URL: https://github.com/Arcadia-Science/ProteinCartography
+
+jobs:
+  verify-no-new-public-commits:
+    runs-on: ubuntu-latest
+    # only run this job in the private repo
+    # (but do not check the repo owner, because forks will have a different owner)
+    if: github.event.repository.name == 'ProteinCartography-private'
+    steps:
+      - name: Checkout the repo
+        uses: actions/checkout@v4
+        with:
+          # fetch the full history so that we can check for new commits
+          fetch-depth: 0
+
+      - name: Add and fetch the public repo
+        run: |
+          git remote add public ${{ env.PUBLIC_REPO_URL }}
+          git fetch --all
+
+      - name: Check for new commits in the public repo
+        run: |
+          NEW_COMMITS=$(git log --oneline --no-merges ..public/main)
+          if [ -z "$NEW_COMMITS" ]; then
+            echo "There are no new commits in the public repo."
+            exit 0
+          else
+            echo -e "This action failed because there are new commits in the public repo:\n\
+          $NEW_COMMITS\n\n\
+          Please use the \"open-public-sync-pr\" GitHub Action to merge them \
+          into the private repo before merging new PRs."
+            exit 1
+          fi
diff --git a/.gitignore b/.gitignore
@@ -5,8 +5,7 @@ envs/src/**
 .snakemake/
 /input/
 /output/
-/demo/output/
-/demo/input/
+/demo/**/output/
 /logs
 /tmp
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,153 @@
+# Contributing to ProteinCartography
+Thanks for your interest in contributing to ProteinCartography!
+Please read this document in its entirety before contributing to help ensure that your contribution meets our standards and is readily accepted.
+
+## Getting Started
+All the packages needed to develop for ProteinCartography are found in the `envs/cartography_dev.yml` conda environment.
+You can install this environment as follows:
+
+1. Make sure `miniconda` is installed. Even if you’re using an Apple Silicon (M1, M2, etc. macOS) laptop, you will need to install the macOS Intel x86-64 version of `miniconda` [here](https://docs.conda.io/projects/miniconda/en/latest/).
+
+2. Create a conda environment from the `cartography_dev.yml` file in the `envs/` directory.
+```sh
+conda env create -n cartography_dev --file envs/cartography_dev.yml
+```
+
+3. Activate the environment.
+```sh
+conda activate cartography_dev
+```
+
+## How to contribute
+### Bug reports and feature requests
+We track all bugs, new feature requests, enhancements, etc. using [GitHub Issues](https://github.com/Arcadia-Science/ProteinCartography/issues). Please check to make sure that your issue has not already been reported. If it has, please add a comment to the existing issue instead of creating a new one.
+
+### Making a contribution
+The steps below apply to both external and internal contributors and also apply to working both with this repo itself and with your own fork. However, if you are an external contributor, please fork this repository and make your changes in a new branch on your fork. Please see the GitHub docs on [working with forks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) for more details about how to do this.
+
+1. Whenever you start work, you should make sure to `git pull` on the main branch to get the latest version of the `main` branch.
+
+1. When you’re planning on working on an issue, you should claim it by assigning it to yourself. You should also add a comment briefly explaining how you plan to address the issue. If you are an external contributor, please wait for a maintainer to sign off on your plan before you start working; this will make it easier for us to accept your PR later on.
+
+1. After claiming an issue, create a new branch from the `main` branch. __Make sure your branch begins with your initials, followed by a forward slash__. This is very important to keep everyone's branches well-organized. Use short descriptive names for the branch name itself. For example, if your initials are `abc` and you are adding a new feature to evaluate clustering, you might name your branch `abc/add-cluster-evaluation`. If you are working on an issue to fix a bug, you might name your branch `abc/fix-foldseek-format-bug`.
+	Create the new branch using the following command:
+	```sh
+	git checkout -b <your-initials>/<branch-name>
+	```
+
+1. Once you’ve created a branch, push the branch to the GitHub repo so you can keep a remote record of your work.
+	```sh
+	git push -u origin <your-initials>/<branch-name>
+	```
+
+1. Once you’ve completed the feature or fixed the bug, you are ready to open a PR. Please aim to keep the PRs as small as possible to increase the speed and ease of review; a PR can be as small as changing a few characters or resolving a single bug. When you open a PR, please use a succinct and human-readable title and always add a description of the changes you made as well as a link to the issue that the PR addresses.
+
+1. Check that your PR is passing the CI checks. These checks verify that the changes in your PR are formatted correctly (we use a tool called `ruff` for formatting; see below for details) and also that your PR passes the automated tests of the pipeline.
+
+### Keeping your development branches up to date
+Occasionally, your development branch will be behind the main branch. If this happens and you’re working on a file that was changed in the updated version of the main branch, you may need to merge the updated `main` branch into your local development branch.
+
+1. First, update the main branch in your local repo using the following in main:
+	```sh
+	git checkout main
+	git pull
+	```
+
+1. Next, check out your development branch:
+	```sh
+	git checkout <your-initials>/<branch-name>
+	```
+
+1. Now, merge the main branch into your local branch:
+	```sh
+	git merge origin/main
+	```
+	__Note that you must be on your local development branch when you call `git merge`!__
+
+1. Once you’ve merged the main branch into your local development branch, use `git push` to push the merged changes to your branch on GitHub.
+
+### Linting
+We use `ruff` to lint and format our Python code. We also use snakemake’s code formatter, `snakefmt` , to format the snakefiles. You can and should run these tools in your local repo using the commands `make format` and `make lint`. Note that `ruff` is also available as an extension in VS Code, allowing you to configure VS Code to automatically format your code whenever you save the file you are currently editing.
+
+### Testing
+Tests are found in the `ProteinCartography/tests/` directory. We use `pytest` for testing; you can run the tests locally using the command `make test`. Currently, we only have integration-like tests that run the pipeline in both 'search' and 'cluster' modes using a test dataset and test config designed to allow the pipeline run very quickly (2-3min). The tests then check that the output files are created and have the correct shape. We plan to add unit tests in the future.
+
+#### Running the tests without mocked API responses
+When the pipeline is run in 'search' mode, it makes many calls to external APIs (including Foldseek, Blast, and Alphafold). By default, these calls are mocked during testing so that the tests do not depend on external APIs; this is important to ensure test reproducibility and also helps to make the tests run quickly. However, it is important to periodically test that the pipeline also runs correctly when real API calls are made. To do this, you can run the tests without mocks using `make test-without-mocks`.
+
+When merging PRs on GitHub, it is likewise important to test that the pipeline runs correctly with real API calls. To do so, add the label `run-slow-tests` to your PR. This will trigger the CI actions (see below) to run again on your PR, but now without mocks. __Please add this label only when your PR is ready to merge, as it will cause the CI to run more slowly and will also result in unnecessary API calls.__
+
+#### Updating the mocked API responses
+When changes you have made involve changes to the API calls made by the pipeline, it will be necessary to update the mocked responses in order for the tests to pass. Currently, this is a manual process.
+1. Enable API response logging in your local environment by setting the following environment variable:
+	```sh
+	export PROTEINCARTOGRAPHY_SHOULD_LOG_API_REQUESTS=true
+	```
+
+1. Run the pipeline in 'search' mode using the 'small' search-mode demo (this demo uses the same input PDB file as the tests). The API responses made by the pipeline will be logged a `logs/` directory in the root of the repo.
+	```sh
+	snakemake --cores all --use-conda --configfile demo/search-mode/config_actin_small.yml
+	```
+
+1. Use the logged responses to update the mocked responses constructed in the `ProteinCartography/tests/mocks.py` module. For large responses, response contents should be added to `ProteinCartography/tests/integration-test-artifacts/search-mode/actin/api_response_content/`.
+
+1. When you're finished, don't forget to delete the `logs/` directory and unset the `PROTEINCARTOGRAPHY_SHOULD_LOG_API_REQUESTS` environment variable.
+
+### CI
+We use GitHub Actions for CI. Currently, there is one workflow for linting and one for testing. Both workflows are run automatically on GitHub when a PR is opened and also whenever new commits are pushed to an open PR. PRs cannot be merged until the CI checks pass.
+
+The linting workflow runs `ruff --check` and `snakefmt --check` on all Python and snakefiles in the repo. This means that the workflow does not modify any files; it only checks that your code is formatted correctly. If the workflow fails for your PR, you can run `make format` locally to format your code and `make lint` to determine if there are lint errors that need to be fixed.
+
+The testing workflow runs pytest using the same `make test` command that is used locally. If the workflow fails for your PR, it is usually best to run `make test` locally to recapitulate the failure and determine which tests are failing.
+
+### Style guide
+In addition to the formatting and lint rules imposed by `ruff` and `snakefmt`, we also have a few additional style rules that are not enforced by these tools. These rules are listed below.
+
+- Function and variable names should be in `lower_snake_case` and should be descriptive; avoid abbreviations.
+- Function arguments and return values should have [type hints](https://docs.python.org/3/library/typing.html).
+- Functions should include a [Google-style docstring](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) explaining the arguments and what the function returns (if not `None`).
+- Comments should be written in complete sentences in the present tense and should end with a period.
+- Comments should be used sparingly and only when necessary to explain something that is not obvious from the code itself.
+- Class names should use `CapitalizedCamelCase` with descriptive names.
+- Currently, we don’t use many custom classes, but the conventions for functions apply to class methods as well.
+
+Here is an example of a function that adheres to all of these rules:
+```python
+def add_integers(first_integer: int, second_integer: int) -> int:
+	"""
+	Add two integers together, returning the result.
+
+	Args:
+		first_integer (int): first integer to add.
+		second_integer (int): second integer to add.
+
+	Returns:
+		The sum of the two integers.
+	"""
+	result = first_integer + second_integer
+	return result
+```
+
+### Code organization
+We strive to encapsulate new functionality within modular Python scripts that accept arguments from the command line using `argparse`. These scripts are then called from snakemake rules and can also be run directly from the command line by the user.
+- Every script should include a `parse_args()` function and a `main()` function.
+- Every script with `#!/usr/bin/env python` (so that the scripts are executable from the command line on unix systems).
+- An example template for new scripts is found in [`template.py`](./ProteinCartography/template.py).
+
+### Adding new dependencies
+First, please consider carefully whether you need to add a new dependency to the project.
+When changes you have made absolutely require new dependencies, please make sure that they are `conda`-installable.
+Dependencies should be added to two environment files:
+1. the `cartography_dev.yml` file in the `envs/` directory.
+2. the appropriate snakemake rule environment file in the `envs/` directory.
+
+In both files, please include the version of the dependency you are using (this is called "pinning" the dependency).
+Include only the exact version number; do not include the package hash.
+For example, if you are adding a new dependency called `new_dependency` and you are using version `1.2.3`, you would add the following line to the `cartography_dev.yml` file:
+```yaml
+- new_dependency=1.2.3
+```
+
+
+## Crediting Contributions
+See how we recognize feedback and contributions to our code at Arcadia [here](https://github.com/Arcadia-Science/arcadia-software-handbook/blob/main/guides-and-standards/guide-credit-for-contributions.md).
diff --git a/Makefile b/Makefile
@@ -15,8 +15,33 @@ pre-commit:
 
 .PHONY: test
 test:
-	pytest -v
+	pytest -vv -s .
+
+.PHONY: test-without-mocks
+test-without-mocks:
+	pytest -vv -s --no-mocks
 
 .PHONY: run-demo-workflow
 run-demo-workflow:
-	snakemake --snakefile Snakefile --configfile demo/config_actin.yml --use-conda $(ARGS)
+	snakemake \
+		--snakefile Snakefile \
+		--configfile demo/search-mode/config_actin.yml \
+		--use-conda  \
+		--cores all \
+		$(ARGS)
+
+.PHONY: generate-search-mode-rulegraph
+generate-search-mode-rulegraph:
+	snakemake \
+		--configfile demo/search-mode/config_actin.yml \
+		--rulegraph \
+		| dot -Tpng \
+		> rulegraph-search-mode.png
+
+.PHONY: generate-cluster-mode-rulegraph
+generate-cluster-mode-rulegraph:
+	snakemake \
+		--configfile demo/cluster-mode/config.yml \
+		--rulegraph \
+		| dot -Tpng \
+		> rulegraph-cluster-mode.png