forked from broadinstitute/seqr
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
346b864
commit 1b9576e
Showing
138 changed files
with
8,761 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
.idea* | ||
*.iml | ||
*.log | ||
|
||
*.pyc | ||
*.DS_Store | ||
|
||
temp | ||
|
||
*.vcf.gz | ||
download_and_create_reference_datasets/*.txt | ||
download_and_create_reference_datasets/*.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
language: python | ||
|
||
python: | ||
- '2.7' | ||
|
||
install: | ||
- pip install -r requirements.txt | ||
|
||
script: | ||
- python -m unittest discover -p '*test*.py' | ||
|
||
#notifications: | ||
# slack: | ||
# secure: tYNwi4kff+8FWbv+nWqKZt28Qez0Dv77rJG6nvdiaO7QgPZmaiG5l8NqYdxV63CMgNzHwTFAz8ca9uOoh06o7+HuiFNNiqXEAu8umLvlnvkKT6itSDyImxxD7ypitUwcBEKkNy1fxhrLRPQVLjxz37885kA/VtWpm19A5vQ8A7w= | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Changes | ||
|
||
* Added function `wait_for_loading_shards_transfer` to luigi SNV pipeline [(#261)](https://github.com/broadinstitute/hail-elasticsearch-pipelines/pull/261) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Contributing | ||
|
||
## Tests | ||
|
||
Tests for Hail 0.1 and Hail 0.2 scripts must be run in different environments. | ||
|
||
### Hail 0.1 | ||
|
||
Currently, no tests in the hail_scripts/v01 directory involve Hail. To run tests: | ||
```shell | ||
python2 -m unittest discover -s hail_scripts/v01 -p "*test*.py" | ||
``` | ||
|
||
### Hail 0.2 | ||
|
||
Tests for hail_scripts/v02 require Hail. Instructions for installing Hail can be found | ||
at https://hail.is/docs/0.2/getting_started.html. | ||
|
||
To run tests: | ||
```shell | ||
hail -m unittest discover -s hail_scripts/v02 -p "*test*.py" | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2020 Broad Institute | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
The hail scripts in this repo can be used to pre-process variant callsets and export them to elasticsearch. | ||
|
||
See [hail_elasticsearch_pipelines/luigi_pipeline/README.md](luigi_pipeline/README.md) for details. |
Empty file.
8 changes: 8 additions & 0 deletions
8
hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__cadd.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from kubernetes.shell_utils import simple_run as run | ||
|
||
run(( | ||
"python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-ht-cadd " | ||
"download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py")) |
8 changes: 8 additions & 0 deletions
8
...-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__clinvar.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from kubernetes.shell_utils import simple_run as run | ||
|
||
run(( | ||
"python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-ht-clinvar " | ||
"download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py")) |
14 changes: 14 additions & 0 deletions
14
...ipelines/download_and_create_reference_datasets/v02/create_ht__combined_reference_data.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import argparse | ||
from kubernetes.shell_utils import simple_run as run | ||
|
||
parser = argparse.ArgumentParser() | ||
parser.add_argument('-b', '--build', help='Reference build, 37 or 38', choices=["37", "38"], required=True) | ||
args = parser.parse_args() | ||
|
||
run(( | ||
"python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-ht-combined-reference-data " | ||
"download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py " | ||
f"--build {args.build}")) |
14 changes: 14 additions & 0 deletions
14
hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__eigen.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from kubernetes.shell_utils import simple_run as run | ||
|
||
for genome_version, vcf_path in [ | ||
("37", "gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.vcf.gz"), | ||
("38", "gs://seqr-reference-data/GRCh38/eigen/EIGEN_coding_noncoding.liftover_grch38.vcf.gz"), | ||
]: | ||
run(("python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-ht-eigen " | ||
"hail_scripts/v02/convert_vcf_to_hail.py " | ||
"--output-sites-only-ht " | ||
f"--genome-version {genome_version} " | ||
f"{vcf_path}")) |
8 changes: 8 additions & 0 deletions
8
...lasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__gnomad_38.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from kubernetes.shell_utils import simple_run as run | ||
|
||
run(( | ||
"python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-gnomad-38-hts " | ||
"download_and_create_reference_datasets/v02/hail_scripts/write_gnomad_38_hts.py")) |
14 changes: 14 additions & 0 deletions
14
hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__mpc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from kubernetes.shell_utils import simple_run as run | ||
|
||
for genome_version, vcf_path in [ | ||
("37", "gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.vcf.gz"), | ||
("38", "gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.vcf.gz"), | ||
]: | ||
run(("python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-ht-mpc " | ||
"hail_scripts/v02/convert_vcf_to_hail.py " | ||
"--output-sites-only-ht " | ||
f"--genome-version {genome_version} " | ||
f"{vcf_path}")) |
14 changes: 14 additions & 0 deletions
14
...asticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__primate_ai.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from kubernetes.shell_utils import simple_run as run | ||
|
||
for genome_version, vcf_path in [ | ||
("37", "gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.vcf.gz"), | ||
("38", "gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.vcf.gz"), | ||
]: | ||
run(("python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-ht-primate-ai " | ||
"hail_scripts/v02/convert_vcf_to_hail.py " | ||
"--output-sites-only-ht " | ||
f"--genome-version {genome_version} " | ||
f"{vcf_path}")) |
14 changes: 14 additions & 0 deletions
14
hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__topmed.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from kubernetes.shell_utils import simple_run as run | ||
|
||
for genome_version, vcf_path in [ | ||
("37", "gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz"), | ||
("38", "gs://seqr-reference-data/GRCh38/TopMed/bravo-dbsnp-all.vcf.gz"), | ||
]: | ||
run(("python3 gcloud_dataproc/v02/run_script.py " | ||
"--cluster create-ht-topmed " | ||
"hail_scripts/v02/convert_vcf_to_hail.py " | ||
"--output-sites-only-ht " | ||
f"--genome-version {genome_version} " | ||
f"{vcf_path}")) |
71 changes: 71 additions & 0 deletions
71
...csearch-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_1kg_ht.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import logging | ||
|
||
import hail as hl | ||
|
||
from hail_scripts.utils.hail_utils import import_vcf | ||
|
||
logger = logging.getLogger('v02.hail_scripts.create_1kg_ht') | ||
|
||
# Source 1kg phase-3 sites VCFs, keyed by GRCh reference build.
CONFIG = {
    "37": "gs://seqr-reference-data/GRCh37/1kg/1kg.wgs.phase3.20130502.GRCh37_sites.vcf.gz",
    "38": "gs://seqr-reference-data/GRCh38/1kg/1kg.wgs.phase3.20170504.GRCh38_sites.vcf.gz",
}
|
||
def vcf_to_mt(path, genome_version):
    """Convert the 1kg VCF at ``path`` to a Hail MatrixTable.

    The 1kg dataset contains multi-allelic variants and duplicates. The
    multi-allelic rows are filtered out and split independently, then
    unioned back with the bi-allelic rows.

    :param path: vcf path
    :param genome_version: genome version
    :return: MatrixTable keyed by (locus, alleles)
    """
    # Import WITHOUT splitting multi-allelics; the split is done explicitly below.
    mt = import_vcf(
        path,
        genome_version=genome_version,
        min_partitions=1000,
        split_multi_alleles=False,
    )

    # Split only the rows that are actually multi-allelic.
    split_mt = hl.split_multi_hts(mt.filter_rows(hl.len(mt.alleles) > 2))

    # Annotate the bi-allelic rows by hand so their schema matches the split
    # rows; calling split_multi_hts on bi-allelics to get these fields causes
    # problems (per the original author's note).
    biallelic_mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    biallelic_mt = biallelic_mt.annotate_rows(a_index=1, was_split=False)

    combined_mt = biallelic_mt.union_rows(split_mt)
    combined_mt = combined_mt.key_rows_by(combined_mt.locus, combined_mt.alleles)

    # Build 37 is known to contain some unneeded symbolic alleles; drop them.
    combined_mt = combined_mt.filter_rows(
        hl.allele_type(combined_mt.alleles[0], combined_mt.alleles[1]) == 'Symbolic',
        keep=False,
    )

    return combined_mt
|
||
def annotate_mt(mt):
    """Annotate POPMAX_AF: the max of the per-population AFs, indexed by
    a_index - 1 so split multi-allelics pick up their own allele's frequency."""
    allele_idx = mt.a_index - 1
    return mt.annotate_rows(
        POPMAX_AF=hl.max(
            mt.info.AFR_AF[allele_idx],
            mt.info.AMR_AF[allele_idx],
            mt.info.EAS_AF[allele_idx],
            mt.info.EUR_AF[allele_idx],
            mt.info.SAS_AF[allele_idx],
        )
    )
|
||
def run():
    """Build and write a sites-only 1kg Hail Table for each configured build."""
    for genome_version, input_path in CONFIG.items():
        logger.info('reading from input path: %s' % input_path)

        mt = annotate_mt(vcf_to_mt(input_path, genome_version))
        mt.describe()

        # Derive the output path by stripping the vcf/compression suffixes and
        # any glob characters, then appending ".ht". Replacement order matches
        # the original chained .replace() calls.
        base = input_path
        for token in (".vcf", ".gz", ".bgz", ".*", "*"):
            base = base.replace(token, "")
        output_path = base + ".ht"

        logger.info('writing to output path: %s' % output_path)
        mt.rows().write(output_path)


run()
49 changes: 49 additions & 0 deletions
49
...search-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# combine the pre-computed CADD .tsvs from https://cadd.gs.washington.edu/download into 1 Table for each genome build | ||
|
||
import logging

# Root-logger setup: timestamped, level-tagged messages at INFO and above.
LOG_FORMAT = '%(asctime)s %(levelname)-8s %(message)s'
logging.basicConfig(format=LOG_FORMAT)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
|
||
|
||
import hail as hl | ||
from hail_scripts.utils.hail_utils import write_ht, import_table | ||
|
||
hl.init() | ||
|
||
|
||
def import_cadd_table(path: str, genome_version: str) -> hl.Table:
    """Import one pre-computed CADD .tsv into a Table keyed by (locus, alleles).

    :param path: gs:// path of the (bgzipped) CADD tsv
    :param genome_version: "37" or "38"
    :raises ValueError: on any other genome version
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome version: {genome_version}")

    renames = {'f0': 'chrom', 'f1': 'pos', 'f2': 'ref', 'f3': 'alt', 'f4': 'RawScore', 'f5': 'PHRED'}
    field_types = {'f0': hl.tstr, 'f1': hl.tint, 'f4': hl.tfloat32, 'f5': hl.tfloat32}

    ht = hl.import_table(path, force_bgz=True, comment="#", no_header=True, types=field_types, min_partitions=10000)
    ht = ht.rename(renames)

    # GRCh38 contigs carry a "chr" prefix; GRCh37 contigs do not.
    contig_expr = hl.format("chr%s", ht.chrom) if genome_version == "38" else ht.chrom
    ht = ht.transmute(
        locus=hl.locus(contig_expr, ht.pos, reference_genome=hl.get_reference(f"GRCh{genome_version}")),
        alleles=hl.array([ht.ref, ht.alt]),
    )

    # Rebuild the table as a union of two contig batches (1-9, then 10-22 + X/Y/MT).
    union_ht = ht.head(0)
    for contig_batch in (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"]):
        names = [str(contig) for contig in contig_batch]
        if genome_version == "38":
            names = ["chr%s" % name for name in names]
        union_ht = union_ht.union(ht.filter(hl.array(names).contains(ht.locus.contig)))

    union_ht = union_ht.key_by("locus", "alleles")

    union_ht.describe()

    return union_ht
|
||
# For each build: import the pre-computed SNV and indel tables, merge them,
# and write a single combined Table.
for genome_version in ["37", "38"]:
    snvs_ht = import_cadd_table(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/CADD_snvs.v1.6.tsv.gz", genome_version)
    indel_ht = import_cadd_table(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/InDels_v1.6.tsv.gz", genome_version)

    combined_ht = snvs_ht.union(indel_ht)

    combined_ht.naive_coalesce(10000).write(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/CADD_snvs_and_indels.v1.6.ht", overwrite=True)
17 changes: 17 additions & 0 deletions
17
...rch-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import hail as hl | ||
from hail_scripts.utils.clinvar import download_and_import_latest_clinvar_vcf, CLINVAR_HT_PATH, CLINVAR_GOLD_STARS_LOOKUP | ||
from hail_scripts.utils.hail_utils import write_ht | ||
|
||
|
||
# For each build: download the latest ClinVar VCF, annotate gold stars from
# CLNREVSTAT, and write a Table whose path embeds the ClinVar release timestamp.
for genome_version in ["37", "38"]:
    mt = download_and_import_latest_clinvar_vcf(genome_version)
    release_timestamp = hl.eval(mt.version)

    clinvar_ht = mt.rows()
    clinvar_ht = clinvar_ht.annotate(
        gold_stars=CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(clinvar_ht.info.CLNREVSTAT)))

    clinvar_ht.describe()

    output_path = CLINVAR_HT_PATH.format(genome_version=genome_version).replace(".ht", ".") + release_timestamp + ".ht"
    write_ht(clinvar_ht, output_path)
Oops, something went wrong.