Pull ccm-seqr

ccmbioinfo · Oct 15, 2024 · 1b9576e · 1b9576e
1 parent 346b864
commit 1b9576e
Show file tree

Hide file tree

Showing 138 changed files with 8,761 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -21,11 +21,13 @@ django_key
 
 data/*
 
-hail-elasticsearch-pipelines
+# hail-elasticsearch-pipelines
 nginx/certs/*
 !nginx/certs/.gitkeep
 pipeline-logs
 sql-dumps
 
 .vscode/
-pipeline_runner/
+# pipeline_runner/
+pipeline_runner/*/.github
+*/*.zip
diff --git a/hail-elasticsearch-pipelines/.gitignore b/hail-elasticsearch-pipelines/.gitignore
@@ -0,0 +1,12 @@
+.idea*
+*.iml
+*.log
+
+*.pyc
+*.DS_Store
+
+temp
+
+*.vcf.gz
+download_and_create_reference_datasets/*.txt
+download_and_create_reference_datasets/*.gz
diff --git a/hail-elasticsearch-pipelines/.travis.yml b/hail-elasticsearch-pipelines/.travis.yml
@@ -0,0 +1,15 @@
+language: python
+
+python:
+- '2.7'
+
+install:
+- pip install -r requirements.txt
+
+script:
+- python -m unittest discover -p '*test*.py'
+
+#notifications:
+#  slack:
+#    secure: tYNwi4kff+8FWbv+nWqKZt28Qez0Dv77rJG6nvdiaO7QgPZmaiG5l8NqYdxV63CMgNzHwTFAz8ca9uOoh06o7+HuiFNNiqXEAu8umLvlnvkKT6itSDyImxxD7ypitUwcBEKkNy1fxhrLRPQVLjxz37885kA/VtWpm19A5vQ8A7w=
+
diff --git a/hail-elasticsearch-pipelines/CHANGELOG.md b/hail-elasticsearch-pipelines/CHANGELOG.md
@@ -0,0 +1,3 @@
+# Changes
+
+* Added function `wait_for_loading_shards_transfer` to luigi SNV pipeline [(#261)](https://github.com/broadinstitute/hail-elasticsearch-pipelines/pull/261)
diff --git a/hail-elasticsearch-pipelines/CONTRIBUTING.md b/hail-elasticsearch-pipelines/CONTRIBUTING.md
@@ -0,0 +1,22 @@
+# Contributing
+
+## Tests
+
+Tests for Hail 0.1 and Hail 0.2 scripts must be run in different environments.
+
+### Hail 0.1
+
+Currently, no tests in the hail_scripts/v01 directory involve Hail. To run tests:
+```shell
+python2 -m unittest discover -s hail_scripts/v01 -p "*test*.py"
+```
+
+### Hail 0.2
+
+Tests for hail_scripts/v02 require Hail. Instructions for installing Hail can be found
+at https://hail.is/docs/0.2/getting_started.html.
+
+To run tests:
+```shell
+hail -m unittest discover -s hail_scripts/v02 -p "*test*.py"
+```
diff --git a/hail-elasticsearch-pipelines/LICENSE b/hail-elasticsearch-pipelines/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Broad Institute
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/hail-elasticsearch-pipelines/README.md b/hail-elasticsearch-pipelines/README.md
@@ -0,0 +1,3 @@
+The Hail scripts in this repo can be used to pre-process variant callsets and export them to Elasticsearch.
+
+See [hail_elasticsearch_pipelines/luigi_pipeline/README.md](luigi_pipeline/README.md) for details.
diff --git a/hail-elasticsearch-pipelines/__init__.py b/hail-elasticsearch-pipelines/__init__.py
diff --git a/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__cadd.py b/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__cadd.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+
+from kubernetes.shell_utils import simple_run as run
+
+run((
+    "python3 gcloud_dataproc/v02/run_script.py "
+    "--cluster create-ht-cadd "
+    "download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py"))
diff --git a/...-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__clinvar.py b/...-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__clinvar.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+
+from kubernetes.shell_utils import simple_run as run
+
+run((
+    "python3 gcloud_dataproc/v02/run_script.py "
+    "--cluster create-ht-clinvar "
+    "download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py"))
diff --git a/...ipelines/download_and_create_reference_datasets/v02/create_ht__combined_reference_data.py b/...ipelines/download_and_create_reference_datasets/v02/create_ht__combined_reference_data.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+import argparse
+from kubernetes.shell_utils import simple_run as run
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-b', '--build', help='Reference build, 37 or 38', choices=["37", "38"], required=True)
+args = parser.parse_args()
+
+run((
+    "python3 gcloud_dataproc/v02/run_script.py "
+    "--cluster create-ht-combined-reference-data "
+    "download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py "
+    f"--build {args.build}"))
diff --git a/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__eigen.py b/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__eigen.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+from kubernetes.shell_utils import simple_run as run
+
+for genome_version, vcf_path in [
+    ("37", "gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.vcf.gz"),
+    ("38", "gs://seqr-reference-data/GRCh38/eigen/EIGEN_coding_noncoding.liftover_grch38.vcf.gz"),
+]:
+    run(("python3 gcloud_dataproc/v02/run_script.py "
+        "--cluster create-ht-eigen "
+        "hail_scripts/v02/convert_vcf_to_hail.py "
+        "--output-sites-only-ht "
+        f"--genome-version {genome_version} "
+        f"{vcf_path}"))
diff --git a/...lasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__gnomad_38.py b/...lasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__gnomad_38.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+
+from kubernetes.shell_utils import simple_run as run
+
+run((
+    "python3 gcloud_dataproc/v02/run_script.py "
+    "--cluster create-gnomad-38-hts "
+    "download_and_create_reference_datasets/v02/hail_scripts/write_gnomad_38_hts.py"))
diff --git a/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__mpc.py b/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__mpc.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+from kubernetes.shell_utils import simple_run as run
+
+for genome_version, vcf_path in [
+    ("37", "gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.vcf.gz"),
+    ("38", "gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.vcf.gz"),
+]:
+    run(("python3 gcloud_dataproc/v02/run_script.py "
+        "--cluster create-ht-mpc "
+        "hail_scripts/v02/convert_vcf_to_hail.py "
+        "--output-sites-only-ht "
+        f"--genome-version {genome_version} "
+        f"{vcf_path}"))
diff --git a/...asticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__primate_ai.py b/...asticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__primate_ai.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+from kubernetes.shell_utils import simple_run as run
+
+for genome_version, vcf_path in [
+    ("37", "gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.vcf.gz"),
+    ("38", "gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.vcf.gz"),
+]:
+    run(("python3 gcloud_dataproc/v02/run_script.py "
+        "--cluster create-ht-primate-ai "
+        "hail_scripts/v02/convert_vcf_to_hail.py "
+        "--output-sites-only-ht "
+        f"--genome-version {genome_version} "
+        f"{vcf_path}"))
diff --git a/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__topmed.py b/hail-elasticsearch-pipelines/download_and_create_reference_datasets/v02/create_ht__topmed.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+from kubernetes.shell_utils import simple_run as run
+
+for genome_version, vcf_path in [
+    ("37", "gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz"),
+    ("38", "gs://seqr-reference-data/GRCh38/TopMed/bravo-dbsnp-all.vcf.gz"),
+]:
+    run(("python3 gcloud_dataproc/v02/run_script.py "
+        "--cluster create-ht-topmed "
+        "hail_scripts/v02/convert_vcf_to_hail.py "
+        "--output-sites-only-ht "
+        f"--genome-version {genome_version} "
+        f"{vcf_path}"))
diff --git a/...csearch-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_1kg_ht.py b/...csearch-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_1kg_ht.py
@@ -0,0 +1,71 @@
+import logging
+
+import hail as hl
+
+from hail_scripts.utils.hail_utils import import_vcf
+
+logger = logging.getLogger('v02.hail_scripts.create_1kg_ht')
+
+CONFIG= {
+    "37": "gs://seqr-reference-data/GRCh37/1kg/1kg.wgs.phase3.20130502.GRCh37_sites.vcf.gz",
+    "38": "gs://seqr-reference-data/GRCh38/1kg/1kg.wgs.phase3.20170504.GRCh38_sites.vcf.gz"
+}
+
+def vcf_to_mt(path, genome_version):
+    '''
+    Converts 1kg vcf to mt. The 1kg dataset has multi-allelic variants and duplicates.
+    This function independently filters the mutli-allelics to split, then unions with
+    the bi-allelics.
+
+    :param path: vcf path
+    :param genome_version: genome version
+    :return:
+    '''
+    # Import but do not split multis here.
+    mt = import_vcf(path,
+                    genome_version=genome_version,
+                    min_partitions=1000,
+                    split_multi_alleles=False)
+
+    multiallelic_mt = mt.filter_rows(hl.len(mt.alleles) > 2)
+    multiallelic_mt = hl.split_multi_hts(multiallelic_mt)
+
+    # We annotate some rows manually to conform to the multiallelic_mt (after split).
+    # Calling split_multi_hts on biallelic to annotate the rows causes problems.
+    biallelic_mt = mt.filter_rows(hl.len(mt.alleles) == 2)
+    biallelic_mt = biallelic_mt.annotate_rows(a_index=1, was_split=False)
+
+    all_mt = biallelic_mt.union_rows(multiallelic_mt)
+    all_mt = all_mt.key_rows_by(all_mt.locus, all_mt.alleles)
+
+    # 37 is known to have some unneeded symbolic alleles, so we filter out.
+    all_mt = all_mt.filter_rows(
+        hl.allele_type(all_mt.alleles[0], all_mt.alleles[1]) == 'Symbolic',
+        keep=False
+    )
+
+    return all_mt
+
+def annotate_mt(mt):
+    # Annotate POPMAX_AF, which is max of respective fields using a_index for multi-allelics.
+    return mt.annotate_rows(POPMAX_AF=hl.max(mt.info.AFR_AF[mt.a_index-1],
+                                             mt.info.AMR_AF[mt.a_index - 1],
+                                             mt.info.EAS_AF[mt.a_index - 1],
+                                             mt.info.EUR_AF[mt.a_index - 1],
+                                             mt.info.SAS_AF[mt.a_index - 1]))
+
+def run():
+   for genome_version, path in CONFIG.items():
+       logger.info('reading from input path: %s' % path)
+
+       mt = vcf_to_mt(path, genome_version)
+       mt = annotate_mt(mt)
+
+       mt.describe()
+
+       output_path = path.replace(".vcf", "").replace(".gz", "").replace(".bgz", "")\
+                         .replace(".*", "").replace("*", "") + ".ht"
+       logger.info('writing to output path: %s' % output_path)
+       mt.rows().write(output_path)
+
+run()
diff --git a/...search-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py b/...search-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+# combine the pre-computed CADD .tsvs from https://cadd.gs.washington.edu/download into 1 Table for each genome build
+
+import logging
+logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s')
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+
+import hail as hl
+from hail_scripts.utils.hail_utils import write_ht, import_table
+
+hl.init()
+
+
+def import_cadd_table(path: str, genome_version: str) -> hl.Table:
+    if genome_version not in ("37", "38"):
+        raise ValueError(f"Invalid genome version: {genome_version}")
+
+    column_names = {'f0': 'chrom', 'f1': 'pos', 'f2': 'ref', 'f3': 'alt', 'f4': 'RawScore', 'f5': 'PHRED'}
+    types = {'f0': hl.tstr, 'f1': hl.tint, 'f4': hl.tfloat32, 'f5': hl.tfloat32}
+
+    cadd_ht = hl.import_table(path, force_bgz=True, comment="#", no_header=True, types=types, min_partitions=10000)
+    cadd_ht = cadd_ht.rename(column_names)
+    chrom = hl.format("chr%s", cadd_ht.chrom) if genome_version == "38" else cadd_ht.chrom
+    locus = hl.locus(chrom, cadd_ht.pos, reference_genome=hl.get_reference(f"GRCh{genome_version}"))
+    alleles = hl.array([cadd_ht.ref, cadd_ht.alt])
+    cadd_ht = cadd_ht.transmute(locus=locus, alleles=alleles)
+
+    cadd_union_ht = cadd_ht.head(0)
+    for contigs in (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"]):
+        contigs = ["chr%s" % contig for contig in contigs] if genome_version == "38" else contigs
+        cadd_ht_subset = cadd_ht.filter(hl.array(list(map(str, contigs))).contains(cadd_ht.locus.contig))
+        cadd_union_ht = cadd_union_ht.union(cadd_ht_subset)
+
+    cadd_union_ht = cadd_union_ht.key_by("locus", "alleles")
+
+    cadd_union_ht.describe()
+
+    return cadd_union_ht
+
+for genome_version in ["37", "38"]:
+    snvs_ht = import_cadd_table(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/CADD_snvs.v1.6.tsv.gz", genome_version)
+    indel_ht = import_cadd_table(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/InDels_v1.6.tsv.gz", genome_version)
+
+    ht = snvs_ht.union(indel_ht)
+
+    ht.naive_coalesce(10000).write(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/CADD_snvs_and_indels.v1.6.ht", overwrite=True)
diff --git a/...rch-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py b/...rch-pipelines/download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py
@@ -0,0 +1,17 @@
+import hail as hl
+from hail_scripts.utils.clinvar import download_and_import_latest_clinvar_vcf, CLINVAR_HT_PATH, CLINVAR_GOLD_STARS_LOOKUP
+from hail_scripts.utils.hail_utils import write_ht
+
+
+for genome_version in ["37", "38"]:
+
+    mt = download_and_import_latest_clinvar_vcf(genome_version)
+
+    timestamp = hl.eval(mt.version)
+
+    ht = mt.rows()
+    ht = ht.annotate(gold_stars=CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT)))
+
+    ht.describe()
+
+    write_ht(ht, CLINVAR_HT_PATH.format(genome_version=genome_version).replace(".ht", ".") + timestamp + ".ht")