Update to VEP 104 (#13)

googlegenomics · Jun 4, 2021 · 2d324c0 · 2d324c0
1 parent 935a7d3
commit 2d324c0
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 17 deletions.
diff --git a/batch/vep/Dockerfile b/batch/vep/Dockerfile
@@ -17,7 +17,7 @@
 #
 # Example:
 #
-# docker build . --build-arg ENSEMBL_RELEASE=101 --tag vep_101
+# docker build . --build-arg ENSEMBL_RELEASE=104 --tag vep:104
 #
 # To run vep through containers created by this file, the VEP cache has to be
 # downloaded separately and made available through command line arguments.
@@ -27,7 +27,7 @@
 # retry logic.
 FROM gcr.io/cloud-genomics-pipelines/io
 
-ARG ENSEMBL_RELEASE=101
+ARG ENSEMBL_RELEASE=104
 ARG VEP_BASE=/opt/variant_effect_predictor
 
 RUN apt-get -y update && apt-get install -y procps\

diff --git a/batch/vep/README.md b/batch/vep/README.md
@@ -27,17 +27,17 @@ Inside this directory, run:
 
 This will download the source from
 [VEP GitHub repo](https://github.com/Ensembl/ensembl-vep) and build VEP from
-that source. By default, it uses version 101 of VEP. This can be changed by
+that source. By default, it uses version 104 of VEP. This can be changed with
 the `ENSEMBL_RELEASE` build argument, e.g.,
 
-`docker build . -t [IMAGE_TAG] --build-arg ENSEMBL_RELEASE=90`
+`docker build . -t [IMAGE_TAG] --build-arg ENSEMBL_RELEASE=104`
 
 Let's say we want to push this image to the
 [Container Registry](https://cloud.google.com/container-registry/) of
 `my-project` on Google Cloud, so we can pick `[IMAGE_TAG]` as
-`gcr.io/my-project/vep:101`. Then push this image by:
+`gcr.io/my-project/vep:104`. Then push this image by:
 
-`gcloud docker -- push gcr.io/my-project/vep:101`
+`gcloud docker -- push gcr.io/my-project/vep:104`
 
 **TODO**: Add `cloudbuild.yaml` files for both easy push and integration test.
 
@@ -48,7 +48,7 @@ download and integrate different pieces of the VEP database or cache files.
 Then from within that directory run the
 [`build_vep_cache.sh`](build_vep_cache.sh) script. By default this script
 creates the database for human (homo_sapiens), reference sequence `GRCh38`,
-and release 101 of VEP. These values can be overwritten by the following
+and release 104 of VEP. These values can be overwritten by the following
 environment variables (note you should use the same VEP release
 that you used for creating VEP docker image above):
 
@@ -74,7 +74,7 @@ gcloud alpha genomics pipelines run \
   --inputs VCF_INFO_FILED=CSQ_RERUN
 ```
 
-Note the `vep_cache_homo_sapiens_GRCh38_101.tar.gz` file that is referenced in
+Note that the `vep_cache_homo_sapiens_GRCh38_104.tar.gz` file referenced in
 the sample `yaml` file is the output file that you get from the above database
 creation step.
 

diff --git a/batch/vep/build_vep_cache.sh b/batch/vep/build_vep_cache.sh
@@ -26,16 +26,16 @@
 # Uppercase variables refer to environment variables that can be set from
 # outside. Internal variables are lowercase. All environment variables
 # have a default value as well, to set up the cache for homo_sapiens with reference
-# GRCh38 and release 101 of VEP.
+# GRCh38 and release 104 of VEP.
 #
 # More details on cache files can be found here:
 # https://ensembl.org/info/docs/tools/vep/script/vep_cache.html
 
 set -euo pipefail
 
-readonly release="${ENSEMBL_RELEASE:-101}"
-readonly species="${VEP_SPECIES:-homo_sapiens}"
-readonly assembly="${GENOME_ASSEMBLY:-GRCh38}"
+readonly release="${ENSEMBL_RELEASE:-104}"
+readonly species="${VEP_SPECIES:-homo_sapiens}" # or "${VEP_SPECIES:-mus_musculus}"
+readonly assembly="${GENOME_ASSEMBLY:-GRCh38}" # or "${GENOME_ASSEMBLY:-GRCh37}" for homo_sapiens or "${GENOME_ASSEMBLY:-GRCm38}" for mus_musculus
 readonly work_dir="vep_cache"
 
 mkdir -p "${work_dir}"
@@ -63,7 +63,7 @@ if [[ $species == "homo_sapiens" ]] && [[ $assembly == "GRCh37" ]]; then
     exit 1
   fi
   readonly ftp_GRCh37="ftp://ftp.ensembl.org/pub/grch37/release-${release}"
-  readonly remote_fasta="${ftp_GRCh37}/fasta/homo_sapiens/dna/${fasta_file}"
+  readonly remote_fasta="${ftp_GRCh37}/fasta/${species}/dna/${fasta_file}"
   echo "Downloading ${remote_fasta}"
   curl -O "${remote_fasta}"
   echo "Decompressing fasta file..."
@@ -74,7 +74,7 @@ if [[ $species == "homo_sapiens" ]] && [[ $assembly == "GRCh37" ]]; then
   echo "Creating .fai index..."
   samtools faidx "${fasta_file}"
 else
-  readonly remote_fasta="${ftp_base}/fasta/homo_sapiens/dna_index/${fasta_file}"
+  readonly remote_fasta="${ftp_base}/fasta/${species}/dna_index/${fasta_file}"
   echo "Downloading ${remote_fasta} and its index files ..."
   curl -O "${remote_fasta}"
   curl -O "${remote_fasta}.fai"

diff --git a/batch/vep/sample_pipeline.yaml b/batch/vep/sample_pipeline.yaml
@@ -9,10 +9,10 @@ resources:
   minimumCpuCores: 12
 inputParameters:
   - name: VEP_CACHE
-    defaultValue: gs://my_bucket/vep_cache_homo_sapiens_GRCh38_91.tar.gz
+    defaultValue: gs://my_bucket/vep_cache_homo_sapiens_GRCh38_104.tar.gz
     localCopy:
       disk: datadisk
-      path: vep_cache_91.tar.gz
+      path: vep_cache_104.tar.gz
   - name: INPUT_FILE
     defaultValue: gs://my_bucket/input.vcf
     localCopy:
@@ -29,5 +29,5 @@ outputParameters:
       disk: datadisk
       path: output.vcf
 docker:
-  imageName: gcr.io/my-project/vep_91
+  imageName: gcr.io/my-project/vep:104
   cmd: /opt/variant_effect_predictor/run_vep.sh ${INPUT_FILE} ${OUTPUT_FILE}