From 43799760919ce561a48c6b77a0aa9cb967f8b5d0 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Fri, 2 Feb 2024 13:51:26 -0500 Subject: [PATCH 01/34] add ensembl genome links for rnasamba model building --- inputs/models/rnasamba/build/train_data_links.tsv | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 inputs/models/rnasamba/build/train_data_links.tsv diff --git a/inputs/models/rnasamba/build/train_data_links.tsv b/inputs/models/rnasamba/build/train_data_links.tsv new file mode 100644 index 0000000..65ec5f4 --- /dev/null +++ b/inputs/models/rnasamba/build/train_data_links.tsv @@ -0,0 +1,9 @@ +organism root filename cdna_suffix ncrna_suffix +human https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/ Homo_sapiens.GRCh38 cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz +yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/ Saccharomyces_cerevisiae.R64-1-1 cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz +worm https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/ Caenorhabditis_elegans.WBcel235 cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz +arabadopsis https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/arabidopsis_thaliana/ Arabidopsis_thaliana.TAIR10 cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz +drosophila https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/ Drosophila_melanogaster.BDGP6.46 cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz +dictyostelium_discoideum https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/ Dictyostelium_discoideum.dicty_2.7 cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz +mouse https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/ Mus_musculus.GRCm39 cdna/Mus_musculus.GRCm39.cdna.all.fa.gz ncrna/Mus_musculus.GRCm39.ncrna.fa.gz +zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz \ No newline at end of file From 5c0cde39e5cc2c2758544b3cb40422578f6541a7 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Mon, 5 Feb 2024 14:32:11 -0500 Subject: [PATCH 02/34] check in cluster all and dumb split code --- build_rnasamba_euk_model.snakefile | 197 +++++++++++++++++++++++++++++ envs/mmseqs2.yml | 6 + envs/seqkit.yml | 6 + envs/seqtk.yml | 6 + 4 files changed, 215 insertions(+) create mode 100644 build_rnasamba_euk_model.snakefile create mode 100644 envs/mmseqs2.yml create mode 100644 envs/seqkit.yml create mode 100644 envs/seqtk.yml diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile new file mode 100644 index 0000000..9892bbe --- /dev/null +++ b/build_rnasamba_euk_model.snakefile @@ -0,0 +1,197 @@ +import pandas as pd +import os +import re + +metadata = pd.read_csv("inputs/models/rnasamba/build/train_data_links.tsv", sep = "\t") +GENOMES = metadata['genome'].unique().tolist() +RNA_TYPES = ['cdna', 'ncrna'] # inherits names from ensembl +VALIDATION_TYPES = ['mRNAs', 'ncRNAs'] # inherits names from https://github.com/cbl-nabi/RNAChallenge + +rule all: + input: + "outputs/models/rnasamba/build/stats.tsv", + 
"outputs/models/rnasamba/build/2_sequence_sets/protein_coding_traintest.fa", + "outputs/models/rnasamba/build/2_sequence_sets/non_coding_traintest.fa", + expand("outputs/models/rnasamba/build/2_sequence_sets/validation/{validation_type}.fa", validation_type = VALIDATION_TYPES) + +rule download_ensembl_data: + """ + Download ensembl cDNA and ncRNA files. + Ensembl annotates protein coding and non-coding RNA transcripts in their files. + This information will be used to separate protein coding from non-coding RNAs to build an RNAsamba model. + Note this download renames genome files from their names on ensembl to make them simpler to point to. + See example transformations below: + - cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz -> cdna/Homo_sapiens.GRCh38.cdna.fa.gz (dropped "all.") + - ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz -> ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz (no change) + """ + output: "inputs/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz" + run: + genome_df = metadata.loc[(metadata['genome'] == wildcards.genome)] + root_url = genome_df['root_url'].values[0] + if wildcards.rna_type == "cdna": + suffix = genome_df['cdna_suffix'].values[0] + else: + suffix = genome_df['ncrna_suffix'].values[0] + + url = root_url + suffix + shell("curl -JLo {output} {url}") + + +rule extract_protein_coding_orfs_from_cdna: + """ + Ensembl cDNA files consist of transcript sequences for actual and possible genes, including pseudogenes, NMD and the like. + Transcripts in the cDNA files have headers like: >TRANSCRIPT_ID SEQTYPE LOCATION GENE_ID GENE_BIOTYPE TRANSCRIPT_BIOTYPE, where the gene_biotype and transcript_biotype both contain information about whether the gene is coding or not. + """ + input: "inputs/ensembl/cdna/{genome}.cdna.fa.gz" + output: "outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz" + conda: "envs/seqkit.yml" + shell:''' + seqkit grep --use-regexp --by-name --pattern "transcript_biotype:protein_coding" -o {output} {input} + ''' + +rule download_validation_data: + output: "inputs/validation/rnachallenge/{validation_type}.fa", + shell:''' + curl -JLo {output} https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa + ''' + +rule combine_sequences: + input: + coding = expand("outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz", genome = GENOMES), + noncoding = expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome = GENOMES), + validation = expand("inputs/validation/rnachallenge/{validation_type}.fa", validation_type = VALIDATION_TYPES) + output: "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz" + shell:''' + cat {input} > {output} + ''' + +rule reduce_sequence_homology: + """ + To reduce pollution between training and testing set, cluster sequences at 80% sequence identity. 
+ """ + input: "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz" + output: + "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", + "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv" + params: prefix = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences" + conda: "envs/mmseqs2.yml" + shell:''' + mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 + ''' + +rule grab_representative_sequence_names: + input: "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", + output: "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" + shell:''' + awk 'sub(/^>/, "")' {input} > {output} + ''' + +rule filter_validation: + input: + fa = "inputs/validation/rnachallenge/{validation_type}.fa", + names = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" + output: "outputs/models/rnasamba/build/2_sequence_sets/validation/{validation_type}.fa" + conda: "envs/seqtk.yml" + shell:''' + seqtk subseq {input.fa} {input.names} > {output} + ''' + +rule filter_traintest_coding: + input: + protein_coding = expand("outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz", genome = GENOMES), + names = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" + output: + protein_coding = temp("outputs/models/rnasamba/build/2_sequence_sets/all_protein_coding.fa.gz"), + protein_coding_filt = "outputs/models/rnasamba/build/2_sequence_sets/protein_coding_traintest.fa" + conda: "envs/seqtk.yml" + shell:''' + cat {input.protein_coding} > {output.protein_coding} + seqtk subseq {output.protein_coding} {input.names} > {output.protein_coding_filt} + ''' + + +rule filter_traintest_noncoding: + input: + non_coding = expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome = GENOMES), + names = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" + output: + non_coding = temp("outputs/models/rnasamba/build/2_sequence_sets/all_non_coding.fa.gz"), + non_coding_filt = "outputs/models/rnasamba/build/2_sequence_sets/non_coding_traintest.fa" + conda: "envs/seqtk.yml" + shell:''' + cat {input.non_coding} > {output.non_coding} + seqtk subseq {output.non_coding} {input.names} > {output.non_coding_filt} + ''' + +################################################################## +## Get sequence statistics +################################################################## + +rule get_sequence_statistics: + input: "inputs/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz" + output: "outputs/models/rnasamba/build/stats/full/{genome}.{rna_type}.tsv" + conda: "envs/seqkit.yml" + shell:''' + seqkit stats --all -o {output} -T {input} + ''' + +rule get_sequence_statistics_less_than_300_nt_ncrna: + input: "inputs/ensembl/ncrna/{genome}.ncrna.fa.gz" + output: "outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.ncrna.tsv" + conda: "envs/seqkit.yml" + shell:''' + seqkit seq --max-len 300 {input} | seqkit stats --all -T -o {output} + ''' +rule get_sequence_statistics_less_than_300nt_protein_coding: + input: "outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz" + output: "outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.protein_coding.tsv" + conda: "envs/seqkit.yml" + shell:''' + seqkit seq --max-len 300 {input} | seqkit stats --all -T -o {output} + ''' + +rule 
get_sequence_statistics_protein_coding: + input: "outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz" + output: "outputs/models/rnasamba/build/stats/protein_coding/{genome}.protein_coding.tsv" + conda: "envs/seqkit.yml" + shell:''' + seqkit stats --all -T -o {output} {input} + ''' + +def read_tsv_files(file_paths): + all_data = [] + + for file_path in file_paths: + file_name = re.sub("outputs/models/rnasamba/build/stats/", "", file_path) + file_type = re.sub("/.*", "", file_name) + if "cdna" in file_name: + rna_type = "cnda" + elif "ncrna" in file_name: + rna_type = "ncrna" + else: + rna_type = "protein_coding" + rm_from_genome_str1 = "." + rna_type + ".tsv" + genome = re.sub(rm_from_genome_str1, "", file_name) + rm_from_genome_str2 = file_type + "/" + genome = re.sub(rm_from_genome_str2, "", genome) + df = pd.read_csv(file_path, sep='\t', header=0) + df['file_type'] = file_type + df['rna_type'] = rna_type + df['genome'] = genome + all_data.append(df) + + # Concatenate all dataframes + combined_df = pd.concat(all_data, ignore_index=True) + return combined_df + + +rule combine_stats_files: + input: + expand("outputs/models/rnasamba/build/stats/full/{genome}.{rna_type}.tsv", rna_type = RNA_TYPES, genome = GENOMES), + expand("outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.ncrna.tsv", genome = GENOMES), + expand("outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.protein_coding.tsv", genome = GENOMES), + expand("outputs/models/rnasamba/build/stats/protein_coding/{genome}.protein_coding.tsv", genome = GENOMES) + output: "outputs/models/rnasamba/build/stats.tsv" + run: + combined_df = read_tsv_files(input) + combined_df.to_csv(str(output), sep = "\t") diff --git a/envs/mmseqs2.yml b/envs/mmseqs2.yml new file mode 100644 index 0000000..339d8b2 --- /dev/null +++ b/envs/mmseqs2.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - mmseqs2=15.6f452 diff --git a/envs/seqkit.yml b/envs/seqkit.yml new file mode 100644 index 0000000..c9e3ad5 --- /dev/null +++ b/envs/seqkit.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - seqkit=2.6.1 diff --git a/envs/seqtk.yml b/envs/seqtk.yml new file mode 100644 index 0000000..ecb2f15 --- /dev/null +++ b/envs/seqtk.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - seqtk=1.4 From 28477ff20a78cfcd6f2d19e294614d0fb8ffc0ec Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 6 Feb 2024 09:09:15 -0500 Subject: [PATCH 03/34] prelim processing into sets finished (still need to refactor and add stats) --- build_rnasamba_euk_model.snakefile | 116 +++++++++--------- envs/tidyverse.yml | 6 + ...ocess_sequences_into_nonoverlapping_sets.R | 86 +++++++++++++ 3 files changed, 150 insertions(+), 58 deletions(-) create mode 100644 envs/tidyverse.yml create mode 100644 scripts/process_sequences_into_nonoverlapping_sets.R diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index 9892bbe..c7450b3 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -6,13 +6,12 @@ metadata = pd.read_csv("inputs/models/rnasamba/build/train_data_links.tsv", sep GENOMES = metadata['genome'].unique().tolist() RNA_TYPES = ['cdna', 'ncrna'] # inherits names from ensembl VALIDATION_TYPES = ['mRNAs', 'ncRNAs'] # inherits names from https://github.com/cbl-nabi/RNAChallenge +SET_TYPES = ['coding', 'noncoding'] +SET_NAMES = ['train', 'test', 'validation'] rule all: input: - 
"outputs/models/rnasamba/build/stats.tsv", - "outputs/models/rnasamba/build/2_sequence_sets/protein_coding_traintest.fa", - "outputs/models/rnasamba/build/2_sequence_sets/non_coding_traintest.fa", - expand("outputs/models/rnasamba/build/2_sequence_sets/validation/{validation_type}.fa", validation_type = VALIDATION_TYPES) + expand("outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa", set_type = SET_TYPES, set_name = SET_NAMES) rule download_ensembl_data: """ @@ -43,23 +42,23 @@ rule extract_protein_coding_orfs_from_cdna: Transcripts in the cDNA files have headers like: >TRANSCRIPT_ID SEQTYPE LOCATION GENE_ID GENE_BIOTYPE TRANSCRIPT_BIOTYPE, where the gene_biotype and transcript_biotype both contain information about whether the gene is coding or not. """ input: "inputs/ensembl/cdna/{genome}.cdna.fa.gz" - output: "outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz" + output: "outputs/models/rnasamba/build/0_coding/{genome}.fa.gz" conda: "envs/seqkit.yml" shell:''' seqkit grep --use-regexp --by-name --pattern "transcript_biotype:protein_coding" -o {output} {input} ''' rule download_validation_data: - output: "inputs/validation/rnachallenge/{validation_type}.fa", + output: "inputs/validation/rnachallenge/{validation_type}.fa.gz", shell:''' - curl -JLo {output} https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa + curl -JL https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa | gzip > {output} ''' rule combine_sequences: input: - coding = expand("outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz", genome = GENOMES), + coding = expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome = GENOMES), noncoding = expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome = GENOMES), - validation = expand("inputs/validation/rnachallenge/{validation_type}.fa", validation_type = VALIDATION_TYPES) + validation = expand("inputs/validation/rnachallenge/{validation_type}.fa.gz", validation_type = VALIDATION_TYPES) output: "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz" shell:''' cat {input} > {output} @@ -79,48 +78,64 @@ rule reduce_sequence_homology: mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 ''' -rule grab_representative_sequence_names: - input: "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", - output: "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" +rule grab_validation_set_names_and_lengths: + input: "inputs/validation/rnachallenge/{validation_type}.fa.gz", + output: + validation = "inputs/validation/rnachallenge/{validation_type}.fa", + validation_fai = "inputs/validation/rnachallenge/{validation_type}.fa.fai", + conda: "envs/seqkit.yml" shell:''' - awk 'sub(/^>/, "")' {input} > {output} + cat {input} | gunzip > {output.validation} + seqkit faidx {output.validation} ''' -rule filter_validation: - input: - fa = "inputs/validation/rnachallenge/{validation_type}.fa", - names = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" - output: "outputs/models/rnasamba/build/2_sequence_sets/validation/{validation_type}.fa" - conda: "envs/seqtk.yml" +# TER TODO: consider changing the ncrna input path to remove some duplication in the rules +rule grab_traintest_coding_names_and_lengths: + input: 
expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome = GENOMES), + output: + coding = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa", + coding_fai = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa.fai" + conda: "envs/seqkit.yml" shell:''' - seqtk subseq {input.fa} {input.names} > {output} + cat {input} | gunzip > {output.coding} + seqkit faidx {output.coding} ''' -rule filter_traintest_coding: - input: - protein_coding = expand("outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz", genome = GENOMES), - names = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" +rule grab_traintest_noncoding_names_and_lengths: + input: expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome = GENOMES), output: - protein_coding = temp("outputs/models/rnasamba/build/2_sequence_sets/all_protein_coding.fa.gz"), - protein_coding_filt = "outputs/models/rnasamba/build/2_sequence_sets/protein_coding_traintest.fa" - conda: "envs/seqtk.yml" + noncoding = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa", + noncoding_fai = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa.fai" + conda: "envs/seqkit.yml" shell:''' - cat {input.protein_coding} > {output.protein_coding} - seqtk subseq {output.protein_coding} {input.names} > {output.protein_coding_filt} + cat {input} | gunzip > {output.noncoding} + seqkit faidx {output.noncoding} ''' - -rule filter_traintest_noncoding: - input: - non_coding = expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome = GENOMES), - names = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq_names.txt" +rule process_sequences_into_nonoverlapping_sets: + input: + traintest_fai = expand("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_{set_type}.fa.fai", set_type = SET_TYPES), + validation_fai = expand("inputs/validation/rnachallenge/{validation_type}.fa.fai", validation_type = VALIDATION_TYPES), + clusters = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv" output: - non_coding = temp("outputs/models/rnasamba/build/2_sequence_sets/all_non_coding.fa.gz"), - non_coding_filt = "outputs/models/rnasamba/build/2_sequence_sets/non_coding_traintest.fa" + # TER TODO: the set_name/set_types could be born here, but I need to check the order in which they would be built to make sure files aren't named the wrong thing. actually i could do this by number of lines in the final file. 
Also need to update the R script to point to the right thing + coding_train = "outputs/models/rnasamba/build/2_sequence_sets/coding_train.txt", + coding_test = "outputs/models/rnasamba/build/2_sequence_sets/coding_test.txt", + noncoding_train = "outputs/models/rnasamba/build/2_sequence_sets/noncoding_train.txt", + noncoding_test = "outputs/models/rnasamba/build/2_sequence_sets/noncoding_test.txt", + coding_validation = "outputs/models/rnasamba/build/2_sequence_sets/coding_validation.txt", + noncoding_validation = "outputs/models/rnasamba/build/2_sequence_sets/noncoding_validation.txt" + conda: "envs/tidyverse.yml" + script: "scripts/process_sequences_into_nonoverlapping_sets.R" + +rule filter_sequence_sets: + input: + fa = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", + names = "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.txt" + output: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa" conda: "envs/seqtk.yml" shell:''' - cat {input.non_coding} > {output.non_coding} - seqtk subseq {output.non_coding} {input.names} > {output.non_coding_filt} + seqtk subseq {input.fa} {input.names} > {output} ''' ################################################################## @@ -128,35 +143,20 @@ rule filter_traintest_noncoding: ################################################################## rule get_sequence_statistics: - input: "inputs/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz" - output: "outputs/models/rnasamba/build/stats/full/{genome}.{rna_type}.tsv" + input: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa" + output: "outputs/models/rnasamba/build/stats/full/{set_type}_{set_name}.tsv" conda: "envs/seqkit.yml" shell:''' seqkit stats --all -o {output} -T {input} ''' -rule get_sequence_statistics_less_than_300_nt_ncrna: - input: "inputs/ensembl/ncrna/{genome}.ncrna.fa.gz" - output: "outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.ncrna.tsv" +rule get_sequence_statistics_less_than_300_nt: + input: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa" + output: "outputs/models/rnasamba/build/stats/300nt_or_less/{set_type}_{set_name}.tsv" conda: "envs/seqkit.yml" shell:''' seqkit seq --max-len 300 {input} | seqkit stats --all -T -o {output} ''' -rule get_sequence_statistics_less_than_300nt_protein_coding: - input: "outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz" - output: "outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.protein_coding.tsv" - conda: "envs/seqkit.yml" - shell:''' - seqkit seq --max-len 300 {input} | seqkit stats --all -T -o {output} - ''' - -rule get_sequence_statistics_protein_coding: - input: "outputs/models/rnasamba/build/0_protein_coding/{genome}.fa.gz" - output: "outputs/models/rnasamba/build/stats/protein_coding/{genome}.protein_coding.tsv" - conda: "envs/seqkit.yml" - shell:''' - seqkit stats --all -T -o {output} {input} - ''' def read_tsv_files(file_paths): all_data = [] diff --git a/envs/tidyverse.yml b/envs/tidyverse.yml new file mode 100644 index 0000000..ee0ca28 --- /dev/null +++ b/envs/tidyverse.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-tidyverse=2.0.0 diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R new file mode 100644 index 0000000..7926ee3 --- /dev/null +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -0,0 +1,86 @@ +library(readr) +library(dplyr) +# 
library(sourmashconsumr)
+
+# functions ---------------------------------------------------------------
+
+split_train_and_test_data <- function(df, fraction = 0.8, seed = 1){
+  # sample without replacement to select fraction of the data set as a training data set
+  set.seed(seed) # set seed so that the sample command gives the same results each time it is run
+  df_annotation <- sort(sample(nrow(df), nrow(df)* fraction))
+  train <- df[df_annotation, ]
+  test <- df[-df_annotation, ]
+  return(list(train_df = train, test_df = test))
+}
+
+# read in data sets ------------------------------------------------------
+
+fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth")
+clusters <- read_tsv(snakemake@input[['clusters']], col_names = c("rep", "cluster_member"))
+coding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[1], col_names = fai_col_names)
+noncoding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[2], col_names = fai_col_names)
+coding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[1], col_names = fai_col_names)
+noncoding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[2], col_names = fai_col_names)
+
+# check overlap between sets ----------------------------------------------
+
+# upset_df <- from_list_to_upset_df(list(noncoding_validation = noncoding_validation$sequence,
+#                                        coding_validation = coding_validation$sequence,
+#                                        noncoding_traintest = noncoding_traintest$sequence,
+#                                        coding_traintest = coding_traintest$sequence,
+#                                        cluster_rep = unique(clusters$rep)))
+
+# filter to non-overlapping sets ------------------------------------------
+
+noncoding_validation_filtered <- noncoding_validation %>%
+  filter(sequence %in% clusters$rep) %>% # keep sequences that had =<80% homology to other sequences
+  filter(!sequence %in% noncoding_traintest$sequence) %>% # remove sequences that are in the noncoding train/test set
+  filter(!sequence %in% coding_traintest$sequence) # remove sequences that ensembl now annotates as coding from validation
+
+coding_validation_filtered <- coding_validation %>%
+  filter(sequence %in% clusters$rep) %>% # keep sequences that had =<80% homology to other sequences
+  filter(!sequence %in% coding_traintest$sequence) %>% # remove sequences that are in the coding train/test set
+  filter(!sequence %in% noncoding_traintest$sequence) # remove sequences that are in the noncoding train/test set
+
+noncoding_traintest_filtered <- noncoding_traintest %>%
+  filter(sequence %in% clusters$rep) # keep sequences that had =<80% homology to other sequences
+
+coding_traintest_filtered <- coding_traintest %>%
+  filter(sequence %in% clusters$rep) # keep sequences that had =<80% homology to other sequences
+
+# confirm no unexpected overlap between data sets -----------------------
+# upset_df_filtered <- from_list_to_upset_df(list(noncoding_validation = noncoding_validation_filtered$sequence,
+#                                                 coding_validation = coding_validation_filtered$sequence,
+#                                                 noncoding_traintest = noncoding_traintest_filtered$sequence,
+#                                                 coding_traintest = coding_traintest_filtered$sequence,
+#                                                 cluster_rep = unique(clusters$rep)))
+
+# subsample the coding traintest set to balance sizes ------------------
+
+# to avoid biases in classification, it's better to have the same number of sequences in the different classes
+
+# select out all sORFs so these won't be filtered out since they are tricky cases
+coding_traintest_filtered_sorfs <- coding_traintest_filtered %>%
+  filter(length <= 300)
+
+# sample out
larger coding sequences +coding_traintest_filtered_other <- coding_traintest_filtered %>% + filter(length > 300) %>% + sample_n(size = nrow(noncoding_traintest_filtered) - nrow(coding_traintest_filtered_sorfs)) + +# combine to the final set of coding sequences that we'll use for training & testing +coding_traintest_keep <- bind_rows(coding_traintest_filtered_other, coding_traintest_filtered_sorfs) + +# split training and testing sets ----------------------------------------- + +coding_traintest_split <- split_train_and_test_data(coding_traintest_keep) +noncoding_traintest_split <- split_train_and_test_data(noncoding_traintest_filtered) + +# write out contigs names for each set ------------------------------------ + +write.table(coding_traintest_split$train_df$sequence, snakemake@output[['coding_train']], quote = F, col.names = F, row.names = F) +write.table(coding_traintest_split$test_df$sequence, snakemake@output[['coding_test']], quote = F, col.names = F, row.names = F) +write.table(noncoding_traintest_split$train_df$sequence, snakemake@output[['noncoding_train']], quote = F, col.names = F, row.names = F) +write.table(noncoding_traintest_split$test_df$sequence, snakemake@output[['noncoding_test']], quote = F, col.names = F, row.names = F) +write.table(coding_validation_filtered$sequence, snakemake@output[['coding_validation']], quote = F, col.names = F, row.names = F) +write.table(noncoding_validation_filtered$sequence, snakemake@output[['noncoding_validation']], quote = F, col.names = F, row.names = F) From bf20a00ea0269297d349739c8daad6c5ddfb7e20 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 6 Feb 2024 10:11:36 -0500 Subject: [PATCH 04/34] add summary stats --- build_rnasamba_euk_model.snakefile | 62 ++++--------------- .../rnasamba/build/train_data_links.tsv | 18 +++--- scripts/calculate_sequence_statistics.R | 38 ++++++++++++ 3 files changed, 60 insertions(+), 58 deletions(-) create mode 100644 scripts/calculate_sequence_statistics.R diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index c7450b3..eb82b8f 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -11,7 +11,8 @@ SET_NAMES = ['train', 'test', 'validation'] rule all: input: - expand("outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa", set_type = SET_TYPES, set_name = SET_NAMES) + expand("outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa", set_type = SET_TYPES, set_name = SET_NAMES), + "outputs/models/rnasamba/build/6_stats/set_summary.tsv" rule download_ensembl_data: """ @@ -142,56 +143,19 @@ rule filter_sequence_sets: ## Get sequence statistics ################################################################## -rule get_sequence_statistics: +rule get_sequence_descriptors: input: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa" - output: "outputs/models/rnasamba/build/stats/full/{set_type}_{set_name}.tsv" + output: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa.seqkit.fai" conda: "envs/seqkit.yml" shell:''' - seqkit stats --all -o {output} -T {input} + seqkit faidx -f {input} ''' -rule get_sequence_statistics_less_than_300_nt: - input: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa" - output: "outputs/models/rnasamba/build/stats/300nt_or_less/{set_type}_{set_name}.tsv" - conda: "envs/seqkit.yml" - shell:''' - seqkit seq --max-len 300 {input} | seqkit stats --all -T -o {output} - ''' - -def read_tsv_files(file_paths): - all_data = 
[] - - for file_path in file_paths: - file_name = re.sub("outputs/models/rnasamba/build/stats/", "", file_path) - file_type = re.sub("/.*", "", file_name) - if "cdna" in file_name: - rna_type = "cnda" - elif "ncrna" in file_name: - rna_type = "ncrna" - else: - rna_type = "protein_coding" - rm_from_genome_str1 = "." + rna_type + ".tsv" - genome = re.sub(rm_from_genome_str1, "", file_name) - rm_from_genome_str2 = file_type + "/" - genome = re.sub(rm_from_genome_str2, "", genome) - df = pd.read_csv(file_path, sep='\t', header=0) - df['file_type'] = file_type - df['rna_type'] = rna_type - df['genome'] = genome - all_data.append(df) - - # Concatenate all dataframes - combined_df = pd.concat(all_data, ignore_index=True) - return combined_df - - -rule combine_stats_files: - input: - expand("outputs/models/rnasamba/build/stats/full/{genome}.{rna_type}.tsv", rna_type = RNA_TYPES, genome = GENOMES), - expand("outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.ncrna.tsv", genome = GENOMES), - expand("outputs/models/rnasamba/build/stats/300nt_or_less/{genome}.protein_coding.tsv", genome = GENOMES), - expand("outputs/models/rnasamba/build/stats/protein_coding/{genome}.protein_coding.tsv", genome = GENOMES) - output: "outputs/models/rnasamba/build/stats.tsv" - run: - combined_df = read_tsv_files(input) - combined_df.to_csv(str(output), sep = "\t") +rule calculate_sequence_statistics: + input: expand("outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa.seqkit.fai", set_type = SET_TYPES, set_name = SET_NAMES) + output: + set_summary = "outputs/models/rnasamba/build/6_stats/set_summary.tsv", + set_length_summary = "outputs/models/rnasamba/build/6_stats/set_length_summary.tsv", + set_length_genome_summary = "outputs/models/rnasamba/build/6_stats/set_length_genome_summary.tsv" + conda: "envs/tidyverse.yml" + script: "scripts/calculate_sequence_statistics.R" diff --git a/inputs/models/rnasamba/build/train_data_links.tsv b/inputs/models/rnasamba/build/train_data_links.tsv index 65ec5f4..63f63aa 100644 --- a/inputs/models/rnasamba/build/train_data_links.tsv +++ b/inputs/models/rnasamba/build/train_data_links.tsv @@ -1,9 +1,9 @@ -organism root filename cdna_suffix ncrna_suffix -human https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/ Homo_sapiens.GRCh38 cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz -yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/ Saccharomyces_cerevisiae.R64-1-1 cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz -worm https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/ Caenorhabditis_elegans.WBcel235 cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz -arabadopsis https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/arabidopsis_thaliana/ Arabidopsis_thaliana.TAIR10 cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz -drosophila https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/ Drosophila_melanogaster.BDGP6.46 cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz -dictyostelium_discoideum https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/ Dictyostelium_discoideum.dicty_2.7 cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz 
ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz -mouse https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/ Mus_musculus.GRCm39 cdna/Mus_musculus.GRCm39.cdna.all.fa.gz ncrna/Mus_musculus.GRCm39.ncrna.fa.gz -zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz \ No newline at end of file +organism root_url genome cdna_suffix ncrna_suffix genome_abbreviation +human https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/ Homo_sapiens.GRCh38 cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz GRCh38 +yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/ Saccharomyces_cerevisiae.R64-1-1 cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz R64-1-1 +worm https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/ Caenorhabditis_elegans.WBcel235 cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz WBcel235 +arabadopsis https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/arabidopsis_thaliana/ Arabidopsis_thaliana.TAIR10 cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz TAIR10 +drosophila https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/ Drosophila_melanogaster.BDGP6.46 cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz BDGP6.46 +dictyostelium_discoideum https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/ Dictyostelium_discoideum.dicty_2.7 cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz dicty_2.7 +mouse https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/ Mus_musculus.GRCm39 cdna/Mus_musculus.GRCm39.cdna.all.fa.gz ncrna/Mus_musculus.GRCm39.ncrna.fa.gz GRCm39 +zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz GRCz11 \ No newline at end of file diff --git a/scripts/calculate_sequence_statistics.R b/scripts/calculate_sequence_statistics.R new file mode 100644 index 0000000..2943880 --- /dev/null +++ b/scripts/calculate_sequence_statistics.R @@ -0,0 +1,38 @@ +library(tidyverse) + +files <- unlist(snakemake@input) +fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") +fai <- files %>% + set_names() %>% + map_dfr(read_tsv, col_names = fai_col_names, .id = "sequence_set") %>% + mutate(sequence_set = gsub(".fa.seqkit.fai", "", basename(sequence_set))) %>% + separate(sequence_set, into = c("coding_type", "set_name"), sep = "_", remove = FALSE) %>% + separate(sequence, into = c("sequence_id", "sequence_type", "chromosome", "gene", + "gene_biotype", "transcript_biotype", "gene_symbol", + "description"), + sep = " ", remove = FALSE, extra = "merge", fill = "right") %>% + separate(chromosome, into = c("chromosome_type", "genome", "chromosome_id", + "start", "end", "strand"), + sep = ":", remove = FALSE, extra = "merge", fill = "right") %>% + mutate(genome = ifelse(genome == "BDGP6", "BDGP6.46", genome)) %>% # fix some Drosophila names + mutate(genome = ifelse(sequence %in% c("K02E7.12.1", "F29C4.4.1", "C39B10.6b.1", "F53F4.16.1", "F53F4.17.1"), "WBcel235", genome)) %>% # fix some C. 
elegans genomes + mutate(length_description = ifelse(length <= 300, "<= 300nt", "> 300nt")) + +fai %>% + group_by(coding_type, set_name) %>% + tally() %>% + arrange(set_name) %>% + write_tsv(snakemake@output[['set_summary']]) + +fai %>% + group_by(coding_type, set_name, length_description) %>% + tally() %>% + arrange(set_name) %>% + write_tsv(snakemake@output[['set_length_summary']]) + +fai %>% + group_by(coding_type, set_name, length_description, genome) %>% + tally() %>% + arrange(set_name) %>% + pivot_wider(id_cols = c(coding_type, set_name, length_description), names_from = "genome", values_from = "n") %>% + write_tsv(snakemake@output[['set_length_genome_summary']]) From d20938a6206010ee901c04748ca86261647eb58d Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 6 Feb 2024 10:22:59 -0500 Subject: [PATCH 05/34] swap out wildcard names to be more descriptive --- build_rnasamba_euk_model.snakefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index eb82b8f..509e627 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -6,17 +6,17 @@ metadata = pd.read_csv("inputs/models/rnasamba/build/train_data_links.tsv", sep GENOMES = metadata['genome'].unique().tolist() RNA_TYPES = ['cdna', 'ncrna'] # inherits names from ensembl VALIDATION_TYPES = ['mRNAs', 'ncRNAs'] # inherits names from https://github.com/cbl-nabi/RNAChallenge -SET_TYPES = ['coding', 'noncoding'] -SET_NAMES = ['train', 'test', 'validation'] +CODING_TYPES = ['coding', 'noncoding'] +DATASET_TYPES = ['train', 'test', 'validation'] rule all: input: - expand("outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa", set_type = SET_TYPES, set_name = SET_NAMES), + expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", coding_type = CODING_TYPES, dataset_type = DATASET_TYPES), "outputs/models/rnasamba/build/6_stats/set_summary.tsv" rule download_ensembl_data: """ - Download ensembl cDNA and ncRNA files. + Download ensembl cDNA and ncRNA files. Ensembl annotates protein coding and non-coding RNA transcripts in their files. This information will be used to separate protein coding from non-coding RNAs to build an RNAsamba model. Note this download renames genome files from their names on ensembl to make them simpler to point to. 
@@ -115,7 +115,7 @@ rule grab_traintest_noncoding_names_and_lengths: rule process_sequences_into_nonoverlapping_sets: input: - traintest_fai = expand("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_{set_type}.fa.fai", set_type = SET_TYPES), + traintest_fai = expand("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_{coding_type}.fa.fai", coding_type = CODING_TYPES), validation_fai = expand("inputs/validation/rnachallenge/{validation_type}.fa.fai", validation_type = VALIDATION_TYPES), clusters = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv" output: @@ -132,8 +132,8 @@ rule process_sequences_into_nonoverlapping_sets: rule filter_sequence_sets: input: fa = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", - names = "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.txt" - output: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa" + names = "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt" + output: "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa" conda: "envs/seqtk.yml" shell:''' seqtk subseq {input.fa} {input.names} > {output} @@ -144,15 +144,15 @@ rule filter_sequence_sets: ################################################################## rule get_sequence_descriptors: - input: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa" - output: "outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa.seqkit.fai" + input: "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa" + output: "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai" conda: "envs/seqkit.yml" shell:''' seqkit faidx -f {input} ''' rule calculate_sequence_statistics: - input: expand("outputs/models/rnasamba/build/2_sequence_sets/{set_type}_{set_name}.fa.seqkit.fai", set_type = SET_TYPES, set_name = SET_NAMES) + input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", coding_type = CODING_TYPES, dataset_type = DATASET_TYPES) output: set_summary = "outputs/models/rnasamba/build/6_stats/set_summary.tsv", set_length_summary = "outputs/models/rnasamba/build/6_stats/set_length_summary.tsv", From 6a759e8b8072cf907e39ba48a8266f28a49cc1d9 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 6 Feb 2024 10:44:45 -0500 Subject: [PATCH 06/34] simplify output of set creation --- build_rnasamba_euk_model.snakefile | 11 ++--------- .../process_sequences_into_nonoverlapping_sets.R | 15 ++++++++------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index 509e627..5d9c361 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -79,6 +79,7 @@ rule reduce_sequence_homology: mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 ''' +# TER TODO: consider changing the ncrna input path to remove some duplication in the rules rule grab_validation_set_names_and_lengths: input: "inputs/validation/rnachallenge/{validation_type}.fa.gz", output: @@ -90,7 +91,6 @@ rule grab_validation_set_names_and_lengths: seqkit faidx {output.validation} ''' -# TER TODO: consider changing the ncrna input path to remove some duplication in the rules rule grab_traintest_coding_names_and_lengths: input: 
expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome = GENOMES), output: @@ -118,14 +118,7 @@ rule process_sequences_into_nonoverlapping_sets: traintest_fai = expand("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_{coding_type}.fa.fai", coding_type = CODING_TYPES), validation_fai = expand("inputs/validation/rnachallenge/{validation_type}.fa.fai", validation_type = VALIDATION_TYPES), clusters = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv" - output: - # TER TODO: the set_name/set_types could be born here, but I need to check the order in which they would be built to make sure files aren't named the wrong thing. actually i could do this by number of lines in the final file. Also need to update the R script to point to the right thing - coding_train = "outputs/models/rnasamba/build/2_sequence_sets/coding_train.txt", - coding_test = "outputs/models/rnasamba/build/2_sequence_sets/coding_test.txt", - noncoding_train = "outputs/models/rnasamba/build/2_sequence_sets/noncoding_train.txt", - noncoding_test = "outputs/models/rnasamba/build/2_sequence_sets/noncoding_test.txt", - coding_validation = "outputs/models/rnasamba/build/2_sequence_sets/coding_validation.txt", - noncoding_validation = "outputs/models/rnasamba/build/2_sequence_sets/noncoding_validation.txt" + output: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt", coding_type = CODING_TYPES, dataset_type = DATASET_TYPES) conda: "envs/tidyverse.yml" script: "scripts/process_sequences_into_nonoverlapping_sets.R" diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index 7926ee3..26990ec 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -77,10 +77,11 @@ coding_traintest_split <- split_train_and_test_data(coding_traintest_keep) noncoding_traintest_split <- split_train_and_test_data(noncoding_traintest_filtered) # write out contigs names for each set ------------------------------------ - -write.table(coding_traintest_split$train_df$sequence, snakemake@output[['coding_train']], quote = F, col.names = F, row.names = F) -write.table(coding_traintest_split$test_df$sequence, snakemake@output[['coding_test']], quote = F, col.names = F, row.names = F) -write.table(noncoding_traintest_split$train_df$sequence, snakemake@output[['noncoding_train']], quote = F, col.names = F, row.names = F) -write.table(noncoding_traintest_split$test_df$sequence, snakemake@output[['noncoding_test']], quote = F, col.names = F, row.names = F) -write.table(coding_validation_filtered$sequence, snakemake@output[['coding_validation']], quote = F, col.names = F, row.names = F) -write.table(noncoding_validation_filtered$sequence, snakemake@output[['noncoding_validation']], quote = F, col.names = F, row.names = F) +snakemake_output <- unlist(snakemake@output) +print(snakemake_output) +write.table(coding_traintest_split$train_df$sequence, snakemake_output[1], quote = F, col.names = F, row.names = F) +write.table(coding_traintest_split$test_df$sequence, snakemake_output[2], quote = F, col.names = F, row.names = F) +write.table(coding_validation_filtered$sequence, snakemake_output[3], quote = F, col.names = F, row.names = F) +write.table(noncoding_traintest_split$train_df$sequence, snakemake_output[4], quote = F, col.names = F, row.names = F) +write.table(noncoding_traintest_split$test_df$sequence, snakemake_output[5], quote = F, 
col.names = F, row.names = F) +write.table(noncoding_validation_filtered$sequence, snakemake_output[6], quote = F, col.names = F, row.names = F) From b0aba84c376450ff1212823122ab0f962f968bca Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 6 Feb 2024 11:32:39 -0500 Subject: [PATCH 07/34] add rule to build rnasamba model --- build_rnasamba_euk_model.snakefile | 20 ++++++++++++++++++-- envs/rnasamba.yml | 6 ++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 envs/rnasamba.yml diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index 5d9c361..b28ad74 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -12,7 +12,8 @@ DATASET_TYPES = ['train', 'test', 'validation'] rule all: input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", coding_type = CODING_TYPES, dataset_type = DATASET_TYPES), - "outputs/models/rnasamba/build/6_stats/set_summary.tsv" + "outputs/models/rnasamba/build/6_stats/set_summary.tsv", + "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5" rule download_ensembl_data: """ @@ -79,7 +80,6 @@ rule reduce_sequence_homology: mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 ''' -# TER TODO: consider changing the ncrna input path to remove some duplication in the rules rule grab_validation_set_names_and_lengths: input: "inputs/validation/rnachallenge/{validation_type}.fa.gz", output: @@ -132,6 +132,22 @@ rule filter_sequence_sets: seqtk subseq {input.fa} {input.names} > {output} ''' +################################################################## +## Build RNAsamba model +################################################################## + +rule build_rnasamba_model: + """ + Build a new rnasamba model from the training data curated above. + The --early_stopping parameter reduces training time and can help avoiding overfitting. 
+ """ + input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_train.fa", coding_types = CODING_TYPES) + output: "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5" + conda: "envs/rnasamba.yml" + shell:''' + rnasamba train --early_stopping --verbose 2 {output} {input[0]} {input[1]} + ''' + ################################################################## ## Get sequence statistics ################################################################## diff --git a/envs/rnasamba.yml b/envs/rnasamba.yml new file mode 100644 index 0000000..972284a --- /dev/null +++ b/envs/rnasamba.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - rnasamba=0.2.5 From d2ef62a0cad767f5e5dbd62319b2bc1004afdbcb Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 7 Feb 2024 15:44:55 +0000 Subject: [PATCH 08/34] bump snakemake version and add pandas to dev --- envs/dev.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/envs/dev.yml b/envs/dev.yml index 3c70ebc..38ee968 100644 --- a/envs/dev.yml +++ b/envs/dev.yml @@ -8,4 +8,5 @@ dependencies: - python=3.12.0 - ruff=0.1.6 - snakefmt=0.8.5 - - snakemake-minimal=8.0.1 + - snakemake-minimal=8.4.2 + - pandas=2.2.0 From 7eca248003b86c0b67c649acee6d7b374cdfdd70 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 7 Feb 2024 15:45:17 +0000 Subject: [PATCH 09/34] add early stopping epochs, typos --- build_rnasamba_euk_model.snakefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index b28ad74..caf2802 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -140,12 +140,13 @@ rule build_rnasamba_model: """ Build a new rnasamba model from the training data curated above. The --early_stopping parameter reduces training time and can help avoiding overfitting. + It is the number of epochs after lowest validation loss before stopping training. 
""" - input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_train.fa", coding_types = CODING_TYPES) + input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_train.fa", coding_type = CODING_TYPES) output: "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5" conda: "envs/rnasamba.yml" shell:''' - rnasamba train --early_stopping --verbose 2 {output} {input[0]} {input[1]} + rnasamba train --early_stopping 5 --verbose 2 {output} {input[0]} {input[1]} ''' ################################################################## From e56261b463d49fd7a4df5a9b5c0a0d53402f0920 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Thu, 8 Feb 2024 15:04:09 +0000 Subject: [PATCH 10/34] add rules and code to assess accuracy of new RNAsamba model --- build_rnasamba_euk_model.snakefile | 32 +++++++++++++++++---- envs/caret.yml | 7 +++++ scripts/calculate_rnasamba_model_accuracy.R | 29 +++++++++++++++++++ 3 files changed, 62 insertions(+), 6 deletions(-) create mode 100644 envs/caret.yml create mode 100644 scripts/calculate_rnasamba_model_accuracy.R diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index caf2802..ad4b881 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -11,9 +11,8 @@ DATASET_TYPES = ['train', 'test', 'validation'] rule all: input: - expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", coding_type = CODING_TYPES, dataset_type = DATASET_TYPES), - "outputs/models/rnasamba/build/6_stats/set_summary.tsv", - "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5" + "outputs/models/rnasamba/build/5_stats/set_summary.tsv", + expand("outputs/models/rnasamba/build/4_evaluation/accuracy_metrics_{dataset_type}.tsv", dataset_type = DATASET_TYPES) rule download_ensembl_data: """ @@ -149,6 +148,27 @@ rule build_rnasamba_model: rnasamba train --early_stopping 5 --verbose 2 {output} {input[0]} {input[1]} ''' +rule assess_rnasamba_model: + input: + model = "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5", + faa = "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa" + output: + faa = "outputs/models/rnasamba/build/4_evaluation/{coding_type}_{dataset_type}.fa", + predictions = "outputs/models/rnasamba/build/4_evaluation/{coding_type}_{dataset_type}.tsv" + benchmark: "benchmarks/models/rnasamba/build/4_evaluation/{coding_type}_{dataset_type}.tsv" + conda: "envs/rnasamba.yml" + shell:''' + rnasamba classify --protein_fasta {output.faa} {output.predictions} {input.faa} {input.model} + ''' + +rule calculate_rnasamba_model_accuracy: + input: expand("outputs/models/rnasamba/build/4_evaluation/{coding_type}_{{dataset_type}}.tsv", coding_type = CODING_TYPES) + output: + freq = "outputs/models/rnasamba/build/4_evaluation/confusionmatrix_{dataset_type}.tsv", + metrics = "outputs/models/rnasamba/build/4_evaluation/accuracy_metrics_{dataset_type}.tsv" + conda: "envs/caret.yml" + script: "scripts/calculate_rnasamba_model_accuracy.R" + ################################################################## ## Get sequence statistics ################################################################## @@ -164,8 +184,8 @@ rule get_sequence_descriptors: rule calculate_sequence_statistics: input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", coding_type = CODING_TYPES, dataset_type = DATASET_TYPES) output: - set_summary = "outputs/models/rnasamba/build/6_stats/set_summary.tsv", - set_length_summary = 
"outputs/models/rnasamba/build/6_stats/set_length_summary.tsv", - set_length_genome_summary = "outputs/models/rnasamba/build/6_stats/set_length_genome_summary.tsv" + set_summary = "outputs/models/rnasamba/build/5_stats/set_summary.tsv", + set_length_summary = "outputs/models/rnasamba/build/5_stats/set_length_summary.tsv", + set_length_genome_summary = "outputs/models/rnasamba/build/5_stats/set_length_genome_summary.tsv" conda: "envs/tidyverse.yml" script: "scripts/calculate_sequence_statistics.R" diff --git a/envs/caret.yml b/envs/caret.yml new file mode 100644 index 0000000..413025a --- /dev/null +++ b/envs/caret.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-tidyverse=2.0.0 + - r-caret=6.0_94 diff --git a/scripts/calculate_rnasamba_model_accuracy.R b/scripts/calculate_rnasamba_model_accuracy.R new file mode 100644 index 0000000..3260929 --- /dev/null +++ b/scripts/calculate_rnasamba_model_accuracy.R @@ -0,0 +1,29 @@ +library(tidyverse) +library(caret) + +files <- unlist(snakemake@input) + +# read in and format model results for each dataset type +model_predictions <- files %>% + set_names() %>% + map_dfr(read_tsv, .id = "tmp") %>% + mutate(tmp = gsub(".tsv", "", basename(tmp))) %>% + separate(tmp, into = c("coding_type", "dataset_type"), sep = "_", remove = T) %>% + mutate(coding_type = as.factor(coding_type), + classification = as.factor(classification)) + +# determine model performance +model_confusion_matrix <- confusionMatrix(data = model_predictions$classification, + reference = model_predictions$coding_type) + +# convert the model performance metrics into a data frame +model_confusion_matrix_df <- bind_rows(data.frame(value = model_confusion_matrix$overall), + data.frame(value = model_confusion_matrix$byClass)) %>% + rownames_to_column("metric") %>% + mutate(dataset_type = model_predictions$dataset_type[1]) # set dataset type as column even tho it will be in the file name too + +# convert the confusion matrix into a data frame +model_confusion_matrix_freq_df <- model_confusion_matrix %>% as.table() %>% data.frame() + +write_tsv(model_confusion_matrix_df, snakemake@output[['metrics']]) +write_tsv(model_confusion_matrix_freq_df, snakemake@output[['freq']]) From 198fff9de8e9b274dc3a1b31cc66b9c2987df9d8 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Thu, 8 Feb 2024 15:22:26 +0000 Subject: [PATCH 11/34] missing eof new line --- inputs/models/rnasamba/build/train_data_links.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs/models/rnasamba/build/train_data_links.tsv b/inputs/models/rnasamba/build/train_data_links.tsv index 63f63aa..df0b294 100644 --- a/inputs/models/rnasamba/build/train_data_links.tsv +++ b/inputs/models/rnasamba/build/train_data_links.tsv @@ -6,4 +6,4 @@ arabadopsis https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/ara drosophila https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/ Drosophila_melanogaster.BDGP6.46 cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz BDGP6.46 dictyostelium_discoideum https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/ Dictyostelium_discoideum.dicty_2.7 cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz dicty_2.7 mouse https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/ Mus_musculus.GRCm39 cdna/Mus_musculus.GRCm39.cdna.all.fa.gz 
ncrna/Mus_musculus.GRCm39.ncrna.fa.gz GRCm39 -zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz GRCz11 \ No newline at end of file +zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz GRCz11 From f50f19c16cc51e7466389bc19a76fd1ad5cd6097 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Thu, 8 Feb 2024 15:23:28 +0000 Subject: [PATCH 12/34] rm comments --- .../process_sequences_into_nonoverlapping_sets.R | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index 26990ec..f97d064 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -1,6 +1,5 @@ library(readr) library(dplyr) -# library(sourmashconsumr) # functions --------------------------------------------------------------- @@ -22,14 +21,6 @@ noncoding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[2], coding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[1], col_names = fai_col_names) noncoding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[2], col_names = fai_col_names) -# check overlap between sets ---------------------------------------------- - -# upset_df <- from_list_to_upset_df(list(noncoding_validation = noncoding_validation$sequence, -# coding_validation = coding_validation$sequence, -# noncoding_traintest = noncoding_traintest$sequence, -# coding_traintest = coding_traintest$sequence, -# cluster_rep = unique(clusters$rep))) - # filter to non-overlapping sets ------------------------------------------ noncoding_validation_filtered <- noncoding_validation %>% @@ -48,13 +39,6 @@ noncoding_traintest_filtered <- noncoding_traintest %>% coding_traintest_filtered <- coding_traintest %>% filter(sequence %in% clusters$rep) # keep sequences that had =<80% homology to other sequences -# confirm no unexpected overlap between data sets ----------------------- -# upset_df_filtered <- from_list_to_upset_df(list(noncoding_validation = noncoding_validation_filtered$sequence, -# coding_validation = coding_validation_filtered$sequence, -# noncoding_traintest = noncoding_traintest_filtered$sequence, -# coding_traintest = coding_traintest_filtered$sequence, -# cluster_rep = unique(clusters$rep))) - # subsample the coding traintest set to balance sizes ------------------ # to avoid biases in classification, it's better to have the same number of sequences in the different classes From 6989b64732694d01b3f858d3eebb88b8a2b0a3a2 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Thu, 8 Feb 2024 23:13:33 +0000 Subject: [PATCH 13/34] add in comparison to existing human model to show improvement --- build_rnasamba_euk_model.snakefile | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index ad4b881..b047f41 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -8,11 +8,12 @@ RNA_TYPES = ['cdna', 'ncrna'] # inherits names from ensembl VALIDATION_TYPES = ['mRNAs', 'ncRNAs'] # inherits names from https://github.com/cbl-nabi/RNAChallenge CODING_TYPES = ['coding', 'noncoding'] DATASET_TYPES = ['train', 'test', 'validation'] +MODEL_TYPES = 
['eu', 'human']
 
 rule all:
     input:
         "outputs/models/rnasamba/build/5_stats/set_summary.tsv",
-        expand("outputs/models/rnasamba/build/4_evaluation/accuracy_metrics_{dataset_type}.tsv", dataset_type = DATASET_TYPES)
+        expand("outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", model_type = MODEL_TYPES, dataset_type = DATASET_TYPES)
 
 rule download_ensembl_data:
     """
@@ -150,25 +151,36 @@ rule build_rnasamba_model:
 
 rule assess_rnasamba_model:
     input:
-        model = "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5",
+        model = "outputs/models/rnasamba/build/3_model/{model_type}_rnasamba.hdf5",
         faa = "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa"
     output:
-        faa = "outputs/models/rnasamba/build/4_evaluation/{coding_type}_{dataset_type}.fa",
-        predictions = "outputs/models/rnasamba/build/4_evaluation/{coding_type}_{dataset_type}.tsv"
-    benchmark: "benchmarks/models/rnasamba/build/4_evaluation/{coding_type}_{dataset_type}.tsv"
+        faa = "outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.fa",
+        predictions = "outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv"
+    benchmark: "benchmarks/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv"
     conda: "envs/rnasamba.yml"
     shell:'''
     rnasamba classify --protein_fasta {output.faa} {output.predictions} {input.faa} {input.model}
     '''
 
 rule calculate_rnasamba_model_accuracy:
-    input: expand("outputs/models/rnasamba/build/4_evaluation/{coding_type}_{{dataset_type}}.tsv", coding_type = CODING_TYPES)
+    input: expand("outputs/models/rnasamba/build/4_evaluation/{{model_type}}/{coding_type}_{{dataset_type}}.tsv", coding_type = CODING_TYPES)
     output:
-        freq = "outputs/models/rnasamba/build/4_evaluation/confusionmatrix_{dataset_type}.tsv",
-        metrics = "outputs/models/rnasamba/build/4_evaluation/accuracy_metrics_{dataset_type}.tsv"
+        freq = "outputs/models/rnasamba/build/4_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv",
+        metrics = "outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv"
     conda: "envs/caret.yml"
     script: "scripts/calculate_rnasamba_model_accuracy.R"
+
+rule download_rnasamba_human_model:
+    """
+    Use this model to compare whether the new model performs better or worse.
+    It's saved under a new name so we can use a wildcard to run rnasamba classify and to calculate model accuracy.
+ """ + output: "outputs/models/rnasamba/build/3_model/human_rnasamba.hdf5", + shell:''' + curl -JLo {output} https://github.com/apcamargo/RNAsamba/raw/master/data/full_length_weights.hdf5 + ''' + ################################################################## ## Get sequence statistics ################################################################## From 3dcdf2bceea5956cfbdc1316c669dc9044e2d7b6 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Fri, 9 Feb 2024 08:17:45 -0500 Subject: [PATCH 14/34] diversify snakefmt file endings for CI --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ede457e..d5ce5c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ no-lines-before = ["future", "standard-library"] [tool.snakefmt] # the line length here should match the one used by ruff line_length = 100 -include = 'Snakefile|Snakefile_*' +include = 'Snakefile|Snakefile_*|*.snakefile|*.smk' exclude = 'dev' From aa15c4683de7757e0b44768f7a33bb162ec926d0 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Fri, 9 Feb 2024 08:27:09 -0500 Subject: [PATCH 15/34] update snakefmt file endings and run linting locally --- build_rnasamba_euk_model.snakefile | 306 +++++++++++++++++++---------- pyproject.toml | 2 +- 2 files changed, 201 insertions(+), 107 deletions(-) diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index b047f41..e855767 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -2,18 +2,27 @@ import pandas as pd import os import re -metadata = pd.read_csv("inputs/models/rnasamba/build/train_data_links.tsv", sep = "\t") -GENOMES = metadata['genome'].unique().tolist() -RNA_TYPES = ['cdna', 'ncrna'] # inherits names from ensembl -VALIDATION_TYPES = ['mRNAs', 'ncRNAs'] # inherits names from https://github.com/cbl-nabi/RNAChallenge -CODING_TYPES = ['coding', 'noncoding'] -DATASET_TYPES = ['train', 'test', 'validation'] -MODEL_TYPES = ['eu', 'human'] +metadata = pd.read_csv("inputs/models/rnasamba/build/train_data_links.tsv", sep="\t") +GENOMES = metadata["genome"].unique().tolist() +RNA_TYPES = ["cdna", "ncrna"] # inherits names from ensembl +VALIDATION_TYPES = [ + "mRNAs", + "ncRNAs", +] # inherits names from https://github.com/cbl-nabi/RNAChallenge +CODING_TYPES = ["coding", "noncoding"] +DATASET_TYPES = ["train", "test", "validation"] +MODEL_TYPES = ["eu", "human"] + rule all: - input: + input: "outputs/models/rnasamba/build/5_stats/set_summary.tsv", - expand("outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", model_type = MODEL_TYPES, dataset_type = DATASET_TYPES) + expand( + "outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", + model_type=MODEL_TYPES, + dataset_type=DATASET_TYPES, + ), + rule download_ensembl_data: """ @@ -25,15 +34,16 @@ rule download_ensembl_data: - cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz -> cdna/Homo_sapiens.GRCh38.cdna.fa.gz (dropped "all.") - ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz -> ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz (no change) """ - output: "inputs/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz" + output: + "inputs/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz", run: - genome_df = metadata.loc[(metadata['genome'] == wildcards.genome)] - root_url = genome_df['root_url'].values[0] + genome_df = metadata.loc[(metadata["genome"] == wildcards.genome)] + root_url = genome_df["root_url"].values[0] if wildcards.rna_type == "cdna": - suffix 
= genome_df['cdna_suffix'].values[0] + suffix = genome_df["cdna_suffix"].values[0] else: - suffix = genome_df['ncrna_suffix'].values[0] - + suffix = genome_df["ncrna_suffix"].values[0] + url = root_url + suffix shell("curl -JLo {output} {url}") @@ -43,132 +53,200 @@ rule extract_protein_coding_orfs_from_cdna: Ensembl cDNA files consist of transcript sequences for actual and possible genes, including pseudogenes, NMD and the like. Transcripts in the cDNA files have headers like: >TRANSCRIPT_ID SEQTYPE LOCATION GENE_ID GENE_BIOTYPE TRANSCRIPT_BIOTYPE, where the gene_biotype and transcript_biotype both contain information about whether the gene is coding or not. """ - input: "inputs/ensembl/cdna/{genome}.cdna.fa.gz" - output: "outputs/models/rnasamba/build/0_coding/{genome}.fa.gz" - conda: "envs/seqkit.yml" - shell:''' + input: + "inputs/ensembl/cdna/{genome}.cdna.fa.gz", + output: + "outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", + conda: + "envs/seqkit.yml" + shell: + """ seqkit grep --use-regexp --by-name --pattern "transcript_biotype:protein_coding" -o {output} {input} - ''' + """ + rule download_validation_data: - output: "inputs/validation/rnachallenge/{validation_type}.fa.gz", - shell:''' + output: + "inputs/validation/rnachallenge/{validation_type}.fa.gz", + shell: + """ curl -JL https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa | gzip > {output} - ''' + """ + rule combine_sequences: input: - coding = expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome = GENOMES), - noncoding = expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome = GENOMES), - validation = expand("inputs/validation/rnachallenge/{validation_type}.fa.gz", validation_type = VALIDATION_TYPES) - output: "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz" - shell:''' + coding=expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome=GENOMES), + noncoding=expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES), + validation=expand( + "inputs/validation/rnachallenge/{validation_type}.fa.gz", + validation_type=VALIDATION_TYPES, + ), + output: + "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz", + shell: + """ cat {input} > {output} - ''' - + """ + + rule reduce_sequence_homology: """ To reduce pollution between training and testing set, cluster sequences at 80% sequence identity. 
""" - input: "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz" - output: + input: + "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz", + output: "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", - "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv" - params: prefix = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences" - conda: "envs/mmseqs2.yml" - shell:''' + "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", + params: + prefix="outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences", + conda: + "envs/mmseqs2.yml" + shell: + """ mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 - ''' + """ + rule grab_validation_set_names_and_lengths: - input: "inputs/validation/rnachallenge/{validation_type}.fa.gz", - output: - validation = "inputs/validation/rnachallenge/{validation_type}.fa", - validation_fai = "inputs/validation/rnachallenge/{validation_type}.fa.fai", - conda: "envs/seqkit.yml" - shell:''' + input: + "inputs/validation/rnachallenge/{validation_type}.fa.gz", + output: + validation="inputs/validation/rnachallenge/{validation_type}.fa", + validation_fai="inputs/validation/rnachallenge/{validation_type}.fa.fai", + conda: + "envs/seqkit.yml" + shell: + """ cat {input} | gunzip > {output.validation} seqkit faidx {output.validation} - ''' + """ + rule grab_traintest_coding_names_and_lengths: - input: expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome = GENOMES), + input: + expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome=GENOMES), output: - coding = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa", - coding_fai = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa.fai" - conda: "envs/seqkit.yml" - shell:''' + coding="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa", + coding_fai="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa.fai", + conda: + "envs/seqkit.yml" + shell: + """ cat {input} | gunzip > {output.coding} seqkit faidx {output.coding} - ''' + """ + rule grab_traintest_noncoding_names_and_lengths: - input: expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome = GENOMES), + input: + expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES), output: - noncoding = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa", - noncoding_fai = "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa.fai" - conda: "envs/seqkit.yml" - shell:''' + noncoding="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa", + noncoding_fai="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa.fai", + conda: + "envs/seqkit.yml" + shell: + """ cat {input} | gunzip > {output.noncoding} seqkit faidx {output.noncoding} - ''' + """ + rule process_sequences_into_nonoverlapping_sets: - input: - traintest_fai = expand("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_{coding_type}.fa.fai", coding_type = CODING_TYPES), - validation_fai = expand("inputs/validation/rnachallenge/{validation_type}.fa.fai", validation_type = VALIDATION_TYPES), - clusters = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv" - output: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt", coding_type = 
CODING_TYPES, dataset_type = DATASET_TYPES) - conda: "envs/tidyverse.yml" - script: "scripts/process_sequences_into_nonoverlapping_sets.R" + input: + traintest_fai=expand( + "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_{coding_type}.fa.fai", + coding_type=CODING_TYPES, + ), + validation_fai=expand( + "inputs/validation/rnachallenge/{validation_type}.fa.fai", + validation_type=VALIDATION_TYPES, + ), + clusters="outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", + output: + expand( + "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt", + coding_type=CODING_TYPES, + dataset_type=DATASET_TYPES, + ), + conda: + "envs/tidyverse.yml" + script: + "scripts/process_sequences_into_nonoverlapping_sets.R" + rule filter_sequence_sets: input: - fa = "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", - names = "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt" - output: "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa" - conda: "envs/seqtk.yml" - shell:''' + fa="outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", + names="outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt", + output: + "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", + conda: + "envs/seqtk.yml" + shell: + """ seqtk subseq {input.fa} {input.names} > {output} - ''' + """ + ################################################################## -## Build RNAsamba model +## Build RNAsamba model ################################################################## + rule build_rnasamba_model: """ Build a new rnasamba model from the training data curated above. The --early_stopping parameter reduces training time and can help avoiding overfitting. It is the number of epochs after lowest validation loss before stopping training. 
""" - input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_train.fa", coding_type = CODING_TYPES) - output: "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5" - conda: "envs/rnasamba.yml" - shell:''' + input: + expand( + "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_train.fa", + coding_type=CODING_TYPES, + ), + output: + "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5", + conda: + "envs/rnasamba.yml" + shell: + """ rnasamba train --early_stopping 5 --verbose 2 {output} {input[0]} {input[1]} - ''' + """ + rule assess_rnasamba_model: - input: - model = "outputs/models/rnasamba/build/3_model/{model_type}_rnasamba.hdf5", - faa = "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa" - output: - faa = "outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.fa", - predictions = "outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv" - benchmark: "benchmarks/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv" - conda: "envs/rnasamba.yml" - shell:''' + input: + model="outputs/models/rnasamba/build/3_model/{model_type}_rnasamba.hdf5", + faa="outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", + output: + faa="outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.fa", + predictions="outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv", + benchmark: + "benchmarks/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv" + conda: + "envs/rnasamba.yml" + shell: + """ rnasamba classify --protein_fasta {output.faa} {output.predictions} {input.faa} {input.model} - ''' + """ + rule calculate_rnasamba_model_accuracy: - input: expand("outputs/models/rnasamba/build/4_evaluation/{{model_type}}/{coding_type}_{{dataset_type}}.tsv", coding_type = CODING_TYPES) - output: - freq = "outputs/models/rnasamba/build/4_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv", - metrics = "outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv" - conda: "envs/caret.yml" - script: "scripts/calculate_rnasamba_model_accuracy.R" + input: + expand( + "outputs/models/rnasamba/build/4_evaluation/{{model_type}}/{coding_type}_{{dataset_type}}.tsv", + coding_type=CODING_TYPES, + ), + output: + freq="outputs/models/rnasamba/build/4_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv", + metrics="outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", + conda: + "envs/caret.yml" + script: + "scripts/calculate_rnasamba_model_accuracy.R" rule download_rnasamba_human_model: @@ -176,28 +254,44 @@ rule download_rnasamba_human_model: Use this model to compare whether the new model performs better or worse. It's saved under a new name so we can use a wildcard to run rnasamba classify and to calculate model accuracy. 
""" - output: "outputs/models/rnasamba/build/3_model/human_rnasamba.hdf5", - shell:''' + output: + "outputs/models/rnasamba/build/3_model/human_rnasamba.hdf5", + shell: + """ curl -JLo {output} https://github.com/apcamargo/RNAsamba/raw/master/data/full_length_weights.hdf5 - ''' + """ + ################################################################## ## Get sequence statistics ################################################################## + rule get_sequence_descriptors: - input: "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa" - output: "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai" - conda: "envs/seqkit.yml" - shell:''' + input: + "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", + output: + "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", + conda: + "envs/seqkit.yml" + shell: + """ seqkit faidx -f {input} - ''' + """ + rule calculate_sequence_statistics: - input: expand("outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", coding_type = CODING_TYPES, dataset_type = DATASET_TYPES) - output: - set_summary = "outputs/models/rnasamba/build/5_stats/set_summary.tsv", - set_length_summary = "outputs/models/rnasamba/build/5_stats/set_length_summary.tsv", - set_length_genome_summary = "outputs/models/rnasamba/build/5_stats/set_length_genome_summary.tsv" - conda: "envs/tidyverse.yml" - script: "scripts/calculate_sequence_statistics.R" + input: + expand( + "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", + coding_type=CODING_TYPES, + dataset_type=DATASET_TYPES, + ), + output: + set_summary="outputs/models/rnasamba/build/5_stats/set_summary.tsv", + set_length_summary="outputs/models/rnasamba/build/5_stats/set_length_summary.tsv", + set_length_genome_summary="outputs/models/rnasamba/build/5_stats/set_length_genome_summary.tsv", + conda: + "envs/tidyverse.yml" + script: + "scripts/calculate_sequence_statistics.R" diff --git a/pyproject.toml b/pyproject.toml index d5ce5c7..bd55cb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ no-lines-before = ["future", "standard-library"] [tool.snakefmt] # the line length here should match the one used by ruff line_length = 100 -include = 'Snakefile|Snakefile_*|*.snakefile|*.smk' +include = 'Snakefile|Snakefile_*|\.snakefile$|\.smk$' exclude = 'dev' From 12985a8c94fe21415f5edc00390f6004e3d18de3 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Mon, 12 Feb 2024 08:10:36 -0500 Subject: [PATCH 16/34] Apply suggestions from code review Co-authored-by: Erin Signed-off-by: Taylor Reiter --- build_rnasamba_euk_model.snakefile | 2 +- scripts/process_sequences_into_nonoverlapping_sets.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index e855767..0c6e833 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -199,7 +199,7 @@ rule filter_sequence_sets: rule build_rnasamba_model: """ Build a new rnasamba model from the training data curated above. - The --early_stopping parameter reduces training time and can help avoiding overfitting. + The --early_stopping parameter reduces training time and can help avoid overfitting. It is the number of epochs after lowest validation loss before stopping training. 
""" input: diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index f97d064..13ff2be 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -5,7 +5,7 @@ library(dplyr) split_train_and_test_data <- function(df, fraction = 0.8, seed = 1){ # sample without replacement to select fraction of the data set as a training data set - set.seed(1) # set seed so that the sample command gives the same results each time it is run + set.seed(seed) # set seed so that the sample command gives the same results each time it is run df_annotation <- sort(sample(nrow(df), nrow(df)* fraction)) train <- df[df_annotation, ] test <- df[-df_annotation, ] From b31317f71e2dc66791596134d7ea3ba886f596c0 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Mon, 12 Feb 2024 13:13:51 -0500 Subject: [PATCH 17/34] indentation --- build_rnasamba_euk_model.snakefile | 54 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/build_rnasamba_euk_model.snakefile b/build_rnasamba_euk_model.snakefile index e855767..753bad3 100644 --- a/build_rnasamba_euk_model.snakefile +++ b/build_rnasamba_euk_model.snakefile @@ -61,8 +61,8 @@ rule extract_protein_coding_orfs_from_cdna: "envs/seqkit.yml" shell: """ - seqkit grep --use-regexp --by-name --pattern "transcript_biotype:protein_coding" -o {output} {input} - """ + seqkit grep --use-regexp --by-name --pattern "transcript_biotype:protein_coding" -o {output} {input} + """ rule download_validation_data: @@ -70,8 +70,8 @@ rule download_validation_data: "inputs/validation/rnachallenge/{validation_type}.fa.gz", shell: """ - curl -JL https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa | gzip > {output} - """ + curl -JL https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa | gzip > {output} + """ rule combine_sequences: @@ -86,8 +86,8 @@ rule combine_sequences: "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz", shell: """ - cat {input} > {output} - """ + cat {input} > {output} + """ rule reduce_sequence_homology: @@ -105,8 +105,8 @@ rule reduce_sequence_homology: "envs/mmseqs2.yml" shell: """ - mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 - """ + mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 + """ rule grab_validation_set_names_and_lengths: @@ -119,9 +119,9 @@ rule grab_validation_set_names_and_lengths: "envs/seqkit.yml" shell: """ - cat {input} | gunzip > {output.validation} - seqkit faidx {output.validation} - """ + cat {input} | gunzip > {output.validation} + seqkit faidx {output.validation} + """ rule grab_traintest_coding_names_and_lengths: @@ -134,9 +134,9 @@ rule grab_traintest_coding_names_and_lengths: "envs/seqkit.yml" shell: """ - cat {input} | gunzip > {output.coding} - seqkit faidx {output.coding} - """ + cat {input} | gunzip > {output.coding} + seqkit faidx {output.coding} + """ rule grab_traintest_noncoding_names_and_lengths: @@ -149,9 +149,9 @@ rule grab_traintest_noncoding_names_and_lengths: "envs/seqkit.yml" shell: """ - cat {input} | gunzip > {output.noncoding} - seqkit faidx {output.noncoding} - """ + cat {input} | gunzip > {output.noncoding} + seqkit faidx {output.noncoding} + """ rule process_sequences_into_nonoverlapping_sets: @@ -187,8 +187,8 @@ rule 
filter_sequence_sets: "envs/seqtk.yml" shell: """ - seqtk subseq {input.fa} {input.names} > {output} - """ + seqtk subseq {input.fa} {input.names} > {output} + """ ################################################################## @@ -213,8 +213,8 @@ rule build_rnasamba_model: "envs/rnasamba.yml" shell: """ - rnasamba train --early_stopping 5 --verbose 2 {output} {input[0]} {input[1]} - """ + rnasamba train --early_stopping 5 --verbose 2 {output} {input[0]} {input[1]} + """ rule assess_rnasamba_model: @@ -230,8 +230,8 @@ rule assess_rnasamba_model: "envs/rnasamba.yml" shell: """ - rnasamba classify --protein_fasta {output.faa} {output.predictions} {input.faa} {input.model} - """ + rnasamba classify --protein_fasta {output.faa} {output.predictions} {input.faa} {input.model} + """ rule calculate_rnasamba_model_accuracy: @@ -258,8 +258,8 @@ rule download_rnasamba_human_model: "outputs/models/rnasamba/build/3_model/human_rnasamba.hdf5", shell: """ - curl -JLo {output} https://github.com/apcamargo/RNAsamba/raw/master/data/full_length_weights.hdf5 - """ + curl -JLo {output} https://github.com/apcamargo/RNAsamba/raw/master/data/full_length_weights.hdf5 + """ ################################################################## @@ -276,8 +276,8 @@ rule get_sequence_descriptors: "envs/seqkit.yml" shell: """ - seqkit faidx -f {input} - """ + seqkit faidx -f {input} + """ rule calculate_sequence_statistics: From eaee034ef4657bdc4cb9bf3d3259ad8ede9552ef Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 09:10:15 -0500 Subject: [PATCH 18/34] sample with replacement to augment noncoding to coding numbers --- ...del.snakefile => curate_datasets.snakefile | 0 ...ocess_sequences_into_nonoverlapping_sets.R | 28 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) rename build_rnasamba_euk_model.snakefile => curate_datasets.snakefile (100%) diff --git a/build_rnasamba_euk_model.snakefile b/curate_datasets.snakefile similarity index 100% rename from build_rnasamba_euk_model.snakefile rename to curate_datasets.snakefile diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index 13ff2be..f99b85a 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -21,6 +21,12 @@ noncoding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[2], coding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[1], col_names = fai_col_names) noncoding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[2], col_names = fai_col_names) +fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") +clusters <- read_tsv("outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", col_names = c("rep", "cluster_member")) +coding_validation <- read_tsv("inputs/validation/rnachallenge/mRNAs.fa.fai", col_names = fai_col_names) +noncoding_validation <- read_tsv("inputs/validation/rnachallenge/ncRNAs.fa.fai", col_names = fai_col_names) +coding_traintest <- read_tsv("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa.fai", col_names = fai_col_names) +noncoding_traintest <- read_tsv("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa.fai", col_names = fai_col_names) # filter to non-overlapping sets ------------------------------------------ noncoding_validation_filtered <- noncoding_validation %>% @@ -39,25 +45,21 @@ noncoding_traintest_filtered <- 
noncoding_traintest %>% coding_traintest_filtered <- coding_traintest %>% filter(sequence %in% clusters$rep) # keep sequences that had =<80% homology to other sequences -# subsample the coding traintest set to balance sizes ------------------ - -# to avoid biases in classification, it's better to have the same number of sequences in the different classes +# augment the noncoding traintest set to balance sizes ------------------ -# select out all sORFs so these won't be filtered out since they are tricky cases -coding_traintest_filtered_sorfs <- coding_traintest_filtered %>% - filter(length <= 300) +# To avoid biases in classification, it's better to have the same number of sequences in the different classes. +# This could also be done with changing the weights on the activation function of the model building in tensorflow, +# but that would require altering the source code of the RNAsamba tool. +# This approach should produce equivalent results. -# sample out larger coding sequences -coding_traintest_filtered_other <- coding_traintest_filtered %>% - filter(length > 300) %>% - sample_n(size = nrow(noncoding_traintest_filtered) - nrow(coding_traintest_filtered_sorfs)) +num_rows_target <- nrow(coding_traintest_filtered) -# combine to the final set of coding sequences that we'll use for training & testing -coding_traintest_keep <- bind_rows(coding_traintest_filtered_other, coding_traintest_filtered_sorfs) +noncoding_traintest_filtered <- noncoding_traintest_filtered %>% + slice_sample(n = num_rows_target, replace = TRUE) # split training and testing sets ----------------------------------------- -coding_traintest_split <- split_train_and_test_data(coding_traintest_keep) +coding_traintest_split <- split_train_and_test_data(coding_traintest_filtered) noncoding_traintest_split <- split_train_and_test_data(noncoding_traintest_filtered) # write out contigs names for each set ------------------------------------ From 0a3c232e16dd7a370cc9261f05c9ca26b02c2247 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 09:47:00 -0500 Subject: [PATCH 19/34] add new test data set links --- .../rnasamba/build/train_data_links.tsv | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/inputs/models/rnasamba/build/train_data_links.tsv b/inputs/models/rnasamba/build/train_data_links.tsv index df0b294..7d0679c 100644 --- a/inputs/models/rnasamba/build/train_data_links.tsv +++ b/inputs/models/rnasamba/build/train_data_links.tsv @@ -1,9 +1,17 @@ -organism root_url genome cdna_suffix ncrna_suffix genome_abbreviation -human https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/ Homo_sapiens.GRCh38 cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz GRCh38 -yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/ Saccharomyces_cerevisiae.R64-1-1 cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz R64-1-1 -worm https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/ Caenorhabditis_elegans.WBcel235 cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz WBcel235 -arabadopsis https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/arabidopsis_thaliana/ Arabidopsis_thaliana.TAIR10 cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz TAIR10 -drosophila 
https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/ Drosophila_melanogaster.BDGP6.46 cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz BDGP6.46 -dictyostelium_discoideum https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/ Dictyostelium_discoideum.dicty_2.7 cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz dicty_2.7 -mouse https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/ Mus_musculus.GRCm39 cdna/Mus_musculus.GRCm39.cdna.all.fa.gz ncrna/Mus_musculus.GRCm39.ncrna.fa.gz GRCm39 -zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz GRCz11 +organism root_url genome cdna_suffix ncrna_suffix genome_abbreviation set +human https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/ Homo_sapiens.GRCh38 cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz GRCh38 train +yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/ Saccharomyces_cerevisiae.R64-1-1 cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz R64-1-1 train +worm https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/ Caenorhabditis_elegans.WBcel235 cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz WBcel235 train +arabadopsis https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/arabidopsis_thaliana/ Arabidopsis_thaliana.TAIR10 cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz TAIR10 train +drosophila https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/ Drosophila_melanogaster.BDGP6.46 cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz BDGP6.46 train +dictyostelium_discoideum https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/ Dictyostelium_discoideum.dicty_2.7 cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz dicty_2.7 train +mouse https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/ Mus_musculus.GRCm39 cdna/Mus_musculus.GRCm39.cdna.all.fa.gz ncrna/Mus_musculus.GRCm39.ncrna.fa.gz GRCm39 train +zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz GRCz11 train +chicken https://ftp.ensembl.org/pub/release-111/fasta/gallus_gallus/ Gallus_gallus.bGalGal1.mat.broiler.GRCg7b cdna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.cdna.all.fa.gz ncrna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.ncrna.fa.gz bGalGal1.mat.broiler.GRCg7b test +rice https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/oryza_indica/ Oryza_indica.ASM465v1 cdna/Oryza_indica.ASM465v1.cdna.all.fa.gz ncrna/Oryza_indica.ASM465v1.ncrna.fa.gz ASM465v1 test +maize https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/zea_mays/ Zea_mays.Zm-B73-REFERENCE-NAM-5.0 cdna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.cdna.all.fa.gz ncrna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.ncrna.fa.gz Zm-B73-REFERENCE-NAM-5.0 test +frog https://ftp.ensembl.org/pub/release-111/fasta/xenopus_tropicalis/ Xenopus_tropicalis.UCB_Xtro_10.0 
cdna/Xenopus_tropicalis.UCB_Xtro_10.0.cdna.all.fa.gz ncrna/Xenopus_tropicalis.UCB_Xtro_10.0.ncrna.fa.gz UCB_Xtro_10.0 test +rat https://ftp.ensembl.org/pub/release-111/fasta/rattus_norvegicus/ Rattus_norvegicus.mRatBN7.2 cdna/Rattus_norvegicus.mRatBN7.2.cdna.all.fa.gz ncrna/Rattus_norvegicus.mRatBN7.2.ncrna.fa.gz mRatBN7 test +honeybee https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/apis_mellifera/ Apis_mellifera.Amel_HAv3.1 cdna/Apis_mellifera.Amel_HAv3.1.cdna.all.fa.gz ncrna/Apis_mellifera.Amel_HAv3.1.ncrna.fa.gz Amel_HAv3.1 test +fission_yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/schizosaccharomyces_pombe/ Schizosaccharomyces_pombe.ASM294v2 cdna/Schizosaccharomyces_pombe.ASM294v2.cdna.all.fa.gz ncrna/Schizosaccharomyces_pombe.ASM294v2.ncrna.fa.gz ASM294v2 test +tetrahymena https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/tetrahymena_thermophila/ Tetrahymena_thermophila.JCVI-TTA1-2.2 cdna/Tetrahymena_thermophila.JCVI-TTA1-2.2.cdna.all.fa.gz ncrna/Tetrahymena_thermophila.JCVI-TTA1-2.2.ncrna.fa.gz JCVI-TTA1-2.2 test \ No newline at end of file From 8972760848776a3cb97afb8f86c694205bab6430 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 12:47:12 -0500 Subject: [PATCH 20/34] update train and test sets to be different species --- curate_datasets.snakefile | 69 +++++--------- .../rnasamba/build/train_data_links.tsv | 2 +- scripts/calculate_sequence_statistics.R | 15 +-- scripts/parse_sequence_information.R | 23 +++++ ...ocess_sequences_into_nonoverlapping_sets.R | 95 +++++++++++-------- 5 files changed, 107 insertions(+), 97 deletions(-) create mode 100644 scripts/parse_sequence_information.R diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index 7ee1655..8d6c7d7 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -11,7 +11,7 @@ VALIDATION_TYPES = [ ] # inherits names from https://github.com/cbl-nabi/RNAChallenge CODING_TYPES = ["coding", "noncoding"] DATASET_TYPES = ["train", "test", "validation"] -MODEL_TYPES = ["eu", "human"] +MODEL_TYPES = ["eukaryote", "human"] rule all: @@ -83,19 +83,30 @@ rule combine_sequences: validation_type=VALIDATION_TYPES, ), output: - "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz", + "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa", shell: """ - cat {input} > {output} + cat {input} | gunzip > {output} """ +rule grab_all_sequence_names_and_lengths: + input: + "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa", + output: + "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.seqkit.fai", + conda: + "envs/seqkit.yml" + shell: + """ + seqkit faidx -f {input} + """ rule reduce_sequence_homology: """ To reduce pollution between training and testing set, cluster sequences at 80% sequence identity. """ input: - "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.gz", + "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa", output: "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", @@ -110,58 +121,29 @@ rule reduce_sequence_homology: rule grab_validation_set_names_and_lengths: + """ + The train/test data set sequences are identifiable by the genome information in the header, which is consistently formatted by Ensembl. + The same is not true for the validation data. 
+ This rule grabs the validation sequence header names so they can be separated from the train/test sets. + """ input: "inputs/validation/rnachallenge/{validation_type}.fa.gz", output: validation="inputs/validation/rnachallenge/{validation_type}.fa", - validation_fai="inputs/validation/rnachallenge/{validation_type}.fa.fai", + validation_fai="inputs/validation/rnachallenge/{validation_type}.fa.seqkit.fai", conda: "envs/seqkit.yml" shell: """ cat {input} | gunzip > {output.validation} - seqkit faidx {output.validation} - """ - - -rule grab_traintest_coding_names_and_lengths: - input: - expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome=GENOMES), - output: - coding="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa", - coding_fai="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa.fai", - conda: - "envs/seqkit.yml" - shell: - """ - cat {input} | gunzip > {output.coding} - seqkit faidx {output.coding} - """ - - -rule grab_traintest_noncoding_names_and_lengths: - input: - expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES), - output: - noncoding="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa", - noncoding_fai="outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa.fai", - conda: - "envs/seqkit.yml" - shell: - """ - cat {input} | gunzip > {output.noncoding} - seqkit faidx {output.noncoding} + seqkit faidx -f {output.validation} """ - rule process_sequences_into_nonoverlapping_sets: input: - traintest_fai=expand( - "outputs/models/rnasamba/build/2_sequence_sets/traintest/all_{coding_type}.fa.fai", - coding_type=CODING_TYPES, - ), + all_fai="outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.seqkit.fai", validation_fai=expand( - "inputs/validation/rnachallenge/{validation_type}.fa.fai", + "inputs/validation/rnachallenge/{validation_type}.fa.seqkit.fai", validation_type=VALIDATION_TYPES, ), clusters="outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", @@ -208,7 +190,7 @@ rule build_rnasamba_model: coding_type=CODING_TYPES, ), output: - "outputs/models/rnasamba/build/3_model/eu_rnasamba.hdf5", + "outputs/models/rnasamba/build/3_model/eukaryote_rnasamba.hdf5", conda: "envs/rnasamba.yml" shell: @@ -279,7 +261,6 @@ rule get_sequence_descriptors: seqkit faidx -f {input} """ - rule calculate_sequence_statistics: input: expand( diff --git a/inputs/models/rnasamba/build/train_data_links.tsv b/inputs/models/rnasamba/build/train_data_links.tsv index 7d0679c..ec514dd 100644 --- a/inputs/models/rnasamba/build/train_data_links.tsv +++ b/inputs/models/rnasamba/build/train_data_links.tsv @@ -1,4 +1,4 @@ -organism root_url genome cdna_suffix ncrna_suffix genome_abbreviation set +organism root_url genome cdna_suffix ncrna_suffix genome_abbreviation set_name human https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/ Homo_sapiens.GRCh38 cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz GRCh38 train yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/ Saccharomyces_cerevisiae.R64-1-1 cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz R64-1-1 train worm https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/ Caenorhabditis_elegans.WBcel235 cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz WBcel235 train diff --git 
a/scripts/calculate_sequence_statistics.R b/scripts/calculate_sequence_statistics.R index 2943880..2000051 100644 --- a/scripts/calculate_sequence_statistics.R +++ b/scripts/calculate_sequence_statistics.R @@ -1,22 +1,13 @@ library(tidyverse) +source("scripts/parse_sequence_information.R") files <- unlist(snakemake@input) +#files <- Sys.glob("outputs/models/rnasamba/build/2_sequence_sets/*.fa.seqkit.fai") fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") fai <- files %>% set_names() %>% map_dfr(read_tsv, col_names = fai_col_names, .id = "sequence_set") %>% - mutate(sequence_set = gsub(".fa.seqkit.fai", "", basename(sequence_set))) %>% - separate(sequence_set, into = c("coding_type", "set_name"), sep = "_", remove = FALSE) %>% - separate(sequence, into = c("sequence_id", "sequence_type", "chromosome", "gene", - "gene_biotype", "transcript_biotype", "gene_symbol", - "description"), - sep = " ", remove = FALSE, extra = "merge", fill = "right") %>% - separate(chromosome, into = c("chromosome_type", "genome", "chromosome_id", - "start", "end", "strand"), - sep = ":", remove = FALSE, extra = "merge", fill = "right") %>% - mutate(genome = ifelse(genome == "BDGP6", "BDGP6.46", genome)) %>% # fix some Drosophila names - mutate(genome = ifelse(sequence %in% c("K02E7.12.1", "F29C4.4.1", "C39B10.6b.1", "F53F4.16.1", "F53F4.17.1"), "WBcel235", genome)) %>% # fix some C. elegans genomes - mutate(length_description = ifelse(length <= 300, "<= 300nt", "> 300nt")) + parse_sequence_information_from_seqkit_fai(dataset_type = TRUE) fai %>% group_by(coding_type, set_name) %>% diff --git a/scripts/parse_sequence_information.R b/scripts/parse_sequence_information.R new file mode 100644 index 0000000..b5ada9a --- /dev/null +++ b/scripts/parse_sequence_information.R @@ -0,0 +1,23 @@ +library(tidyverse) + +parse_sequence_information_from_seqkit_fai <- function(df, dataset_type = FALSE){ + parsed_df <- df %>% + separate(sequence, into = c("sequence_id", "sequence_type", "chromosome", "gene", + "gene_biotype", "transcript_biotype", "gene_symbol", + "description"), + sep = " ", remove = FALSE, extra = "merge", fill = "right") %>% + separate(chromosome, into = c("chromosome_type", "genome", "chromosome_id", + "start", "end", "strand"), + sep = ":", remove = FALSE, extra = "merge", fill = "right") %>% + mutate(genome = ifelse(genome == "BDGP6", "BDGP6.46", genome)) %>% # fix some Drosophila names + mutate(genome = ifelse(sequence %in% c("K02E7.12.1", "F29C4.4.1", "C39B10.6b.1", "F53F4.16.1", "F53F4.17.1"), "WBcel235", genome)) %>% # fix some C. 
elegans genomes + mutate(length_description = ifelse(length <= 300, "<= 300nt", "> 300nt")) + + if(dataset_type == TRUE){ + parsed_df <- parsed_df %>% + mutate(sequence_set = gsub(".fa.seqkit.fai", "", basename(sequence_set))) %>% + separate(sequence_set, into = c("coding_type", "set_name"), sep = "_", remove = FALSE) + } + + return(parsed_df) +} diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index f99b85a..8bd2a12 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -1,16 +1,17 @@ library(readr) library(dplyr) +source("scripts/parse_sequence_information.R") # functions --------------------------------------------------------------- -split_train_and_test_data <- function(df, fraction = 0.8, seed = 1){ - # sample without replacement to select fraction of the data set as a training data set - set.seed(seed) # set seed so that the sample command gives the same results each time it is run - df_annotation <- sort(sample(nrow(df), nrow(df)* fraction)) - train <- df[df_annotation, ] - test <- df[-df_annotation, ] - return(list(train_df = train, test_df = test)) -} +# split_train_and_test_data <- function(df, fraction = 0.8, seed = 1){ +# # sample without replacement to select fraction of the data set as a training data set +# set.seed(seed) # set seed so that the sample command gives the same results each time it is run +# df_annotation <- sort(sample(nrow(df), nrow(df)* fraction)) +# train <- df[df_annotation, ] +# test <- df[-df_annotation, ] +# return(list(train_df = train, test_df = test)) +# } # read in data sets ------------------------------------------------------ @@ -18,32 +19,51 @@ fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") clusters <- read_tsv(snakemake@input[['clusters']], col_names = c("rep", "cluster_member")) coding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[1], col_names = fai_col_names) noncoding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[2], col_names = fai_col_names) -coding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[1], col_names = fai_col_names) -noncoding_traintest <- read_tsv(unlist(snakemake@input[['traintest_fai']])[2], col_names = fai_col_names) +all_fai <- read_tsv(snakemake@input[['all_fai']], col_names = fai_col_names) %>% + parse_sequence_information_from_seqkit_fai() + +metadata <- read_tsv("inputs/models/rnasamba/build/train_data_links.tsv") %>% + select(organism, genome = genome_abbreviation, set_name) fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") clusters <- read_tsv("outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", col_names = c("rep", "cluster_member")) -coding_validation <- read_tsv("inputs/validation/rnachallenge/mRNAs.fa.fai", col_names = fai_col_names) -noncoding_validation <- read_tsv("inputs/validation/rnachallenge/ncRNAs.fa.fai", col_names = fai_col_names) -coding_traintest <- read_tsv("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_coding.fa.fai", col_names = fai_col_names) -noncoding_traintest <- read_tsv("outputs/models/rnasamba/build/2_sequence_sets/traintest/all_noncoding.fa.fai", col_names = fai_col_names) -# filter to non-overlapping sets ------------------------------------------ +coding_validation <- read_tsv("inputs/validation/rnachallenge/mRNAs.fa.seqkit.fai", col_names = fai_col_names) +noncoding_validation <- 
read_tsv("inputs/validation/rnachallenge/ncRNAs.fa.seqkit.fai", col_names = fai_col_names) +all_fai <- read_tsv("outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.seqkit.fai", col_names = fai_col_names) + + +# homology reduction ------------------------------------------------------ + +# filter to homology reduced sequences -- keep sequences that had =<80% homology to other sequences +all_fai_filtered <- all_fai %>% + mutate(sequence_id = gsub(" .*", "", sequence)) %>% + filter(sequence_id %in% clusters$rep) %>% + select(-sequence_id) + +# data set separation ----------------------------------------------------- -noncoding_validation_filtered <- noncoding_validation %>% - filter(sequence %in% clusters$rep) %>% # keep sequences that had =<80% homology to other sequences - filter(!sequence %in% noncoding_traintest$sequence) %>% # remove sequences that are in the noncoding train/test set - filter(!sequence %in% coding_traintest$sequence) # remove sequences that ensembl now annotates as coding from validation +traintest <- all_fai_filtered %>% + filter(!sequence %in% c(coding_validation_filtered$sequence, noncoding_validation_filtered$sequence)) %>% # remove sequences that are in the validation sets + parse_sequence_information_from_seqkit_fai(dataset_type = FALSE) %>% + left_join(metadata, by = c("genome")) -coding_validation_filtered <- coding_validation %>% - filter(sequence %in% clusters$rep) %>% # keep sequences that had =<80% homology to other sequences - filter(!sequence %in% coding_traintest$sequence) %>% # remove seqencues that are in the coding train/test set - filter(!sequence %in% noncoding_traintest$sequence) # remove sequences that are in the noncoding train/test set +train <- traintest %>% + filter(set_name == "train") +coding_train <- train %>% filter(sequence_type == "cdna") +noncoding_train <- train %>% filter(sequence_type == "ncrna") -noncoding_traintest_filtered <- noncoding_traintest %>% - filter(sequence %in% clusters$rep) # keep sequences that had =<80% homology to other sequences +test <- traintest %>% + filter(set_name == "test") +coding_test <- test %>% filter(sequence_type == "cdna") +noncoding_test <- test %>% filter(sequence_type == "ncrna") -coding_traintest_filtered <- coding_traintest %>% - filter(sequence %in% clusters$rep) # keep sequences that had =<80% homology to other sequences +coding_validation_filtered <- all_fai_filtered %>% + filter(sequence %in% coding_validation$sequence) %>% # filter to coding sequences + filter(!sequence %in% traintest$sequence_id) # remove sequences that overlap with the training/testing data + +noncoding_validation_filtered <- all_fai_filtered %>% + filter(sequence %in% noncoding_validation$sequence) %>% # filter to noncoding sequences + filter(!sequence %in% traintest$sequence_id) # remove sequences that overlap with the training/testing data # augment the noncoding traintest set to balance sizes ------------------ @@ -52,22 +72,17 @@ coding_traintest_filtered <- coding_traintest %>% # but that would require altering the source code of the RNAsamba tool. # This approach should produce equivalent results. 
-num_rows_target <- nrow(coding_traintest_filtered) +num_rows_target <- nrow(coding_train) -noncoding_traintest_filtered <- noncoding_traintest_filtered %>% +noncoding_train <- noncoding_train %>% slice_sample(n = num_rows_target, replace = TRUE) -# split training and testing sets ----------------------------------------- - -coding_traintest_split <- split_train_and_test_data(coding_traintest_filtered) -noncoding_traintest_split <- split_train_and_test_data(noncoding_traintest_filtered) - # write out contigs names for each set ------------------------------------ + snakemake_output <- unlist(snakemake@output) -print(snakemake_output) -write.table(coding_traintest_split$train_df$sequence, snakemake_output[1], quote = F, col.names = F, row.names = F) -write.table(coding_traintest_split$test_df$sequence, snakemake_output[2], quote = F, col.names = F, row.names = F) -write.table(coding_validation_filtered$sequence, snakemake_output[3], quote = F, col.names = F, row.names = F) -write.table(noncoding_traintest_split$train_df$sequence, snakemake_output[4], quote = F, col.names = F, row.names = F) -write.table(noncoding_traintest_split$test_df$sequence, snakemake_output[5], quote = F, col.names = F, row.names = F) +write.table(coding_train$sequence, snakemake_output[1], quote = F, col.names = F, row.names = F) +write.table(coding_test$sequence, snakemake_output[2], quote = F, col.names = F, row.names = F) +write.table(coding_validation$sequence, snakemake_output[3], quote = F, col.names = F, row.names = F) +write.table(noncoding_train_split$sequence, snakemake_output[4], quote = F, col.names = F, row.names = F) +write.table(noncoding_test_split$sequence, snakemake_output[5], quote = F, col.names = F, row.names = F) write.table(noncoding_validation_filtered$sequence, snakemake_output[6], quote = F, col.names = F, row.names = F) From 55cc055d584ddc386b2def2ff99c0d6253e511fe Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 13:04:25 -0500 Subject: [PATCH 21/34] update pthas --- curate_datasets.snakefile | 88 +++++++++---------- .../build => datasets}/train_data_links.tsv | 0 2 files changed, 44 insertions(+), 44 deletions(-) rename inputs/models/{rnasamba/build => datasets}/train_data_links.tsv (100%) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index 8d6c7d7..d4f666c 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -2,7 +2,7 @@ import pandas as pd import os import re -metadata = pd.read_csv("inputs/models/rnasamba/build/train_data_links.tsv", sep="\t") +metadata = pd.read_csv("inputs/models/datasets/train_data_links.tsv", sep="\t") GENOMES = metadata["genome"].unique().tolist() RNA_TYPES = ["cdna", "ncrna"] # inherits names from ensembl VALIDATION_TYPES = [ @@ -16,9 +16,9 @@ MODEL_TYPES = ["eukaryote", "human"] rule all: input: - "outputs/models/rnasamba/build/5_stats/set_summary.tsv", + "outputs/models/datasets/3_stats/set_summary.tsv", expand( - "outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", + "outputs/models/build/rnasamba/1_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", model_type=MODEL_TYPES, dataset_type=DATASET_TYPES, ), @@ -35,7 +35,7 @@ rule download_ensembl_data: - ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz -> ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz (no change) """ output: - "inputs/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz", + "inputs/models/datasets/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz", run: genome_df = metadata.loc[(metadata["genome"] == wildcards.genome)] 
root_url = genome_df["root_url"].values[0] @@ -54,9 +54,9 @@ rule extract_protein_coding_orfs_from_cdna: Transcripts in the cDNA files have headers like: >TRANSCRIPT_ID SEQTYPE LOCATION GENE_ID GENE_BIOTYPE TRANSCRIPT_BIOTYPE, where the gene_biotype and transcript_biotype both contain information about whether the gene is coding or not. """ input: - "inputs/ensembl/cdna/{genome}.cdna.fa.gz", + "inputs/models/datasets/ensembl/cdna/{genome}.cdna.fa.gz", output: - "outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", + "outputs/models/datasets/0_coding/{genome}.fa.gz", conda: "envs/seqkit.yml" shell: @@ -67,7 +67,7 @@ rule extract_protein_coding_orfs_from_cdna: rule download_validation_data: output: - "inputs/validation/rnachallenge/{validation_type}.fa.gz", + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz", shell: """ curl -JL https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa | gzip > {output} @@ -76,14 +76,14 @@ rule download_validation_data: rule combine_sequences: input: - coding=expand("outputs/models/rnasamba/build/0_coding/{genome}.fa.gz", genome=GENOMES), - noncoding=expand("inputs/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES), + coding=expand("outputs/models/datasets/0_coding/{genome}.fa.gz", genome=GENOMES), + noncoding=expand("inputs/models/datasets/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES), validation=expand( - "inputs/validation/rnachallenge/{validation_type}.fa.gz", + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz", validation_type=VALIDATION_TYPES, ), output: - "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa", + "outputs/models/datasets/1_homology_reduction/all_sequences.fa", shell: """ cat {input} | gunzip > {output} @@ -91,9 +91,9 @@ rule combine_sequences: rule grab_all_sequence_names_and_lengths: input: - "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa", + "outputs/models/datasets/1_homology_reduction/all_sequences.fa", output: - "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.seqkit.fai", + "outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai", conda: "envs/seqkit.yml" shell: @@ -106,12 +106,12 @@ rule reduce_sequence_homology: To reduce pollution between training and testing set, cluster sequences at 80% sequence identity. """ input: - "outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa", + "outputs/models/datasets/1_homology_reduction/all_sequences.fa", output: - "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", - "outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", + "outputs/models/datasets/1_homology_reduction/clustered_sequences_rep_seq.fasta", + "outputs/models/datasets/1_homology_reduction/clustered_sequences_cluster.tsv", params: - prefix="outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences", + prefix="outputs/models/datasets/1_homology_reduction/clustered_sequences", conda: "envs/mmseqs2.yml" shell: @@ -127,10 +127,10 @@ rule grab_validation_set_names_and_lengths: This rule grabs the validation sequence header names so they can be separated from the train/test sets. 
""" input: - "inputs/validation/rnachallenge/{validation_type}.fa.gz", + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz", output: - validation="inputs/validation/rnachallenge/{validation_type}.fa", - validation_fai="inputs/validation/rnachallenge/{validation_type}.fa.seqkit.fai", + validation="inputs/models/datasets/validation/rnachallenge/{validation_type}.fa", + validation_fai="inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.seqkit.fai", conda: "envs/seqkit.yml" shell: @@ -141,15 +141,15 @@ rule grab_validation_set_names_and_lengths: rule process_sequences_into_nonoverlapping_sets: input: - all_fai="outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.seqkit.fai", + all_fai="outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai", validation_fai=expand( - "inputs/validation/rnachallenge/{validation_type}.fa.seqkit.fai", + "inputs/models/datsets/validation/rnachallenge/{validation_type}.fa.seqkit.fai", validation_type=VALIDATION_TYPES, ), - clusters="outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", + clusters="outputs/models/datasets/1_homology_reduction/clustered_sequences_cluster.tsv", output: expand( - "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt", + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.txt", coding_type=CODING_TYPES, dataset_type=DATASET_TYPES, ), @@ -161,10 +161,10 @@ rule process_sequences_into_nonoverlapping_sets: rule filter_sequence_sets: input: - fa="outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_rep_seq.fasta", - names="outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.txt", + fa="outputs/models/datasets/1_homology_reduction/clustered_sequences_rep_seq.fasta", + names="outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.txt", output: - "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa", conda: "envs/seqtk.yml" shell: @@ -186,11 +186,11 @@ rule build_rnasamba_model: """ input: expand( - "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_train.fa", + "outputs/models/datasets/2_sequence_sets/{coding_type}_train.fa", coding_type=CODING_TYPES, ), output: - "outputs/models/rnasamba/build/3_model/eukaryote_rnasamba.hdf5", + "outputs/models/build/rnasamba/0_model/eukaryote_rnasamba.hdf5", conda: "envs/rnasamba.yml" shell: @@ -201,13 +201,13 @@ rule build_rnasamba_model: rule assess_rnasamba_model: input: - model="outputs/models/rnasamba/build/3_model/{model_type}_rnasamba.hdf5", - faa="outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", + model="outputs/models/build/rnasamba/0_model/{model_type}_rnasamba.hdf5", + faa="outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa", output: - faa="outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.fa", - predictions="outputs/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv", + faa="outputs/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.fa", + predictions="outputs/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv", benchmark: - "benchmarks/models/rnasamba/build/4_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv" + "benchmarks/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv" 
conda: "envs/rnasamba.yml" shell: @@ -219,12 +219,12 @@ rule assess_rnasamba_model: rule calculate_rnasamba_model_accuracy: input: expand( - "outputs/models/rnasamba/build/4_evaluation/{{model_type}}/{coding_type}_{{dataset_type}}.tsv", + "outputs/models/build/rnasamba/1_evaluation/{{model_type}}/{coding_type}_{{dataset_type}}.tsv", coding_type=CODING_TYPES, ), output: - freq="outputs/models/rnasamba/build/4_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv", - metrics="outputs/models/rnasamba/build/4_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", + freq="outputs/models/build/build/1_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv", + metrics="outputs/models/build/rnasamba/1_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", conda: "envs/caret.yml" script: @@ -237,7 +237,7 @@ rule download_rnasamba_human_model: It's saved under a new name so we can use a wildcard to run rnasamba classify and to calculate model accuracy. """ output: - "outputs/models/rnasamba/build/3_model/human_rnasamba.hdf5", + "outputs/models/build/rnasamba/0_model/human_rnasamba.hdf5", shell: """ curl -JLo {output} https://github.com/apcamargo/RNAsamba/raw/master/data/full_length_weights.hdf5 @@ -251,9 +251,9 @@ rule download_rnasamba_human_model: rule get_sequence_descriptors: input: - "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa", + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa", output: - "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", conda: "envs/seqkit.yml" shell: @@ -264,14 +264,14 @@ rule get_sequence_descriptors: rule calculate_sequence_statistics: input: expand( - "outputs/models/rnasamba/build/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", coding_type=CODING_TYPES, dataset_type=DATASET_TYPES, ), output: - set_summary="outputs/models/rnasamba/build/5_stats/set_summary.tsv", - set_length_summary="outputs/models/rnasamba/build/5_stats/set_length_summary.tsv", - set_length_genome_summary="outputs/models/rnasamba/build/5_stats/set_length_genome_summary.tsv", + set_summary="outputs/models/datasets/3_stats/set_summary.tsv", + set_length_summary="outputs/models/datasets/3_stats/set_length_summary.tsv", + set_length_genome_summary="outputs/models/datasets/3_stats/set_length_genome_summary.tsv", conda: "envs/tidyverse.yml" script: diff --git a/inputs/models/rnasamba/build/train_data_links.tsv b/inputs/models/datasets/train_data_links.tsv similarity index 100% rename from inputs/models/rnasamba/build/train_data_links.tsv rename to inputs/models/datasets/train_data_links.tsv From aace13f8393e13b2622b884e83b5afc63d81893e Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 13:06:22 -0500 Subject: [PATCH 22/34] linting --- curate_datasets.snakefile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index d4f666c..c244c5a 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -77,7 +77,9 @@ rule download_validation_data: rule combine_sequences: input: coding=expand("outputs/models/datasets/0_coding/{genome}.fa.gz", genome=GENOMES), - noncoding=expand("inputs/models/datasets/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES), + noncoding=expand( + 
"inputs/models/datasets/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES + ), validation=expand( "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz", validation_type=VALIDATION_TYPES, @@ -89,6 +91,7 @@ rule combine_sequences: cat {input} | gunzip > {output} """ + rule grab_all_sequence_names_and_lengths: input: "outputs/models/datasets/1_homology_reduction/all_sequences.fa", @@ -101,6 +104,7 @@ rule grab_all_sequence_names_and_lengths: seqkit faidx -f {input} """ + rule reduce_sequence_homology: """ To reduce pollution between training and testing set, cluster sequences at 80% sequence identity. @@ -139,11 +143,12 @@ rule grab_validation_set_names_and_lengths: seqkit faidx -f {output.validation} """ + rule process_sequences_into_nonoverlapping_sets: input: all_fai="outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai", validation_fai=expand( - "inputs/models/datsets/validation/rnachallenge/{validation_type}.fa.seqkit.fai", + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.seqkit.fai", validation_type=VALIDATION_TYPES, ), clusters="outputs/models/datasets/1_homology_reduction/clustered_sequences_cluster.tsv", @@ -261,6 +266,7 @@ rule get_sequence_descriptors: seqkit faidx -f {input} """ + rule calculate_sequence_statistics: input: expand( From b54f0cb34f4477db62b2981c6de8a085abcec379 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 13:08:13 -0500 Subject: [PATCH 23/34] try update rnasamba env for gpu --- envs/rnasamba.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/envs/rnasamba.yml b/envs/rnasamba.yml index 972284a..55536ac 100644 --- a/envs/rnasamba.yml +++ b/envs/rnasamba.yml @@ -3,4 +3,11 @@ channels: - bioconda - defaults dependencies: - - rnasamba=0.2.5 + - python=3.8 + - pip + - pip: + - nvidia-pyindex + - nvidia-tensorflow>=1.15 + - keras>=2.1.0,<2.3.0 + - biopython + - --no-deps rnasamba From 6879d04a85fdc2f7788c542a197f45cd95aedfb2 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 18:57:52 +0000 Subject: [PATCH 24/34] deal with rnasamba install for gpu --- curate_datasets.snakefile | 21 +++++++++++++++++++-- envs/rnasamba.yml | 2 -- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index c244c5a..efdc5c2 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -182,6 +182,22 @@ rule filter_sequence_sets: ## Build RNAsamba model ################################################################## +rule pip_install_rnasamba_no_deps: + """ + To take advantage of nvidia GPU on AWS instance "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" (ami-07eb000b3340966b0), + we need to install specific versions of tensorflow and other dependencies. + This is accomplished in the envs/rnasamba.yml file, however rnasamba itself is not installed there because we need to use the command: + pip install --no-deps rnasamba + and there is no way to specify the "--no-deps" flag in a yaml file. + This rule installs rnasamba into the conda-generated environment. + """ + output: "outputs/models/build/rnasamba/rnasamba_installed.txt" + conda: "envs/rnasamba.yml" + shell:''' + pip install 'nvidia-tensorflow>=1.15' + pip install --no-deps rnasamba + touch {output} + ''' rule build_rnasamba_model: """ @@ -190,7 +206,8 @@ rule build_rnasamba_model: It is the number of epochs after lowest validation loss before stopping training. 
""" input: - expand( + rnasamba = "outputs/models/build/rnasamba/rnasamba_installed.txt", + fa = expand( "outputs/models/datasets/2_sequence_sets/{coding_type}_train.fa", coding_type=CODING_TYPES, ), @@ -200,7 +217,7 @@ rule build_rnasamba_model: "envs/rnasamba.yml" shell: """ - rnasamba train --early_stopping 5 --verbose 2 {output} {input[0]} {input[1]} + rnasamba train --early_stopping 5 --verbose 2 {output} {input.fa[0]} {input.fa[1]} """ diff --git a/envs/rnasamba.yml b/envs/rnasamba.yml index 55536ac..8749a0e 100644 --- a/envs/rnasamba.yml +++ b/envs/rnasamba.yml @@ -7,7 +7,5 @@ dependencies: - pip - pip: - nvidia-pyindex - - nvidia-tensorflow>=1.15 - keras>=2.1.0,<2.3.0 - biopython - - --no-deps rnasamba From a073c1369183f7c14a519f25f6729acc33e37a7b Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 14:01:24 -0500 Subject: [PATCH 25/34] update file pointers for data processing --- curate_datasets.snakefile | 1 + ...ocess_sequences_into_nonoverlapping_sets.R | 20 +------------------ 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index efdc5c2..ac5781e 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -146,6 +146,7 @@ rule grab_validation_set_names_and_lengths: rule process_sequences_into_nonoverlapping_sets: input: + metadata="inputs/models/datasets/train_data_links.tsv", all_fai="outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai", validation_fai=expand( "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.seqkit.fai", diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index 8bd2a12..0bf0ef6 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -2,17 +2,6 @@ library(readr) library(dplyr) source("scripts/parse_sequence_information.R") -# functions --------------------------------------------------------------- - -# split_train_and_test_data <- function(df, fraction = 0.8, seed = 1){ -# # sample without replacement to select fraction of the data set as a training data set -# set.seed(seed) # set seed so that the sample command gives the same results each time it is run -# df_annotation <- sort(sample(nrow(df), nrow(df)* fraction)) -# train <- df[df_annotation, ] -# test <- df[-df_annotation, ] -# return(list(train_df = train, test_df = test)) -# } - # read in data sets ------------------------------------------------------ fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") @@ -22,16 +11,9 @@ noncoding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[2], all_fai <- read_tsv(snakemake@input[['all_fai']], col_names = fai_col_names) %>% parse_sequence_information_from_seqkit_fai() -metadata <- read_tsv("inputs/models/rnasamba/build/train_data_links.tsv") %>% +metadata <- read_tsv(snakemake@input[['metadata']]) %>% select(organism, genome = genome_abbreviation, set_name) -fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") -clusters <- read_tsv("outputs/models/rnasamba/build/1_homology_reduction/clustered_sequences_cluster.tsv", col_names = c("rep", "cluster_member")) -coding_validation <- read_tsv("inputs/validation/rnachallenge/mRNAs.fa.seqkit.fai", col_names = fai_col_names) -noncoding_validation <- read_tsv("inputs/validation/rnachallenge/ncRNAs.fa.seqkit.fai", col_names = fai_col_names) -all_fai <- 
read_tsv("outputs/models/rnasamba/build/1_homology_reduction/all_sequences.fa.seqkit.fai", col_names = fai_col_names) - - # homology reduction ------------------------------------------------------ # filter to homology reduced sequences -- keep sequences that had =<80% homology to other sequences From 98c1a98ce5296173608ebed335c1652ee4e843ca Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 14:02:10 -0500 Subject: [PATCH 26/34] linting --- curate_datasets.snakefile | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index ac5781e..ca3e3e2 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -183,6 +183,7 @@ rule filter_sequence_sets: ## Build RNAsamba model ################################################################## + rule pip_install_rnasamba_no_deps: """ To take advantage of nvidia GPU on AWS instance "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" (ami-07eb000b3340966b0), @@ -192,13 +193,17 @@ rule pip_install_rnasamba_no_deps: and there is no way to specify the "--no-deps" flag in a yaml file. This rule installs rnasamba into the conda-generated environment. """ - output: "outputs/models/build/rnasamba/rnasamba_installed.txt" - conda: "envs/rnasamba.yml" - shell:''' - pip install 'nvidia-tensorflow>=1.15' - pip install --no-deps rnasamba - touch {output} - ''' + output: + "outputs/models/build/rnasamba/rnasamba_installed.txt", + conda: + "envs/rnasamba.yml" + shell: + """ + pip install 'nvidia-tensorflow>=1.15' + pip install --no-deps rnasamba + touch {output} + """ + rule build_rnasamba_model: """ @@ -207,8 +212,8 @@ rule build_rnasamba_model: It is the number of epochs after lowest validation loss before stopping training. 
""" input: - rnasamba = "outputs/models/build/rnasamba/rnasamba_installed.txt", - fa = expand( + rnasamba="outputs/models/build/rnasamba/rnasamba_installed.txt", + fa=expand( "outputs/models/datasets/2_sequence_sets/{coding_type}_train.fa", coding_type=CODING_TYPES, ), From 43e2eae7e8a3c741a5f86a154e9331a0ba0187f6 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 19:25:10 +0000 Subject: [PATCH 27/34] fix typos --- scripts/process_sequences_into_nonoverlapping_sets.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index 0bf0ef6..e2ef135 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -25,7 +25,7 @@ all_fai_filtered <- all_fai %>% # data set separation ----------------------------------------------------- traintest <- all_fai_filtered %>% - filter(!sequence %in% c(coding_validation_filtered$sequence, noncoding_validation_filtered$sequence)) %>% # remove sequences that are in the validation sets + filter(!sequence %in% c(coding_validation$sequence, noncoding_validation$sequence)) %>% # remove sequences that are in the validation sets parse_sequence_information_from_seqkit_fai(dataset_type = FALSE) %>% left_join(metadata, by = c("genome")) @@ -64,7 +64,7 @@ noncoding_train <- noncoding_train %>% snakemake_output <- unlist(snakemake@output) write.table(coding_train$sequence, snakemake_output[1], quote = F, col.names = F, row.names = F) write.table(coding_test$sequence, snakemake_output[2], quote = F, col.names = F, row.names = F) -write.table(coding_validation$sequence, snakemake_output[3], quote = F, col.names = F, row.names = F) -write.table(noncoding_train_split$sequence, snakemake_output[4], quote = F, col.names = F, row.names = F) -write.table(noncoding_test_split$sequence, snakemake_output[5], quote = F, col.names = F, row.names = F) +write.table(coding_validation_filtered$sequence, snakemake_output[3], quote = F, col.names = F, row.names = F) +write.table(noncoding_train$sequence, snakemake_output[4], quote = F, col.names = F, row.names = F) +write.table(noncoding_test$sequence, snakemake_output[5], quote = F, col.names = F, row.names = F) write.table(noncoding_validation_filtered$sequence, snakemake_output[6], quote = F, col.names = F, row.names = F) From f614cbdf920a260cd4424ce6437a54080ee2522a Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 19:26:46 +0000 Subject: [PATCH 28/34] add a benchmark for model building --- curate_datasets.snakefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index ca3e3e2..af8b5c2 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -221,6 +221,8 @@ rule build_rnasamba_model: "outputs/models/build/rnasamba/0_model/eukaryote_rnasamba.hdf5", conda: "envs/rnasamba.yml" + benchmark: + "benchmarks/models/build/rnasamba/0_model/eukaryote_rnasamba.tsv" shell: """ rnasamba train --early_stopping 5 --verbose 2 {output} {input.fa[0]} {input.fa[1]} From 1a7d863daa678cf7748952b541190fbae34d8ea7 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 13 Feb 2024 14:33:23 -0500 Subject: [PATCH 29/34] missing new line eof --- inputs/models/datasets/train_data_links.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs/models/datasets/train_data_links.tsv b/inputs/models/datasets/train_data_links.tsv index ec514dd..0578b74 
100644 --- a/inputs/models/datasets/train_data_links.tsv +++ b/inputs/models/datasets/train_data_links.tsv @@ -14,4 +14,4 @@ frog https://ftp.ensembl.org/pub/release-111/fasta/xenopus_tropicalis/ Xenopus_t rat https://ftp.ensembl.org/pub/release-111/fasta/rattus_norvegicus/ Rattus_norvegicus.mRatBN7.2 cdna/Rattus_norvegicus.mRatBN7.2.cdna.all.fa.gz ncrna/Rattus_norvegicus.mRatBN7.2.ncrna.fa.gz mRatBN7 test honeybee https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/apis_mellifera/ Apis_mellifera.Amel_HAv3.1 cdna/Apis_mellifera.Amel_HAv3.1.cdna.all.fa.gz ncrna/Apis_mellifera.Amel_HAv3.1.ncrna.fa.gz Amel_HAv3.1 test fission_yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/schizosaccharomyces_pombe/ Schizosaccharomyces_pombe.ASM294v2 cdna/Schizosaccharomyces_pombe.ASM294v2.cdna.all.fa.gz ncrna/Schizosaccharomyces_pombe.ASM294v2.ncrna.fa.gz ASM294v2 test -tetrahymena https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/tetrahymena_thermophila/ Tetrahymena_thermophila.JCVI-TTA1-2.2 cdna/Tetrahymena_thermophila.JCVI-TTA1-2.2.cdna.all.fa.gz ncrna/Tetrahymena_thermophila.JCVI-TTA1-2.2.ncrna.fa.gz JCVI-TTA1-2.2 test \ No newline at end of file +tetrahymena https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/tetrahymena_thermophila/ Tetrahymena_thermophila.JCVI-TTA1-2.2 cdna/Tetrahymena_thermophila.JCVI-TTA1-2.2.cdna.all.fa.gz ncrna/Tetrahymena_thermophila.JCVI-TTA1-2.2.ncrna.fa.gz JCVI-TTA1-2.2 test From 3e24ad552bbdeb15a1c4ad3cad9f9a1a5d387670 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 14 Feb 2024 00:52:18 +0000 Subject: [PATCH 30/34] woops filepath typo --- curate_datasets.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index af8b5c2..1dec64e 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -253,7 +253,7 @@ rule calculate_rnasamba_model_accuracy: coding_type=CODING_TYPES, ), output: - freq="outputs/models/build/build/1_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv", + freq="outputs/models/build/rnasamba/1_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv", metrics="outputs/models/build/rnasamba/1_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", conda: "envs/caret.yml" From f97ade025819a7124f7e4b1952120079f2b5d3c2 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 14 Feb 2024 13:25:30 +0000 Subject: [PATCH 31/34] clean up versions around rnasamba env --- curate_datasets.snakefile | 4 ++-- envs/rnasamba.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index 1dec64e..57fabfb 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -199,8 +199,8 @@ rule pip_install_rnasamba_no_deps: "envs/rnasamba.yml" shell: """ - pip install 'nvidia-tensorflow>=1.15' - pip install --no-deps rnasamba + pip install 'nvidia-tensorflow~=1.15' + pip install --no-deps rnasamba # used version 0.2.5 touch {output} """ diff --git a/envs/rnasamba.yml b/envs/rnasamba.yml index 8749a0e..ff75535 100644 --- a/envs/rnasamba.yml +++ b/envs/rnasamba.yml @@ -6,6 +6,6 @@ dependencies: - python=3.8 - pip - pip: - - nvidia-pyindex + - nvidia-pyindex=1.0.9 - keras>=2.1.0,<2.3.0 - - biopython + - biopython=1.83 From ee3c279c584c2df510f6837d3aae92cfa756e776 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 14 Feb 2024 13:26:41 +0000 Subject: [PATCH 32/34] clean up class weights comment --- 
scripts/process_sequences_into_nonoverlapping_sets.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R index e2ef135..849e3a1 100644 --- a/scripts/process_sequences_into_nonoverlapping_sets.R +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -50,9 +50,9 @@ noncoding_validation_filtered <- all_fai_filtered %>% # augment the noncoding traintest set to balance sizes ------------------ # To avoid biases in classification, it's better to have the same number of sequences in the different classes. -# This could also be done with changing the weights on the activation function of the model building in tensorflow, +# This could also be done with changing the weights on the loss function of the model building in tensorflow, # but that would require altering the source code of the RNAsamba tool. -# This approach should produce equivalent results. +# This approach should produce approximately equivalent results. num_rows_target <- nrow(coding_train) From dcb8b2a7a2d709f07d32d2bb3cdd7634502ef1ff Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 14 Feb 2024 13:31:00 +0000 Subject: [PATCH 33/34] add note about order of arguments --- curate_datasets.snakefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/curate_datasets.snakefile b/curate_datasets.snakefile index 57fabfb..31ce4ee 100644 --- a/curate_datasets.snakefile +++ b/curate_datasets.snakefile @@ -9,6 +9,11 @@ VALIDATION_TYPES = [ "mRNAs", "ncRNAs", ] # inherits names from https://github.com/cbl-nabi/RNAChallenge +# Note that for variables CODING_TYPES and DATASET_TYPES, the order matters; +# The snakemake expand() function maintains the order of these arguments. +# This behavior is used later in the snakefile to specify inputs and outputs of different scripts/shell commands. +# The order of CODING_TYPES must correspond to the order of the positional args that are later passed to rnasamba. +# In addition, the order of CODING_TYPES and DATASET_TYPES is used to declare outputs in the rule/R script process_sequences_into_nonoverlapping_sets. CODING_TYPES = ["coding", "noncoding"] DATASET_TYPES = ["train", "test", "validation"] MODEL_TYPES = ["eukaryote", "human"] From 1287e5c2046714809767bcfaeeff1a8da10d007e Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 14 Feb 2024 13:31:55 +0000 Subject: [PATCH 34/34] change snakefile name since we were able to keep rnasamba build commands --- ...tasets.snakefile => curate_datasets_and_build_models.snakefile | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename curate_datasets.snakefile => curate_datasets_and_build_models.snakefile (100%) diff --git a/curate_datasets.snakefile b/curate_datasets_and_build_models.snakefile similarity index 100% rename from curate_datasets.snakefile rename to curate_datasets_and_build_models.snakefile
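
A minimal sketch of the expand() ordering behavior that the note added in PATCH 33 relies on (illustrative only, not part of any patch above; it assumes a working Snakemake installation and reuses the snakefile's CODING_TYPES values):

    from snakemake.io import expand  # Snakemake's expand() helper

    CODING_TYPES = ["coding", "noncoding"]

    # expand() preserves the order of the values it is given, so the resulting
    # list is coding first, noncoding second.
    fa = expand(
        "outputs/models/datasets/2_sequence_sets/{coding_type}_train.fa",
        coding_type=CODING_TYPES,
    )
    print(fa)
    # ['outputs/models/datasets/2_sequence_sets/coding_train.fa',
    #  'outputs/models/datasets/2_sequence_sets/noncoding_train.fa']

    # fa[0] is therefore the coding training set and fa[1] the noncoding one,
    # which is the positional order that rule build_rnasamba_model passes to
    # `rnasamba train {output} {input.fa[0]} {input.fa[1]}`.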