diff --git a/curate_datasets_and_build_models.snakefile b/curate_datasets_and_build_models.snakefile new file mode 100644 index 0000000..31ce4ee --- /dev/null +++ b/curate_datasets_and_build_models.snakefile @@ -0,0 +1,314 @@ +import pandas as pd +import os +import re + +metadata = pd.read_csv("inputs/models/datasets/train_data_links.tsv", sep="\t") +GENOMES = metadata["genome"].unique().tolist() +RNA_TYPES = ["cdna", "ncrna"] # inherits names from ensembl +VALIDATION_TYPES = [ + "mRNAs", + "ncRNAs", +] # inherits names from https://github.com/cbl-nabi/RNAChallenge +# Note that for variables CODING_TYPES and DATASET_TYPES, the order matters; +# The snakemake expand() function maintains the order of these arguments. +# This behavior is used later in the snakefile to specify inputs and outputs of different scripts/shell commands. +# The order of CODING_TYPES must correspond to the order of the positional args that are later passed to rnasamba. +# In addition, the order of CODING_TYPES and DATASET_TYPES is used to declare outputs in the rule/R script process_sequences_into_nonoverlapping_sets. +CODING_TYPES = ["coding", "noncoding"] +DATASET_TYPES = ["train", "test", "validation"] +MODEL_TYPES = ["eukaryote", "human"] + + +rule all: + input: + "outputs/models/datasets/3_stats/set_summary.tsv", + expand( + "outputs/models/build/rnasamba/1_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", + model_type=MODEL_TYPES, + dataset_type=DATASET_TYPES, + ), + + +rule download_ensembl_data: + """ + Download ensembl cDNA and ncRNA files. + Ensembl annotates protein coding and non-coding RNA transcripts in their files. + This information will be used to separate protein coding from non-coding RNAs to build an RNAsamba model. + Note this download renames genome files from their names on ensembl to make them simpler to point to. + See example transformations below: + - cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz -> cdna/Homo_sapiens.GRCh38.cdna.fa.gz (dropped "all.") + - ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz -> ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz (no change) + """ + output: + "inputs/models/datasets/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz", + run: + genome_df = metadata.loc[(metadata["genome"] == wildcards.genome)] + root_url = genome_df["root_url"].values[0] + if wildcards.rna_type == "cdna": + suffix = genome_df["cdna_suffix"].values[0] + else: + suffix = genome_df["ncrna_suffix"].values[0] + + url = root_url + suffix + shell("curl -JLo {output} {url}") + + +rule extract_protein_coding_orfs_from_cdna: + """ + Ensembl cDNA files consist of transcript sequences for actual and possible genes, including pseudogenes, NMD and the like. + Transcripts in the cDNA files have headers like: >TRANSCRIPT_ID SEQTYPE LOCATION GENE_ID GENE_BIOTYPE TRANSCRIPT_BIOTYPE, where the gene_biotype and transcript_biotype both contain information about whether the gene is coding or not. + """ + input: + "inputs/models/datasets/ensembl/cdna/{genome}.cdna.fa.gz", + output: + "outputs/models/datasets/0_coding/{genome}.fa.gz", + conda: + "envs/seqkit.yml" + shell: + """ + seqkit grep --use-regexp --by-name --pattern "transcript_biotype:protein_coding" -o {output} {input} + """ + + +rule download_validation_data: + output: + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz", + shell: + """ + curl -JL https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa | gzip > {output} + """ + + +rule combine_sequences: + input: + coding=expand("outputs/models/datasets/0_coding/{genome}.fa.gz", genome=GENOMES), + noncoding=expand( + "inputs/models/datasets/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES + ), + validation=expand( + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz", + validation_type=VALIDATION_TYPES, + ), + output: + "outputs/models/datasets/1_homology_reduction/all_sequences.fa", + shell: + """ + cat {input} | gunzip > {output} + """ + + +rule grab_all_sequence_names_and_lengths: + input: + "outputs/models/datasets/1_homology_reduction/all_sequences.fa", + output: + "outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai", + conda: + "envs/seqkit.yml" + shell: + """ + seqkit faidx -f {input} + """ + + +rule reduce_sequence_homology: + """ + To reduce pollution between training and testing set, cluster sequences at 80% sequence identity. + """ + input: + "outputs/models/datasets/1_homology_reduction/all_sequences.fa", + output: + "outputs/models/datasets/1_homology_reduction/clustered_sequences_rep_seq.fasta", + "outputs/models/datasets/1_homology_reduction/clustered_sequences_cluster.tsv", + params: + prefix="outputs/models/datasets/1_homology_reduction/clustered_sequences", + conda: + "envs/mmseqs2.yml" + shell: + """ + mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2 + """ + + +rule grab_validation_set_names_and_lengths: + """ + The train/test data set sequences are identifiable by the genome information in the header, which is consistently formatted by Ensembl. + The same is not true for the validation data. + This rule grabs the validation sequence header names so they can be separated from the train/test sets. + """ + input: + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz", + output: + validation="inputs/models/datasets/validation/rnachallenge/{validation_type}.fa", + validation_fai="inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.seqkit.fai", + conda: + "envs/seqkit.yml" + shell: + """ + cat {input} | gunzip > {output.validation} + seqkit faidx -f {output.validation} + """ + + +rule process_sequences_into_nonoverlapping_sets: + input: + metadata="inputs/models/datasets/train_data_links.tsv", + all_fai="outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai", + validation_fai=expand( + "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.seqkit.fai", + validation_type=VALIDATION_TYPES, + ), + clusters="outputs/models/datasets/1_homology_reduction/clustered_sequences_cluster.tsv", + output: + expand( + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.txt", + coding_type=CODING_TYPES, + dataset_type=DATASET_TYPES, + ), + conda: + "envs/tidyverse.yml" + script: + "scripts/process_sequences_into_nonoverlapping_sets.R" + + +rule filter_sequence_sets: + input: + fa="outputs/models/datasets/1_homology_reduction/clustered_sequences_rep_seq.fasta", + names="outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.txt", + output: + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa", + conda: + "envs/seqtk.yml" + shell: + """ + seqtk subseq {input.fa} {input.names} > {output} + """ + + +################################################################## +## Build RNAsamba model +################################################################## + + +rule pip_install_rnasamba_no_deps: + """ + To take advantage of nvidia GPU on AWS instance "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" (ami-07eb000b3340966b0), + we need to install specific versions of tensorflow and other dependencies. + This is accomplished in the envs/rnasamba.yml file, however rnasamba itself is not installed there because we need to use the command: + pip install --no-deps rnasamba + and there is no way to specify the "--no-deps" flag in a yaml file. + This rule installs rnasamba into the conda-generated environment. + """ + output: + "outputs/models/build/rnasamba/rnasamba_installed.txt", + conda: + "envs/rnasamba.yml" + shell: + """ + pip install 'nvidia-tensorflow~=1.15' + pip install --no-deps rnasamba # used version 0.2.5 + touch {output} + """ + + +rule build_rnasamba_model: + """ + Build a new rnasamba model from the training data curated above. + The --early_stopping parameter reduces training time and can help avoid overfitting. + It is the number of epochs after lowest validation loss before stopping training. + """ + input: + rnasamba="outputs/models/build/rnasamba/rnasamba_installed.txt", + fa=expand( + "outputs/models/datasets/2_sequence_sets/{coding_type}_train.fa", + coding_type=CODING_TYPES, + ), + output: + "outputs/models/build/rnasamba/0_model/eukaryote_rnasamba.hdf5", + conda: + "envs/rnasamba.yml" + benchmark: + "benchmarks/models/build/rnasamba/0_model/eukaryote_rnasamba.tsv" + shell: + """ + rnasamba train --early_stopping 5 --verbose 2 {output} {input.fa[0]} {input.fa[1]} + """ + + +rule assess_rnasamba_model: + input: + model="outputs/models/build/rnasamba/0_model/{model_type}_rnasamba.hdf5", + faa="outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa", + output: + faa="outputs/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.fa", + predictions="outputs/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv", + benchmark: + "benchmarks/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv" + conda: + "envs/rnasamba.yml" + shell: + """ + rnasamba classify --protein_fasta {output.faa} {output.predictions} {input.faa} {input.model} + """ + + +rule calculate_rnasamba_model_accuracy: + input: + expand( + "outputs/models/build/rnasamba/1_evaluation/{{model_type}}/{coding_type}_{{dataset_type}}.tsv", + coding_type=CODING_TYPES, + ), + output: + freq="outputs/models/build/rnasamba/1_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv", + metrics="outputs/models/build/rnasamba/1_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv", + conda: + "envs/caret.yml" + script: + "scripts/calculate_rnasamba_model_accuracy.R" + + +rule download_rnasamba_human_model: + """ + Use this model to compare whether the new model performs better or worse. + It's saved under a new name so we can use a wildcard to run rnasamba classify and to calculate model accuracy. + """ + output: + "outputs/models/build/rnasamba/0_model/human_rnasamba.hdf5", + shell: + """ + curl -JLo {output} https://github.com/apcamargo/RNAsamba/raw/master/data/full_length_weights.hdf5 + """ + + +################################################################## +## Get sequence statistics +################################################################## + + +rule get_sequence_descriptors: + input: + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa", + output: + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", + conda: + "envs/seqkit.yml" + shell: + """ + seqkit faidx -f {input} + """ + + +rule calculate_sequence_statistics: + input: + expand( + "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai", + coding_type=CODING_TYPES, + dataset_type=DATASET_TYPES, + ), + output: + set_summary="outputs/models/datasets/3_stats/set_summary.tsv", + set_length_summary="outputs/models/datasets/3_stats/set_length_summary.tsv", + set_length_genome_summary="outputs/models/datasets/3_stats/set_length_genome_summary.tsv", + conda: + "envs/tidyverse.yml" + script: + "scripts/calculate_sequence_statistics.R" diff --git a/envs/caret.yml b/envs/caret.yml new file mode 100644 index 0000000..413025a --- /dev/null +++ b/envs/caret.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-tidyverse=2.0.0 + - r-caret=6.0_94 diff --git a/envs/dev.yml b/envs/dev.yml index 3c70ebc..38ee968 100644 --- a/envs/dev.yml +++ b/envs/dev.yml @@ -8,4 +8,5 @@ dependencies: - python=3.12.0 - ruff=0.1.6 - snakefmt=0.8.5 - - snakemake-minimal=8.0.1 + - snakemake-minimal=8.4.2 + - pandas=2.2.0 diff --git a/envs/mmseqs2.yml b/envs/mmseqs2.yml new file mode 100644 index 0000000..339d8b2 --- /dev/null +++ b/envs/mmseqs2.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - mmseqs2=15.6f452 diff --git a/envs/rnasamba.yml b/envs/rnasamba.yml new file mode 100644 index 0000000..ff75535 --- /dev/null +++ b/envs/rnasamba.yml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.8 + - pip + - pip: + - nvidia-pyindex=1.0.9 + - keras>=2.1.0,<2.3.0 + - biopython=1.83 diff --git a/envs/seqkit.yml b/envs/seqkit.yml new file mode 100644 index 0000000..c9e3ad5 --- /dev/null +++ b/envs/seqkit.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - seqkit=2.6.1 diff --git a/envs/seqtk.yml b/envs/seqtk.yml new file mode 100644 index 0000000..ecb2f15 --- /dev/null +++ b/envs/seqtk.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - seqtk=1.4 diff --git a/envs/tidyverse.yml b/envs/tidyverse.yml new file mode 100644 index 0000000..ee0ca28 --- /dev/null +++ b/envs/tidyverse.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-tidyverse=2.0.0 diff --git a/inputs/models/datasets/train_data_links.tsv b/inputs/models/datasets/train_data_links.tsv new file mode 100644 index 0000000..0578b74 --- /dev/null +++ b/inputs/models/datasets/train_data_links.tsv @@ -0,0 +1,17 @@ +organism root_url genome cdna_suffix ncrna_suffix genome_abbreviation set_name +human https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/ Homo_sapiens.GRCh38 cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz GRCh38 train +yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/ Saccharomyces_cerevisiae.R64-1-1 cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz R64-1-1 train +worm https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/ Caenorhabditis_elegans.WBcel235 cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz WBcel235 train +arabadopsis https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/arabidopsis_thaliana/ Arabidopsis_thaliana.TAIR10 cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz TAIR10 train +drosophila https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/ Drosophila_melanogaster.BDGP6.46 cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz BDGP6.46 train +dictyostelium_discoideum https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/ Dictyostelium_discoideum.dicty_2.7 cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz dicty_2.7 train +mouse https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/ Mus_musculus.GRCm39 cdna/Mus_musculus.GRCm39.cdna.all.fa.gz ncrna/Mus_musculus.GRCm39.ncrna.fa.gz GRCm39 train +zebrafish https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/ Danio_rerio.GRCz11 cdna/Danio_rerio.GRCz11.cdna.all.fa.gz ncrna/Danio_rerio.GRCz11.ncrna.fa.gz GRCz11 train +chicken https://ftp.ensembl.org/pub/release-111/fasta/gallus_gallus/ Gallus_gallus.bGalGal1.mat.broiler.GRCg7b cdna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.cdna.all.fa.gz ncrna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.ncrna.fa.gz bGalGal1.mat.broiler.GRCg7b test +rice https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/oryza_indica/ Oryza_indica.ASM465v1 cdna/Oryza_indica.ASM465v1.cdna.all.fa.gz ncrna/Oryza_indica.ASM465v1.ncrna.fa.gz ASM465v1 test +maize https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/zea_mays/ Zea_mays.Zm-B73-REFERENCE-NAM-5.0 cdna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.cdna.all.fa.gz ncrna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.ncrna.fa.gz Zm-B73-REFERENCE-NAM-5.0 test +frog https://ftp.ensembl.org/pub/release-111/fasta/xenopus_tropicalis/ Xenopus_tropicalis.UCB_Xtro_10.0 cdna/Xenopus_tropicalis.UCB_Xtro_10.0.cdna.all.fa.gz ncrna/Xenopus_tropicalis.UCB_Xtro_10.0.ncrna.fa.gz UCB_Xtro_10.0 test +rat https://ftp.ensembl.org/pub/release-111/fasta/rattus_norvegicus/ Rattus_norvegicus.mRatBN7.2 cdna/Rattus_norvegicus.mRatBN7.2.cdna.all.fa.gz ncrna/Rattus_norvegicus.mRatBN7.2.ncrna.fa.gz mRatBN7 test +honeybee https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/apis_mellifera/ Apis_mellifera.Amel_HAv3.1 cdna/Apis_mellifera.Amel_HAv3.1.cdna.all.fa.gz ncrna/Apis_mellifera.Amel_HAv3.1.ncrna.fa.gz Amel_HAv3.1 test +fission_yeast https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/schizosaccharomyces_pombe/ Schizosaccharomyces_pombe.ASM294v2 cdna/Schizosaccharomyces_pombe.ASM294v2.cdna.all.fa.gz ncrna/Schizosaccharomyces_pombe.ASM294v2.ncrna.fa.gz ASM294v2 test +tetrahymena https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/tetrahymena_thermophila/ Tetrahymena_thermophila.JCVI-TTA1-2.2 cdna/Tetrahymena_thermophila.JCVI-TTA1-2.2.cdna.all.fa.gz ncrna/Tetrahymena_thermophila.JCVI-TTA1-2.2.ncrna.fa.gz JCVI-TTA1-2.2 test diff --git a/pyproject.toml b/pyproject.toml index ede457e..bd55cb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,5 +59,5 @@ no-lines-before = ["future", "standard-library"] [tool.snakefmt] # the line length here should match the one used by ruff line_length = 100 -include = 'Snakefile|Snakefile_*' +include = 'Snakefile|Snakefile_*|\.snakefile$|\.smk$' exclude = 'dev' diff --git a/scripts/calculate_rnasamba_model_accuracy.R b/scripts/calculate_rnasamba_model_accuracy.R new file mode 100644 index 0000000..3260929 --- /dev/null +++ b/scripts/calculate_rnasamba_model_accuracy.R @@ -0,0 +1,29 @@ +library(tidyverse) +library(caret) + +files <- unlist(snakemake@input) + +# read in and format model results for each dataset type +model_predictions <- files %>% + set_names() %>% + map_dfr(read_tsv, .id = "tmp") %>% + mutate(tmp = gsub(".tsv", "", basename(tmp))) %>% + separate(tmp, into = c("coding_type", "dataset_type"), sep = "_", remove = T) %>% + mutate(coding_type = as.factor(coding_type), + classification = as.factor(classification)) + +# determine model performance +model_confusion_matrix <- confusionMatrix(data = model_predictions$classification, + reference = model_predictions$coding_type) + +# convert the model performance metrics into a data frame +model_confusion_matrix_df <- bind_rows(data.frame(value = model_confusion_matrix$overall), + data.frame(value = model_confusion_matrix$byClass)) %>% + rownames_to_column("metric") %>% + mutate(dataset_type = model_predictions$dataset_type[1]) # set dataset type as column even tho it will be in the file name too + +# convert the confusion matrix into a data frame +model_confusion_matrix_freq_df <- model_confusion_matrix %>% as.table() %>% data.frame() + +write_tsv(model_confusion_matrix_df, snakemake@output[['metrics']]) +write_tsv(model_confusion_matrix_freq_df, snakemake@output[['freq']]) diff --git a/scripts/calculate_sequence_statistics.R b/scripts/calculate_sequence_statistics.R new file mode 100644 index 0000000..2000051 --- /dev/null +++ b/scripts/calculate_sequence_statistics.R @@ -0,0 +1,29 @@ +library(tidyverse) +source("scripts/parse_sequence_information.R") + +files <- unlist(snakemake@input) +#files <- Sys.glob("outputs/models/rnasamba/build/2_sequence_sets/*.fa.seqkit.fai") +fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") +fai <- files %>% + set_names() %>% + map_dfr(read_tsv, col_names = fai_col_names, .id = "sequence_set") %>% + parse_sequence_information_from_seqkit_fai(dataset_type = TRUE) + +fai %>% + group_by(coding_type, set_name) %>% + tally() %>% + arrange(set_name) %>% + write_tsv(snakemake@output[['set_summary']]) + +fai %>% + group_by(coding_type, set_name, length_description) %>% + tally() %>% + arrange(set_name) %>% + write_tsv(snakemake@output[['set_length_summary']]) + +fai %>% + group_by(coding_type, set_name, length_description, genome) %>% + tally() %>% + arrange(set_name) %>% + pivot_wider(id_cols = c(coding_type, set_name, length_description), names_from = "genome", values_from = "n") %>% + write_tsv(snakemake@output[['set_length_genome_summary']]) diff --git a/scripts/parse_sequence_information.R b/scripts/parse_sequence_information.R new file mode 100644 index 0000000..b5ada9a --- /dev/null +++ b/scripts/parse_sequence_information.R @@ -0,0 +1,23 @@ +library(tidyverse) + +parse_sequence_information_from_seqkit_fai <- function(df, dataset_type = FALSE){ + parsed_df <- df %>% + separate(sequence, into = c("sequence_id", "sequence_type", "chromosome", "gene", + "gene_biotype", "transcript_biotype", "gene_symbol", + "description"), + sep = " ", remove = FALSE, extra = "merge", fill = "right") %>% + separate(chromosome, into = c("chromosome_type", "genome", "chromosome_id", + "start", "end", "strand"), + sep = ":", remove = FALSE, extra = "merge", fill = "right") %>% + mutate(genome = ifelse(genome == "BDGP6", "BDGP6.46", genome)) %>% # fix some Drosophila names + mutate(genome = ifelse(sequence %in% c("K02E7.12.1", "F29C4.4.1", "C39B10.6b.1", "F53F4.16.1", "F53F4.17.1"), "WBcel235", genome)) %>% # fix some C. elegans genomes + mutate(length_description = ifelse(length <= 300, "<= 300nt", "> 300nt")) + + if(dataset_type == TRUE){ + parsed_df <- parsed_df %>% + mutate(sequence_set = gsub(".fa.seqkit.fai", "", basename(sequence_set))) %>% + separate(sequence_set, into = c("coding_type", "set_name"), sep = "_", remove = FALSE) + } + + return(parsed_df) +} diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R new file mode 100644 index 0000000..849e3a1 --- /dev/null +++ b/scripts/process_sequences_into_nonoverlapping_sets.R @@ -0,0 +1,70 @@ +library(readr) +library(dplyr) +source("scripts/parse_sequence_information.R") + +# read in data sets ------------------------------------------------------ + +fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth") +clusters <- read_tsv(snakemake@input[['clusters']], col_names = c("rep", "cluster_member")) +coding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[1], col_names = fai_col_names) +noncoding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[2], col_names = fai_col_names) +all_fai <- read_tsv(snakemake@input[['all_fai']], col_names = fai_col_names) %>% + parse_sequence_information_from_seqkit_fai() + +metadata <- read_tsv(snakemake@input[['metadata']]) %>% + select(organism, genome = genome_abbreviation, set_name) + +# homology reduction ------------------------------------------------------ + +# filter to homology reduced sequences -- keep sequences that had =<80% homology to other sequences +all_fai_filtered <- all_fai %>% + mutate(sequence_id = gsub(" .*", "", sequence)) %>% + filter(sequence_id %in% clusters$rep) %>% + select(-sequence_id) + +# data set separation ----------------------------------------------------- + +traintest <- all_fai_filtered %>% + filter(!sequence %in% c(coding_validation$sequence, noncoding_validation$sequence)) %>% # remove sequences that are in the validation sets + parse_sequence_information_from_seqkit_fai(dataset_type = FALSE) %>% + left_join(metadata, by = c("genome")) + +train <- traintest %>% + filter(set_name == "train") +coding_train <- train %>% filter(sequence_type == "cdna") +noncoding_train <- train %>% filter(sequence_type == "ncrna") + +test <- traintest %>% + filter(set_name == "test") +coding_test <- test %>% filter(sequence_type == "cdna") +noncoding_test <- test %>% filter(sequence_type == "ncrna") + +coding_validation_filtered <- all_fai_filtered %>% + filter(sequence %in% coding_validation$sequence) %>% # filter to coding sequences + filter(!sequence %in% traintest$sequence_id) # remove sequences that overlap with the training/testing data + +noncoding_validation_filtered <- all_fai_filtered %>% + filter(sequence %in% noncoding_validation$sequence) %>% # filter to noncoding sequences + filter(!sequence %in% traintest$sequence_id) # remove sequences that overlap with the training/testing data + +# augment the noncoding traintest set to balance sizes ------------------ + +# To avoid biases in classification, it's better to have the same number of sequences in the different classes. +# This could also be done with changing the weights on the loss function of the model building in tensorflow, +# but that would require altering the source code of the RNAsamba tool. +# This approach should produce approximately equivalent results. + +num_rows_target <- nrow(coding_train) + +noncoding_train <- noncoding_train %>% + slice_sample(n = num_rows_target, replace = TRUE) + +# write out contigs names for each set ------------------------------------ + +snakemake_output <- unlist(snakemake@output) +write.table(coding_train$sequence, snakemake_output[1], quote = F, col.names = F, row.names = F) +write.table(coding_test$sequence, snakemake_output[2], quote = F, col.names = F, row.names = F) +write.table(coding_validation_filtered$sequence, snakemake_output[3], quote = F, col.names = F, row.names = F) +write.table(noncoding_train$sequence, snakemake_output[4], quote = F, col.names = F, row.names = F) +write.table(noncoding_test$sequence, snakemake_output[5], quote = F, col.names = F, row.names = F) +write.table(noncoding_validation_filtered$sequence, snakemake_output[6], quote = F, col.names = F, row.names = F)