diff --git a/curate_datasets_and_build_models.snakefile b/curate_datasets_and_build_models.snakefile
new file mode 100644
index 0000000..31ce4ee
--- /dev/null
+++ b/curate_datasets_and_build_models.snakefile
@@ -0,0 +1,314 @@
+import pandas as pd
+import os
+import re
+
+metadata = pd.read_csv("inputs/models/datasets/train_data_links.tsv", sep="\t")
+GENOMES = metadata["genome"].unique().tolist()
+RNA_TYPES = ["cdna", "ncrna"]  # inherits names from ensembl
+VALIDATION_TYPES = [
+    "mRNAs",
+    "ncRNAs",
+]  # inherits names from https://github.com/cbl-nabi/RNAChallenge
+# Note that for variables CODING_TYPES and DATASET_TYPES, the order matters;
+# The snakemake expand() function maintains the order of these arguments.
+# This behavior is used later in the snakefile to specify inputs and outputs of different scripts/shell commands.
+# The order of CODING_TYPES must correspond to the order of the positional args that are later passed to rnasamba.
+# In addition, the order of CODING_TYPES and DATASET_TYPES is used to declare outputs in the rule/R script process_sequences_into_nonoverlapping_sets.
+CODING_TYPES = ["coding", "noncoding"]
+DATASET_TYPES = ["train", "test", "validation"]
+MODEL_TYPES = ["eukaryote", "human"]
+
+
+rule all:
+    input:
+        "outputs/models/datasets/3_stats/set_summary.tsv",
+        expand(
+            "outputs/models/build/rnasamba/1_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv",
+            model_type=MODEL_TYPES,
+            dataset_type=DATASET_TYPES,
+        ),
+
+
+rule download_ensembl_data:
+    """
+    Download ensembl cDNA and ncRNA files.
+    Ensembl annotates protein coding and non-coding RNA transcripts in their files.
+    This information will be used to separate protein coding from non-coding RNAs to build an RNAsamba model.
+    Note this download renames genome files from their names on ensembl to make them simpler to point to.
+    See example transformations below:
+    - cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz -> cdna/Homo_sapiens.GRCh38.cdna.fa.gz (dropped "all.")
+    - ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz   -> ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz (no change)
+    """
+    output:
+        "inputs/models/datasets/ensembl/{rna_type}/{genome}.{rna_type}.fa.gz",
+    run:
+        genome_df = metadata.loc[(metadata["genome"] == wildcards.genome)]
+        root_url = genome_df["root_url"].values[0]
+        if wildcards.rna_type == "cdna":
+            suffix = genome_df["cdna_suffix"].values[0]
+        else:
+            suffix = genome_df["ncrna_suffix"].values[0]
+
+        url = root_url + suffix
+        shell("curl -JLo {output} {url}")
+
+
+rule extract_protein_coding_orfs_from_cdna:
+    """
+    Ensembl cDNA files consist of transcript sequences for actual and possible genes, including pseudogenes, NMD and the like.
+    Transcripts in the cDNA files have headers like: >TRANSCRIPT_ID SEQTYPE LOCATION GENE_ID GENE_BIOTYPE TRANSCRIPT_BIOTYPE, where the gene_biotype and transcript_biotype both contain information about whether the gene is coding or not.
+    """
+    input:
+        "inputs/models/datasets/ensembl/cdna/{genome}.cdna.fa.gz",
+    output:
+        "outputs/models/datasets/0_coding/{genome}.fa.gz",
+    conda:
+        "envs/seqkit.yml"
+    shell:
+        """
+        seqkit grep --use-regexp --by-name --pattern "transcript_biotype:protein_coding" -o {output} {input}
+        """
+
+
+rule download_validation_data:
+    output:
+        "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz",
+    shell:
+        """
+        curl -JL https://raw.githubusercontent.com/cbl-nabi/RNAChallenge/main/RNAchallenge/{wildcards.validation_type}.fa | gzip > {output}
+        """
+
+
+rule combine_sequences:
+    input:
+        coding=expand("outputs/models/datasets/0_coding/{genome}.fa.gz", genome=GENOMES),
+        noncoding=expand(
+            "inputs/models/datasets/ensembl/ncrna/{genome}.ncrna.fa.gz", genome=GENOMES
+        ),
+        validation=expand(
+            "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz",
+            validation_type=VALIDATION_TYPES,
+        ),
+    output:
+        "outputs/models/datasets/1_homology_reduction/all_sequences.fa",
+    shell:
+        """
+        cat {input} | gunzip > {output}
+        """
+
+
+rule grab_all_sequence_names_and_lengths:
+    input:
+        "outputs/models/datasets/1_homology_reduction/all_sequences.fa",
+    output:
+        "outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai",
+    conda:
+        "envs/seqkit.yml"
+    shell:
+        """
+        seqkit faidx -f {input}
+        """
+
+
+rule reduce_sequence_homology:
+    """
+    To reduce pollution between training and testing set, cluster sequences at 80% sequence identity.
+    """
+    input:
+        "outputs/models/datasets/1_homology_reduction/all_sequences.fa",
+    output:
+        "outputs/models/datasets/1_homology_reduction/clustered_sequences_rep_seq.fasta",
+        "outputs/models/datasets/1_homology_reduction/clustered_sequences_cluster.tsv",
+    params:
+        prefix="outputs/models/datasets/1_homology_reduction/clustered_sequences",
+    conda:
+        "envs/mmseqs2.yml"
+    shell:
+        """
+        mmseqs easy-cluster {input} {params.prefix} tmp_mmseqs2 --min-seq-id 0.8 --cov-mode 1 --cluster-mode 2
+        """
+
+
+rule grab_validation_set_names_and_lengths:
+    """
+    The train/test data set sequences are identifiable by the genome information in the header, which is consistently formatted by Ensembl.
+    The same is not true for the validation data.
+    This rule grabs the validation sequence header names so they can be separated from the train/test sets.
+    """
+    input:
+        "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.gz",
+    output:
+        validation="inputs/models/datasets/validation/rnachallenge/{validation_type}.fa",
+        validation_fai="inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.seqkit.fai",
+    conda:
+        "envs/seqkit.yml"
+    shell:
+        """
+        cat {input} | gunzip > {output.validation}
+        seqkit faidx -f {output.validation}
+        """
+
+
+rule process_sequences_into_nonoverlapping_sets:
+    input:
+        metadata="inputs/models/datasets/train_data_links.tsv",
+        all_fai="outputs/models/datasets/1_homology_reduction/all_sequences.fa.seqkit.fai",
+        validation_fai=expand(
+            "inputs/models/datasets/validation/rnachallenge/{validation_type}.fa.seqkit.fai",
+            validation_type=VALIDATION_TYPES,
+        ),
+        clusters="outputs/models/datasets/1_homology_reduction/clustered_sequences_cluster.tsv",
+    output:
+        expand(
+            "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.txt",
+            coding_type=CODING_TYPES,
+            dataset_type=DATASET_TYPES,
+        ),
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/process_sequences_into_nonoverlapping_sets.R"
+
+
+rule filter_sequence_sets:
+    input:
+        fa="outputs/models/datasets/1_homology_reduction/clustered_sequences_rep_seq.fasta",
+        names="outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.txt",
+    output:
+        "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa",
+    conda:
+        "envs/seqtk.yml"
+    shell:
+        """
+        seqtk subseq {input.fa} {input.names} > {output}
+        """
+
+
+##################################################################
+## Build RNAsamba model
+##################################################################
+
+
+rule pip_install_rnasamba_no_deps:
+    """
+    To take advantage of nvidia GPU on AWS instance "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" (ami-07eb000b3340966b0),
+    we need to install specific versions of tensorflow and other dependencies.
+    This is accomplished in the envs/rnasamba.yml file, however rnasamba itself is not installed there because we need to use the command:
+    pip install --no-deps rnasamba
+    and there is no way to specify the "--no-deps" flag in a yaml file.
+    This rule installs rnasamba into the conda-generated environment.
+    """
+    output:
+        "outputs/models/build/rnasamba/rnasamba_installed.txt",
+    conda:
+        "envs/rnasamba.yml"
+    shell:
+        """
+        pip install 'nvidia-tensorflow~=1.15'
+        pip install --no-deps rnasamba # used version 0.2.5
+        touch {output}
+        """
+
+
+rule build_rnasamba_model:
+    """
+    Build a new rnasamba model from the training data curated above.
+    The --early_stopping parameter reduces training time and can help avoid overfitting.
+    It is the number of epochs after lowest validation loss before stopping training.
+    """
+    input:
+        rnasamba="outputs/models/build/rnasamba/rnasamba_installed.txt",
+        fa=expand(
+            "outputs/models/datasets/2_sequence_sets/{coding_type}_train.fa",
+            coding_type=CODING_TYPES,
+        ),
+    output:
+        "outputs/models/build/rnasamba/0_model/eukaryote_rnasamba.hdf5",
+    conda:
+        "envs/rnasamba.yml"
+    benchmark:
+        "benchmarks/models/build/rnasamba/0_model/eukaryote_rnasamba.tsv"
+    shell:
+        """
+        rnasamba train --early_stopping 5 --verbose 2 {output} {input.fa[0]} {input.fa[1]}
+        """
+
+
+rule assess_rnasamba_model:
+    input:
+        model="outputs/models/build/rnasamba/0_model/{model_type}_rnasamba.hdf5",
+        faa="outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa",
+    output:
+        faa="outputs/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.fa",
+        predictions="outputs/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv",
+    benchmark:
+        "benchmarks/models/build/rnasamba/1_evaluation/{model_type}/{coding_type}_{dataset_type}.tsv"
+    conda:
+        "envs/rnasamba.yml"
+    shell:
+        """
+        rnasamba classify --protein_fasta {output.faa} {output.predictions} {input.faa} {input.model}
+        """
+
+
+rule calculate_rnasamba_model_accuracy:
+    input:
+        expand(
+            "outputs/models/build/rnasamba/1_evaluation/{{model_type}}/{coding_type}_{{dataset_type}}.tsv",
+            coding_type=CODING_TYPES,
+        ),
+    output:
+        freq="outputs/models/build/rnasamba/1_evaluation/{model_type}/confusionmatrix_{dataset_type}.tsv",
+        metrics="outputs/models/build/rnasamba/1_evaluation/{model_type}/accuracy_metrics_{dataset_type}.tsv",
+    conda:
+        "envs/caret.yml"
+    script:
+        "scripts/calculate_rnasamba_model_accuracy.R"
+
+
+rule download_rnasamba_human_model:
+    """
+    Use this model to compare whether the new model performs better or worse.
+    It's saved under a new name so we can use a wildcard to run rnasamba classify and to calculate model accuracy.
+    """
+    output:
+        "outputs/models/build/rnasamba/0_model/human_rnasamba.hdf5",
+    shell:
+        """
+        curl -JLo {output} https://github.com/apcamargo/RNAsamba/raw/master/data/full_length_weights.hdf5
+        """
+
+
+##################################################################
+## Get sequence statistics
+##################################################################
+
+
+rule get_sequence_descriptors:
+    input:
+        "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa",
+    output:
+        "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai",
+    conda:
+        "envs/seqkit.yml"
+    shell:
+        """
+        seqkit faidx -f {input}
+        """
+
+
+rule calculate_sequence_statistics:
+    input:
+        expand(
+            "outputs/models/datasets/2_sequence_sets/{coding_type}_{dataset_type}.fa.seqkit.fai",
+            coding_type=CODING_TYPES,
+            dataset_type=DATASET_TYPES,
+        ),
+    output:
+        set_summary="outputs/models/datasets/3_stats/set_summary.tsv",
+        set_length_summary="outputs/models/datasets/3_stats/set_length_summary.tsv",
+        set_length_genome_summary="outputs/models/datasets/3_stats/set_length_genome_summary.tsv",
+    conda:
+        "envs/tidyverse.yml"
+    script:
+        "scripts/calculate_sequence_statistics.R"
diff --git a/envs/caret.yml b/envs/caret.yml
new file mode 100644
index 0000000..413025a
--- /dev/null
+++ b/envs/caret.yml
@@ -0,0 +1,7 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+   - r-tidyverse=2.0.0 
+   - r-caret=6.0_94 
diff --git a/envs/dev.yml b/envs/dev.yml
index 3c70ebc..38ee968 100644
--- a/envs/dev.yml
+++ b/envs/dev.yml
@@ -8,4 +8,5 @@ dependencies:
   - python=3.12.0
   - ruff=0.1.6
   - snakefmt=0.8.5
-  - snakemake-minimal=8.0.1
+  - snakemake-minimal=8.4.2
+  - pandas=2.2.0
diff --git a/envs/mmseqs2.yml b/envs/mmseqs2.yml
new file mode 100644
index 0000000..339d8b2
--- /dev/null
+++ b/envs/mmseqs2.yml
@@ -0,0 +1,6 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+   - mmseqs2=15.6f452
diff --git a/envs/rnasamba.yml b/envs/rnasamba.yml
new file mode 100644
index 0000000..ff75535
--- /dev/null
+++ b/envs/rnasamba.yml
@@ -0,0 +1,11 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+  - python=3.8
+  - pip
+  - pip:
+    - nvidia-pyindex=1.0.9
+    - keras>=2.1.0,<2.3.0
+    - biopython=1.83
diff --git a/envs/seqkit.yml b/envs/seqkit.yml
new file mode 100644
index 0000000..c9e3ad5
--- /dev/null
+++ b/envs/seqkit.yml
@@ -0,0 +1,6 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+   - seqkit=2.6.1
diff --git a/envs/seqtk.yml b/envs/seqtk.yml
new file mode 100644
index 0000000..ecb2f15
--- /dev/null
+++ b/envs/seqtk.yml
@@ -0,0 +1,6 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+   - seqtk=1.4
diff --git a/envs/tidyverse.yml b/envs/tidyverse.yml
new file mode 100644
index 0000000..ee0ca28
--- /dev/null
+++ b/envs/tidyverse.yml
@@ -0,0 +1,6 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+   - r-tidyverse=2.0.0 
diff --git a/inputs/models/datasets/train_data_links.tsv b/inputs/models/datasets/train_data_links.tsv
new file mode 100644
index 0000000..0578b74
--- /dev/null
+++ b/inputs/models/datasets/train_data_links.tsv
@@ -0,0 +1,17 @@
+organism	root_url	genome	cdna_suffix	ncrna_suffix	genome_abbreviation	set_name
+human	https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/	Homo_sapiens.GRCh38	cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz	ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz	GRCh38	train
+yeast	https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/saccharomyces_cerevisiae/	Saccharomyces_cerevisiae.R64-1-1	cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz	ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz	R64-1-1	train
+worm	https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/caenorhabditis_elegans/	Caenorhabditis_elegans.WBcel235	cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz	ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz	WBcel235	train
+arabadopsis	https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/arabidopsis_thaliana/	Arabidopsis_thaliana.TAIR10	cdna/Arabidopsis_thaliana.TAIR10.cdna.all.fa.gz	ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz	TAIR10	train
+drosophila	https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/drosophila_melanogaster/	Drosophila_melanogaster.BDGP6.46	cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz	ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz	BDGP6.46	train
+dictyostelium_discoideum	https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/dictyostelium_discoideum/	Dictyostelium_discoideum.dicty_2.7	cdna/Dictyostelium_discoideum.dicty_2.7.cdna.all.fa.gz	ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz	dicty_2.7	train
+mouse	https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/	Mus_musculus.GRCm39	cdna/Mus_musculus.GRCm39.cdna.all.fa.gz	ncrna/Mus_musculus.GRCm39.ncrna.fa.gz	GRCm39	train
+zebrafish	https://ftp.ensembl.org/pub/release-111/fasta/danio_rerio/	Danio_rerio.GRCz11	cdna/Danio_rerio.GRCz11.cdna.all.fa.gz	ncrna/Danio_rerio.GRCz11.ncrna.fa.gz	GRCz11	train
+chicken	https://ftp.ensembl.org/pub/release-111/fasta/gallus_gallus/	Gallus_gallus.bGalGal1.mat.broiler.GRCg7b	cdna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.cdna.all.fa.gz	ncrna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.ncrna.fa.gz	bGalGal1.mat.broiler.GRCg7b	test
+rice	https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/oryza_indica/	Oryza_indica.ASM465v1	cdna/Oryza_indica.ASM465v1.cdna.all.fa.gz	ncrna/Oryza_indica.ASM465v1.ncrna.fa.gz	ASM465v1	test
+maize	https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-58/fasta/zea_mays/	Zea_mays.Zm-B73-REFERENCE-NAM-5.0	cdna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.cdna.all.fa.gz	ncrna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.ncrna.fa.gz	Zm-B73-REFERENCE-NAM-5.0	test
+frog	https://ftp.ensembl.org/pub/release-111/fasta/xenopus_tropicalis/	Xenopus_tropicalis.UCB_Xtro_10.0	cdna/Xenopus_tropicalis.UCB_Xtro_10.0.cdna.all.fa.gz	ncrna/Xenopus_tropicalis.UCB_Xtro_10.0.ncrna.fa.gz	UCB_Xtro_10.0	test
+rat	https://ftp.ensembl.org/pub/release-111/fasta/rattus_norvegicus/	Rattus_norvegicus.mRatBN7.2	cdna/Rattus_norvegicus.mRatBN7.2.cdna.all.fa.gz	ncrna/Rattus_norvegicus.mRatBN7.2.ncrna.fa.gz	mRatBN7	test
+honeybee	https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-58/fasta/apis_mellifera/	Apis_mellifera.Amel_HAv3.1	cdna/Apis_mellifera.Amel_HAv3.1.cdna.all.fa.gz	ncrna/Apis_mellifera.Amel_HAv3.1.ncrna.fa.gz	Amel_HAv3.1	test
+fission_yeast	https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-58/fasta/schizosaccharomyces_pombe/	Schizosaccharomyces_pombe.ASM294v2	cdna/Schizosaccharomyces_pombe.ASM294v2.cdna.all.fa.gz	ncrna/Schizosaccharomyces_pombe.ASM294v2.ncrna.fa.gz	ASM294v2	test
+tetrahymena	https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-58/fasta/tetrahymena_thermophila/	Tetrahymena_thermophila.JCVI-TTA1-2.2	cdna/Tetrahymena_thermophila.JCVI-TTA1-2.2.cdna.all.fa.gz	ncrna/Tetrahymena_thermophila.JCVI-TTA1-2.2.ncrna.fa.gz	JCVI-TTA1-2.2	test
diff --git a/pyproject.toml b/pyproject.toml
index ede457e..bd55cb4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,5 +59,5 @@ no-lines-before = ["future", "standard-library"]
 [tool.snakefmt]
 # the line length here should match the one used by ruff
 line_length = 100
-include = 'Snakefile|Snakefile_*'
+include = 'Snakefile|Snakefile_*|\.snakefile$|\.smk$'
 exclude = 'dev'
diff --git a/scripts/calculate_rnasamba_model_accuracy.R b/scripts/calculate_rnasamba_model_accuracy.R
new file mode 100644
index 0000000..3260929
--- /dev/null
+++ b/scripts/calculate_rnasamba_model_accuracy.R
@@ -0,0 +1,29 @@
+library(tidyverse)
+library(caret)
+
+files <- unlist(snakemake@input)
+
+# read in and format model results for each dataset type
+model_predictions <- files %>%
+  set_names() %>%
+  map_dfr(read_tsv, .id = "tmp") %>%
+  mutate(tmp = gsub(".tsv", "", basename(tmp))) %>%
+  separate(tmp, into = c("coding_type", "dataset_type"), sep = "_", remove = T) %>%
+  mutate(coding_type = as.factor(coding_type),
+         classification = as.factor(classification))  
+
+# determine model performance
+model_confusion_matrix <- confusionMatrix(data = model_predictions$classification, 
+                                          reference = model_predictions$coding_type)
+
+# convert the model performance metrics into a data frame
+model_confusion_matrix_df <- bind_rows(data.frame(value = model_confusion_matrix$overall), 
+                                       data.frame(value = model_confusion_matrix$byClass)) %>%
+  rownames_to_column("metric") %>%
+  mutate(dataset_type = model_predictions$dataset_type[1]) # set dataset type as column even tho it will be in the file name too
+
+# convert the confusion matrix into a data frame
+model_confusion_matrix_freq_df <- model_confusion_matrix %>% as.table() %>% data.frame()
+
+write_tsv(model_confusion_matrix_df, snakemake@output[['metrics']])
+write_tsv(model_confusion_matrix_freq_df, snakemake@output[['freq']])
diff --git a/scripts/calculate_sequence_statistics.R b/scripts/calculate_sequence_statistics.R
new file mode 100644
index 0000000..2000051
--- /dev/null
+++ b/scripts/calculate_sequence_statistics.R
@@ -0,0 +1,29 @@
+library(tidyverse)
+source("scripts/parse_sequence_information.R")
+
+files <- unlist(snakemake@input)
+#files <- Sys.glob("outputs/models/rnasamba/build/2_sequence_sets/*.fa.seqkit.fai")
+fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth")
+fai <- files %>%
+  set_names() %>%
+  map_dfr(read_tsv, col_names = fai_col_names, .id = "sequence_set") %>%
+  parse_sequence_information_from_seqkit_fai(dataset_type = TRUE)
+
+fai %>%
+  group_by(coding_type, set_name) %>%
+  tally() %>%
+  arrange(set_name) %>%
+  write_tsv(snakemake@output[['set_summary']])
+
+fai %>%
+  group_by(coding_type, set_name, length_description) %>%
+  tally() %>%
+  arrange(set_name) %>%
+  write_tsv(snakemake@output[['set_length_summary']])
+
+fai %>%
+  group_by(coding_type, set_name, length_description, genome) %>%
+  tally() %>%
+  arrange(set_name) %>%
+  pivot_wider(id_cols = c(coding_type, set_name, length_description), names_from = "genome", values_from = "n") %>%
+  write_tsv(snakemake@output[['set_length_genome_summary']])
diff --git a/scripts/parse_sequence_information.R b/scripts/parse_sequence_information.R
new file mode 100644
index 0000000..b5ada9a
--- /dev/null
+++ b/scripts/parse_sequence_information.R
@@ -0,0 +1,23 @@
+library(tidyverse)
+
+parse_sequence_information_from_seqkit_fai <- function(df, dataset_type = FALSE){
+  parsed_df <- df %>%
+    separate(sequence, into = c("sequence_id", "sequence_type", "chromosome", "gene", 
+                                "gene_biotype", "transcript_biotype", "gene_symbol",
+                                "description"), 
+             sep = " ", remove = FALSE, extra = "merge", fill = "right") %>%
+    separate(chromosome, into = c("chromosome_type", "genome", "chromosome_id", 
+                                  "start", "end", "strand"), 
+             sep = ":", remove = FALSE, extra = "merge", fill = "right") %>%
+    mutate(genome = ifelse(genome == "BDGP6", "BDGP6.46", genome)) %>% # fix some Drosophila names
+    mutate(genome = ifelse(sequence %in% c("K02E7.12.1", "F29C4.4.1", "C39B10.6b.1", "F53F4.16.1", "F53F4.17.1"), "WBcel235", genome)) %>% # fix some C. elegans genomes
+    mutate(length_description = ifelse(length <= 300, "<= 300nt", "> 300nt"))
+  
+  if(dataset_type == TRUE){
+    parsed_df <- parsed_df %>%
+      mutate(sequence_set = gsub(".fa.seqkit.fai", "", basename(sequence_set))) %>%
+      separate(sequence_set, into = c("coding_type", "set_name"), sep = "_", remove = FALSE)
+  }
+  
+  return(parsed_df)
+}
diff --git a/scripts/process_sequences_into_nonoverlapping_sets.R b/scripts/process_sequences_into_nonoverlapping_sets.R
new file mode 100644
index 0000000..849e3a1
--- /dev/null
+++ b/scripts/process_sequences_into_nonoverlapping_sets.R
@@ -0,0 +1,70 @@
+library(readr)
+library(dplyr)
+source("scripts/parse_sequence_information.R")
+
+# read in data sets ------------------------------------------------------
+
+fai_col_names <- c("sequence", "length", "offset", "linebases", "linewidth")
+clusters <- read_tsv(snakemake@input[['clusters']], col_names = c("rep", "cluster_member"))
+coding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[1], col_names = fai_col_names)
+noncoding_validation <- read_tsv(unlist(snakemake@input[['validation_fai']])[2], col_names = fai_col_names)
+all_fai <- read_tsv(snakemake@input[['all_fai']], col_names = fai_col_names) %>%
+  parse_sequence_information_from_seqkit_fai()
+
+metadata <- read_tsv(snakemake@input[['metadata']]) %>%
+  select(organism, genome = genome_abbreviation, set_name)
+
+# homology reduction ------------------------------------------------------
+
+# filter to homology reduced sequences -- keep sequences that had =<80% homology to other sequences
+all_fai_filtered <- all_fai %>%
+  mutate(sequence_id = gsub(" .*", "", sequence)) %>%
+  filter(sequence_id %in% clusters$rep) %>%
+  select(-sequence_id)
+
+# data set separation -----------------------------------------------------
+
+traintest <- all_fai_filtered %>%
+  filter(!sequence %in% c(coding_validation$sequence, noncoding_validation$sequence)) %>% # remove sequences that are in the validation sets
+  parse_sequence_information_from_seqkit_fai(dataset_type = FALSE) %>%
+  left_join(metadata, by = c("genome"))
+
+train <- traintest %>%
+  filter(set_name == "train")
+coding_train <- train %>% filter(sequence_type == "cdna")
+noncoding_train <- train %>% filter(sequence_type == "ncrna")
+
+test <- traintest %>%
+  filter(set_name == "test")
+coding_test <- test %>% filter(sequence_type == "cdna")
+noncoding_test <- test %>% filter(sequence_type == "ncrna")
+
+coding_validation_filtered <- all_fai_filtered %>%
+  filter(sequence %in% coding_validation$sequence) %>% # filter to coding sequences
+  filter(!sequence %in% traintest$sequence_id)         # remove sequences that overlap with the training/testing data
+
+noncoding_validation_filtered <- all_fai_filtered %>%
+  filter(sequence %in% noncoding_validation$sequence) %>% # filter to noncoding sequences
+  filter(!sequence %in% traintest$sequence_id)            # remove sequences that overlap with the training/testing data
+
+# augment the noncoding traintest set to balance sizes  ------------------
+
+# To avoid biases in classification, it's better to have the same number of sequences in the different classes.
+# This could also be done with changing the weights on the loss function of the model building in tensorflow,
+# but that would require altering the source code of the RNAsamba tool.
+# This approach should produce approximately equivalent results.
+
+num_rows_target <- nrow(coding_train)
+
+noncoding_train <- noncoding_train %>%
+  slice_sample(n = num_rows_target, replace = TRUE)
+
+# write out contigs names for each set ------------------------------------
+
+snakemake_output <- unlist(snakemake@output)
+write.table(coding_train$sequence, snakemake_output[1], quote = F, col.names = F, row.names = F)
+write.table(coding_test$sequence, snakemake_output[2], quote = F, col.names = F, row.names = F)
+write.table(coding_validation_filtered$sequence, snakemake_output[3], quote = F, col.names = F, row.names = F)
+write.table(noncoding_train$sequence, snakemake_output[4], quote = F, col.names = F, row.names = F)
+write.table(noncoding_test$sequence, snakemake_output[5], quote = F, col.names = F, row.names = F)
+write.table(noncoding_validation_filtered$sequence, snakemake_output[6], quote = F, col.names = F, row.names = F)