diff --git a/bin/create_regex.py b/bin/create_regex.py deleted file mode 100755 index 092f834..0000000 --- a/bin/create_regex.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import sys - -""" -EXAMPLE: - -in_string: (fixed_seq_1)(cell_barcode_1)(umi)(cell_barcode_2)(fixed_seq_2)(sequence) - -in_string_config: - -fixed_seqs: GATTACA, ACATTAG -barcode_lengths: 4, 3 -umi_lengths: 4 - -""" - - -def get_args(): - """Parse the commandine arguments""" - arg_parser = argparse.ArgumentParser() - - arg_parser.add_argument("-i", "--in_string", required=True) - arg_parser.add_argument( - "-c", - "--cb_lengths", - help="The comma delimited " - "list of cell barcodes length. The position in " - "the list corresponds to its order in in_string", - ) - arg_parser.add_argument( - "-u", - "--umi_lengths", - help="The comma delimited " - "list of umi lengths. The position in the list " - "corresponds to its order in in_string", - ) - arg_parser.add_argument( - "-f", - "--fixed_seqs", - help="The comma delimited " - "list of fixed sequences. 
The position in the " - "corresponds to its order in in_string", - ) - arg_parser.add_argument("-o", "--out_file", help="The out file for the regex and umi_tools pattern to be output.") - - args = arg_parser.parse_args() - - return args - - -def convert_regex(in_string, cb_lengths, umi_lengths, fixed_seqs, out_file): - """Converts the human_readable string to proper regex""" - - feature_annos = convert_seq_features(cb_lengths, umi_lengths, fixed_seqs) - - regex = convert_to_regex(in_string, feature_annos) - umi_tools_pattern = get_umi_tools_pattern(feature_annos["cell_barcode"], feature_annos["umi"]) - - with open(out_file, "w") as f: - f.write("REGEX" + "\t" + regex + "\n") - f.write("UMI_TOOLS" + "\t" + umi_tools_pattern + "\n") - f.write("BC_LENGTH" + "\t" + str(umi_tools_pattern.count("C")) + "\n") - f.write("UMI_LENGTH" + "\t" + str(umi_tools_pattern.count("N")) + "\n") - - return - - -def get_umi_tools_pattern(cell_barcode_info, umi_info): - umi_tools_pattern = "" - - for length in cell_barcode_info: - umi_tools_pattern += "C" * int(length) - - for length in umi_info: - umi_tools_pattern += "N" * int(length) - - return umi_tools_pattern - - -def convert_seq_features(cb_lengths, umi_lengths, fixed_seqs): - seq_features = {} - - seq_features["cell_barcode"] = cb_lengths.strip().split(",") if cb_lengths else "" - seq_features["umi"] = umi_lengths.strip().split(",") if umi_lengths else "" - seq_features["fixed_seq"] = fixed_seqs.strip().split(",") if fixed_seqs else "" - - return seq_features - - -def convert_to_regex(in_string, feature_annos): - """Converts the in_string to regex""" - regex = "" - - for feature in in_string.split(","): - if feature: - feature = feature.strip() - feature_name = "_".join(feature.split("_")[:-1]) - feature_idx = int(feature.split("_")[-1]) - - if feature_name in feature_annos: - regex += add_feature(feature_name, feature_idx, feature_annos[feature_name]) - else: - print(f"Unknown feature: {in_string}") - sys.exit() - - return regex 
- - -def add_feature(feature_name, feature_idx, feature_info): - regex = "(?P<{}>{}{{{}}})" - - string_char = "" - string_extra = "" - - if feature_name == "fixed_seq": - string_char = "(" + feature_info[feature_idx - 1].strip() + ")" - string_extra = "e<=3" - feature_name = "discard" - else: - string_char = "." - string_extra = feature_info[feature_idx - 1].strip() - - return regex.format("_".join([feature_name, str(feature_idx)]), string_char, string_extra) - - -def main(): - """Main Subroutine""" - - args = get_args() - - convert_regex(args.in_string, args.cb_lengths, args.umi_lengths, args.fixed_seqs, args.out_file) - - -if __name__ == "__main__": - main() diff --git a/bin/pre_extract_barcodes.py b/bin/pre_extract_barcodes.py index 2c51509..b37d7f8 100755 --- a/bin/pre_extract_barcodes.py +++ b/bin/pre_extract_barcodes.py @@ -22,7 +22,7 @@ def parse_args(): ) arg_parser.add_argument("-o", "--output_file", required=True, type=str, help="The output fastq") arg_parser.add_argument( - "-f", "--barcode-format", required=False, type=str, help="The barcode/umi format (Options: cellranger)" + "-f", "--barcode-format", required=True, type=str, help="The barcode/umi format (Options: cellranger_3_prime, cellranger_5_prime)" ) args = arg_parser.parse_args() @@ -88,7 +88,7 @@ def extract_barcode(input_file, barcode_file, output, bc_format): read_info = {} # Strip the primer, bc, umi, and poly-T - if bc_format == "cellranger": + if bc_format in ["cellranger_3_prime", "cellranger_5_prime"]: read_info = strip_read_cellranger(bc_index, seq, quals) if read_info: diff --git a/conf/test.config b/conf/test.config index 8b88e2e..4cbc1f3 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,10 +27,6 @@ params { gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/scnanoseq/reference/chr21.gtf" // Barcode options - cell_barcode_pattern = "" - identifier_pattern = "fixed_seq_1,cell_barcode_1,umi_1,fixed_seq_2" - cell_barcode_lengths = "16" - umi_lengths = "12" - fixed_seqs = 
"CTACACGACGCTCTTCCGATCT, TTTTTTTTTT" + barcode_format = "cellranger_3_prime" } diff --git a/modules/local/create_regex.nf b/modules/local/create_regex.nf deleted file mode 100644 index 12b4240..0000000 --- a/modules/local/create_regex.nf +++ /dev/null @@ -1,46 +0,0 @@ -process CREATE_REGEX { - label 'process_low' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3': - 'biocontainers/python:3.8.3' }" - - input: - val cell_barcode_pattern - val identifier_pattern - val cell_barcode_lengths - val umi_lengths - val fixed_seqs - - output: - path "regex_patterns.txt", emit: regex_pattern_file - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - - """ - OUT_FILE="regex_patterns.txt" - - if [[ "${cell_barcode_pattern}" ]]; then - echo -e "REGEX: ${cell_barcode_pattern}" > \${OUT_FILE} - echo -e "UMI_TOOLS: N/A" >> \${OUT_FILE} - - else - create_regex.py -i "${identifier_pattern}" \\ - -c "${cell_barcode_lengths}" \\ - -u "${umi_lengths}" \\ - -f "${fixed_seqs}" \\ - -o \${OUT_FILE} - fi - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g' ) - END_VERSIONS - """ -} diff --git a/modules/local/preextract_fastq.nf b/modules/local/preextract_fastq.nf index b26a2e6..f6735e4 100644 --- a/modules/local/preextract_fastq.nf +++ b/modules/local/preextract_fastq.nf @@ -10,6 +10,7 @@ process PREEXTRACT_FASTQ { input: tuple val(meta), path(reads), path(bc_list) + val bc_format output: tuple val(meta), path("*.R1.fastq.gz"), emit: r1_reads @@ -27,7 +28,7 @@ process PREEXTRACT_FASTQ { pre_extract_barcodes.py -i ${reads} \\ -b ${bc_list} \\ -o ${prefix} \\ - -f cellranger + -f ${bc_format} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/tag_barcodes.nf 
b/modules/local/tag_barcodes.nf index 344819c..eb8acae 100644 --- a/modules/local/tag_barcodes.nf +++ b/modules/local/tag_barcodes.nf @@ -8,7 +8,9 @@ process TAG_BARCODES { 'biocontainers/pysam:0.19.1--py310hff46b53_1' }" input: - tuple val(meta), path(bam), path(bai), path(r1_fastq), val(bc_length), val(umi_length) + tuple val(meta), path(bam), path(bai), path(r1_fastq) + val bc_length + val umi_length output: tuple val(meta), path("*.tagged.bam"), emit: tagged_bam diff --git a/nextflow.config b/nextflow.config index 3ebb1e5..e380710 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,13 +27,8 @@ params { skip_trimming = false // Cell barcode options - cell_barcode_pattern = null whitelist = null - identifier_pattern = null - cell_barcode_lengths = null - umi_lengths = null - fixed_seqs = null - barcode_preset = null + barcode_format = null // Library strandness option stranded = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 6b57836..73886be 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -98,7 +98,8 @@ "split_amount": { "type": "integer", "description": "The amount of lines to split the fastq into (Default: 0)", - "default": 0 + "default": 0, + "fa_icon": "fas fa-cut" } } }, @@ -137,35 +138,18 @@ "whitelist": { "type": "string", "description": "The file containing a list of barcodes.", - "format": "file-path" - }, - "cell_barcode_pattern": { - "type": "string", - "description": "Regex for the cell barcode pattern." - }, - "identifier_pattern": { - "type": "string", - "description": "Human readable regex for the cell barcode pattern." - }, - "cell_barcode_lengths": { - "type": "string", - "description": "The comma delimited list of cell barcode lengths." - }, - "umi_lengths": { - "type": "string", - "description": "The comma delimited list of umi lengths." - }, - "fixed_seqs": { - "type": "string", - "description": "The comma delimited list of fixed barcode sequences." 
+ "format": "file-path", + "fa_icon": "far fa-file-alt" }, - "barcode_preset": { + "barcode_format": { "type": "string", - "description": "Specify a preset option for barcode formatting.", - "enum": ["cellranger_3_prime", "cellranget_5_prime"] + "description": "Specify the format for the barcode+umi", + "enum": ["cellranger_3_prime"], + "fa_icon": "fas fa-barcode" } }, - "fa_icon": "fas fa-microscope" + "fa_icon": "fas fa-microscope", + "required": ["barcode_format"] }, "mapping": { "title": "Mapping", @@ -176,12 +160,14 @@ "stranded": { "type": "string", "enum": ["None", "reverse", "forward"], - "description": "Library strandness option." + "description": "Library strandness option.", + "fa_icon": "fas fa-dna" }, "kmer_size": { "type": "integer", "default": 14, - "description": "Minimizer k-mer length." + "description": "Minimizer k-mer length.", + "fa_icon": "fas fa-sort-amount-down" }, "save_secondary_alignment": { "type": "boolean", @@ -199,17 +185,20 @@ "properties": { "analyze_uncorrected_bam": { "type": "boolean", - "description": "Run downstream steps on the bam that contains reads that could not be corrected. Do not use this if no whitelist is provided." + "description": "Run downstream steps on the bam that contains reads that could not be corrected. Do not use this if no whitelist is provided.", + "fa_icon": "fas fa-search" }, "counts_level": { "type": "string", "description": "What level to generate the counts matrix at. Options: 'gene', 'transcript'.", - "enum": ["gene", "transcript", "both"] + "enum": ["gene", "transcript", "both"], + "fa_icon": "fas fa-file-csv" }, "retain_introns": { "type": "boolean", "default": true, - "description": "Indicate whether to include introns in the count matrices" + "description": "Indicate whether to include introns in the count matrices", + "fa_icon": "fas fa-filter" } }, "required": ["counts_level"] @@ -247,7 +236,7 @@ "description": "Skip NanoComp from BAM file(s)." 
}, "skip_rseqc": { - "type": "string", + "type": "boolean", "fa_icon": "fas fa-forward" }, "skip_seurat": { diff --git a/subworkflows/local/create_regex.nf b/subworkflows/local/create_regex.nf deleted file mode 100644 index 6d5c023..0000000 --- a/subworkflows/local/create_regex.nf +++ /dev/null @@ -1,54 +0,0 @@ -// -// Will parse the user provided barcode regex and/or pattern -// - -include { CREATE_REGEX } from '../../modules/local/create_regex' - -workflow CREATE_REGEX_INFO { - take: - barcode_regex - barcode_pattern - barcode_lengths - umi_lengths - fixed_seqs - - main: - CREATE_REGEX ( barcode_regex, - barcode_pattern, - barcode_lengths, - umi_lengths, - fixed_seqs ) - .regex_pattern_file - .map { parse_regex_info(it) } - .set { regex } - - emit: - regex -} - -// Function to get list of [ umi_tools_barcode, barcode_regex, barcode_length, umi_length ] -def parse_regex_info(regex_file) { - def regex_info = [:] - file(regex_file).withReader { - String line - while ( line = it.readLine() ) { - String[] split_line - split_line = line.split('\t') - - if (split_line[0] == 'REGEX') { - regex_info.regex = split_line[1] - - } else if (split_line[0] == 'UMI_TOOLS') { - regex_info.umi_tools = split_line[1] - - } else if (split_line[0] == 'BC_LENGTH') { - regex_info.bc_length = split_line[1] - - } else if (split_line[0] == 'UMI_LENGTH') { - regex_info.umi_length = split_line[1] - } - } - } - - return regex_info -} diff --git a/workflows/scnanoseq.nf b/workflows/scnanoseq.nf index 8c21c6b..9ffc398 100644 --- a/workflows/scnanoseq.nf +++ b/workflows/scnanoseq.nf @@ -34,34 +34,17 @@ if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input sample def cell_barcode_pattern = "" // This is for if the user wants to do more human readable regex -def identifier_pattern = "" def cell_barcode_lengths = "" def umi_lengths = "" -def fixed_seqs = "" - -if (params.barcode_preset) { - if (params.barcode_preset = "cellranger_3_prime") { - identifier_pattern = 
"fixed_seq_1,cell_barcode_1,umi_1,fixed_seq_2" - cell_barcode_lengths = "16" - umi_lengths = "12" - fixed_seqs = "CTACACGACGCTCTTCCGATCT, TTTTTTTTTT" - - } else if (params.barcode_preset = "cellranger_5_prime") { - identifier_pattern = "fixed_seq_1,cell_barcode_1,umi_1,fixed_seq_2" - cell_barcode_lengths = "16" - umi_lengths = "12" - fixed_seqs = "CTACACGACGCTCTTCCGATCT, TTTTTTTTTT" - } -} else { - identifier_pattern = params.identifier_pattern - cell_barcode_lengths = params.cell_barcode_lengths - umi_lengths = params.umi_lengths - fixed_seqs = params.fixed_seqs +def blaze_whitelist = "" +// TODO: Move this to a config file +if (params.barcode_format == "cellranger_3_prime") { + blaze_whitelist = file("$baseDir/assets/whitelist/3M-february-2018.zip") + bc_length = "16" + umi_length = "12" } -// TODO: Adding this in temporarily. Rethink how we want to represent this -def blaze_whitelist = file("$baseDir/assets/whitelist/3M-february-2018.zip") if (params.whitelist) { blaze_whitelist = whitelist } @@ -114,7 +97,6 @@ include { UCSC_GENEPREDTOBED } from "../mo // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // include { INPUT_CHECK } from "../subworkflows/local/input_check" -include { CREATE_REGEX_INFO } from "../subworkflows/local/create_regex" include { PREPARE_REFERENCE_FILES } from "../subworkflows/local/prepare_reference_files" /* @@ -341,22 +323,6 @@ workflow SCNANOSEQ { ch_zipped_reads = ch_cat_fastq } - // - // MODULE: Parse the regex info - // - - // We need to create the regex format - // TODO: Add this information to the samplesheet to allow sample specific barcode detection? - CREATE_REGEX_INFO( cell_barcode_pattern, - identifier_pattern, - cell_barcode_lengths, - umi_lengths, - fixed_seqs) - - val_regex_info = CREATE_REGEX_INFO.out.regex - // TODO: Why can't we use the below code? 
- //ch_versions = ch_versions.mix(CREATE_REGEX_INFO.out.versions) - // // MODULE: Generate whitelist // @@ -372,7 +338,7 @@ workflow SCNANOSEQ { // MODULE: Extract barcodes // - PREEXTRACT_FASTQ( ch_zipped_reads.join(ch_putative_bc)) + PREEXTRACT_FASTQ( ch_zipped_reads.join(ch_putative_bc), params.barcode_format) ch_zipped_r1_reads = PREEXTRACT_FASTQ.out.r1_reads ch_zipped_r2_reads = PREEXTRACT_FASTQ.out.r2_reads @@ -486,9 +452,9 @@ workflow SCNANOSEQ { TAG_BARCODES ( ch_minimap_filtered_sorted_bam .join( ch_minimap_filtered_sorted_bai, by: 0) - .join( ch_zipped_r1_reads, by: 0 ) - .combine( val_regex_info.bc_length ) - .combine( val_regex_info.umi_length ) + .join( ch_zipped_r1_reads, by: 0 ), + bc_length, + umi_length ) ch_tagged_bam = TAG_BARCODES.out.tagged_bam