diff --git a/tools/scripts/sctools/Dockerfile b/tools/scripts/sctools/Dockerfile deleted file mode 100644 index 669341dd..00000000 --- a/tools/scripts/sctools/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -FROM python:3.7.7 - -LABEL maintainer="Farzaneh Khajouei " \ - software="sctools v.1.0.0" \ - description="A collection of tools for single cell data. Splitting fastq files based on cellbarcodes and other tools to compute metrics on single cell data using barcodes and UMIs." - - -RUN apt-get update && apt-get upgrade -y && apt-get install -y patch libhdf5-dev vim apt-utils -RUN mkdir /sctools/ - -COPY . /sctools - -ARG htslib_version="1.13" - -RUN cd /sctools/fastqpreprocessing &&\ - wget https://github.com/khajoue2/libStatGen/archive/refs/tags/v1.0.15.broad.tar.gz &&\ - wget https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 &&\ - tar -zxvf v1.0.15.broad.tar.gz &&\ - tar -jxvf htslib-${htslib_version}.tar.bz2 &&\ - mv libStatGen-1.0.15.broad libStatGen - -RUN cd /sctools/fastqpreprocessing &&\ - wget http://www.cs.unc.edu/Research/compgeom/gzstream/gzstream.tgz &&\ - tar -xvf gzstream.tgz - -RUN cd /sctools/fastqpreprocessing &&\ - make -C libStatGen - -RUN cd /sctools/fastqpreprocessing && make -C htslib-${htslib_version}/ && make -C gzstream - -RUN cd /sctools/fastqpreprocessing && mkdir bin obj && make install - -RUN cp /sctools/fastqpreprocessing/bin/* /usr/local/bin/ - -WORKDIR usr/local/bin/sctools - diff --git a/tools/scripts/sctools/LICENSE b/tools/scripts/sctools/LICENSE deleted file mode 100644 index 45035a5b..00000000 --- a/tools/scripts/sctools/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2017 Human Cell Atlas Authors, https://humancellatlas.org -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name Broad Institute, Inc. nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE \ No newline at end of file diff --git a/tools/scripts/sctools/MANIFEST.in b/tools/scripts/sctools/MANIFEST.in deleted file mode 100644 index a1762055..00000000 --- a/tools/scripts/sctools/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include src/sctools/test/data/* -include README.rst -include LICENSE \ No newline at end of file diff --git a/tools/scripts/sctools/README.rst b/tools/scripts/sctools/README.rst deleted file mode 100644 index 0fda46ea..00000000 --- a/tools/scripts/sctools/README.rst +++ /dev/null @@ -1,157 +0,0 @@ -Single Cell 
Tools -################# - -.. image:: https://img.shields.io/circleci/project/github/HumanCellAtlas/sctools.svg?label=Unit%20Test%20on%20Circle%20CI%20&style=flat-square&logo=circleci - :target: https://circleci.com/gh/HumanCellAtlas/sctools/tree/master - :alt: Unit Test Status - -.. image:: https://img.shields.io/codecov/c/github/HumanCellAtlas/sctools/master.svg?label=Test%20Coverage&logo=codecov&style=flat-square - :target: https://codecov.io/gh/HumanCellAtlas/sctools - :alt: Test Coverage on Codecov - -.. image:: https://img.shields.io/readthedocs/sctools/latest.svg?label=ReadtheDocs%3A%20Latest&logo=Read%20the%20Docs&style=flat-square - :target: http://sctools.readthedocs.io/en/latest/?badge=latest - :alt: Documentation Status - -.. image:: https://img.shields.io/snyk/vulnerabilities/github/HumanCellAtlas/sctools/requirements.txt.svg?label=Snyk%20Vulnerabilities&logo=Snyk - :target: https://snyk.io/test/github/HumanCellAtlas/sctools/?targetFile=requirements.txt - :alt: Snyk Vulnerabilities for GitHub Repo (Specific Manifest) - -.. image:: https://img.shields.io/github/release/HumanCellAtlas/sctools.svg?label=Latest%20Release&style=flat-square&colorB=green - :target: https://github.com/HumanCellAtlas/sctools/releases - :alt: Latest Release - -.. image:: https://img.shields.io/github/license/HumanCellAtlas/sctools.svg?style=flat-square - :target: https://img.shields.io/github/license/HumanCellAtlas/sctools.svg?style=flat-square - :alt: License - -.. image:: https://img.shields.io/badge/python-3.6-green.svg?style=flat-square&logo=python&colorB=blue - :target: https://img.shields.io/badge/python-3.6-green.svg?style=flat-square&logo=python&colorB=blue - :alt: Language - -.. 
image:: https://img.shields.io/badge/Code%20Style-black-000000.svg?style=flat-square - :target: https://github.com/ambv/black - :alt: Code Style - -Single Cell Tools provides utilities for manipulating sequence data formats suitable for use in -distributed systems analyzing large biological datasets. - -Download and Installation -========================= - -.. code bash - git clone https://github.com/humancellatlas/sctools.git - cd sctools - pip3 install . - pytest # verify installation; run tests - -sctools Package -=============== - -The sctools package provides both command line utilities and classes designed for use in python -programs. - -Command Line Utilities -====================== - -1. Attach10XBarcodes: Attached barcodes stored in fastq files to reads in an unaligned bam file -2. SplitBam: Split a bam file into chunks, guaranteeing that cells are contained in 1 chunk -3. CalculateGeneMetrics: Calculate information about genes in an experiment or chunk -4. CalculateCellMetrics: Calculate information about cells in an experiment or chunk -5. MergeGeneMetrics: Merge gene metrics calculated from different chunks of an experiment -6. MergeCellMetrics Merge cell metrics calculated from different chunks of an experiment - -Main Package Classes -==================== - -1. **Platform**: an abstract class that defines a common data structure for different 3' sequencing - formats. All algorithms and methods in this package that are designed to work on 3' sequencing data - speak to this common data structure. Currently 10X_v2 is defined. - -2. **Reader**: a general iterator over arbitrarily zipped file(s) that is extended to work with common - sequence formats like fastq (fastq.Reader) and gtf (gtf.Reader). We recommend using the pysam - package for reading sam and bam files. - -3. **TwoBit & ThreeBit** DNA encoders that store DNA in 2- and 3-bit form. 2-bit is smaller but - randomizes "N" nucleotides. 
Both classes support fastq operations over common sequence tasks such - as the calculation of GC content. - -4. **ObservedBarcodeSet & PriorBarcodeSet**: classes for analysis and comparison of sets of barcodes - such as the cell barcodes used by 10X genomics. Supports operations like summarizing hamming - distances and comparing observed sequence diversity to expected (normally uniform) diversity. - -5. **gtf.Reader & gtf.Record** GTF iterator and GTF record class that exposes the gtf - fields as a lightweight, lazy-parsed python object. - -6. **fastq.Reader & fastq.Record** fastq reader and fastq record class that exposes the fastq fields - as a lightweight, lazy-parsed python object. - -7. **Metrics** calculate information about the genes and cells of an experiment - -8. **Bam** Split bam files into chunks and attach barcodes as tags - - -Viewing Test Results and Coverage -================================= - -To calculate and view test coverage cd to the ``sctools`` directory and -type the following two commands to generate the report and open it in your web browser: - -.. code:: bash - - pytest --cov-report html:cov_html --cov=sctools - open cov_html/index.html - -Definitions -=========== - -Several definitions are helpful to understand how sequence data is analyzed. - -1. **Cell**: an individual cell, the target of single-cell RNA-seq experiments and the entity that we -wish to characterize - -2. **Capture Primer**: A DNA oligonucleotide containing amplification machinery, a fixed cell barcode, -a random molecule barcode, and an oligo-dT tail to capture poly-adenylated RNA - -3. **Molecule**: A molecule refers to a single mRNA molecule that is captured by an oligo-dT capture -primer in a single-cell sequencing experiment - -4. **Molecule Barcode**: A molecule barcode (alias: UMI, RMT) is a short, random DNA barcode attached -to the capture primer that has adequate length to be probabilistically unique across the experiment. 
-Therefore, when multiple molecules of the same gene are captured in the same cell, they can be -differentiated through having different molecule barcodes. The proposed GA4GH standard tag for a -molecule barcode is UB and molecule barcode qualities is UY - -5. **Cell Barcode**: A short DNA barcode that is typically selected from a whitelist of barcodes that -will be used in an experiment. All capture primers for a given cell will contain the same cell -barcode. The proposed GA4GH standard tag for a cell barcode is CB and cell barcode qualities is CY - -6. **Fragment**: During library construction, mRNA molecules captured on capture primers are amplified, -and the resulting amplified oligonucleotides are fragmented. In 3' experiments, only the fragment -that contains the 3' end is retained, but the break point will be random, which means fragments -often have different lengths. Once sequenced, different fragments can be identified as unique -combinations of cell barcode, molecule barcode, the chromosome the sequence aligns to, and the -position it aligns to on that chromosome, after correcting for clipping that the aligner may add - -7. **Bam/Sam file**: The GA4GH standard file type for the storage of aligned sequencing reads. -Unless specified, our Single Cell Tools will operate over bam files containing either aligned or -unaligned reads - -Development -=========== - -Code Style ----------- -The sctools code base is complying with the PEP-8 and using `Black `_ to -format our code, in order to avoid "nitpicky" comments during the code review process so we spend more time discussing about the logic, -not code styles. - -In order to enable the auto-formatting in the development process, you have to spend a few seconds setting -up the ``pre-commit`` the first time you clone the repo: - -1. Install ``pre-commit`` by running: ``pip install pre-commit`` (or simply run ``pip install -r requirements.txt``). -2. Run `pre-commit install` to install the git hook. 
- -Once you successfully install the ``pre-commit`` hook to this repo, the Black linter/formatter will be automatically triggered and run on this repo. Please make sure you followed the above steps, otherwise your commits might fail at the linting test! - -If you really want to manually trigger the linters and formatters on your code, make sure ``Black`` and ``flake8`` are installed in your Python environment and run ``flake8 DIR1 DIR2`` and ``black DIR1 DIR2 --skip-string-normalization`` respectively. diff --git a/tools/scripts/sctools/build/lib/sctools/__init__.py b/tools/scripts/sctools/build/lib/sctools/__init__.py deleted file mode 100644 index 1fec1fb4..00000000 --- a/tools/scripts/sctools/build/lib/sctools/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# flake8: noqa -from . import bam -from . import encodings -from . import barcode -from . import fastq -from . import gtf -from . import stats -from . import reader -from . import metrics -from . import platform -from . import consts -from . import groups -from pkg_resources import get_distribution, DistributionNotFound - - -try: - __version__ = get_distribution(__name__).version -except DistributionNotFound: - pass diff --git a/tools/scripts/sctools/build/lib/sctools/bam.py b/tools/scripts/sctools/build/lib/sctools/bam.py deleted file mode 100644 index d8477386..00000000 --- a/tools/scripts/sctools/build/lib/sctools/bam.py +++ /dev/null @@ -1,728 +0,0 @@ -""" -Tools for Manipulating SAM/BAM format files -=========================================== - -.. 
currentmodule:: sctools - -This module provides functions and classes to subsample reads from bam files that correspond to -specific chromosomes, split bam files into chunks, assign tags to bam files from paired fastq -records, and iterate over sorted bam files by one or more tags - -This module makes heavy use of the pysam wrapper for HTSlib, a high-performance c-library designed -to manipulate sam files - -Methods -------- -iter_tag_groups function to iterate over reads by an arbitrary tag -iter_cell_barcodes wrapper for iter_tag_groups that iterates over cell barcode tags -iter_genes wrapper for iter_tag_groups that iterates over gene tags -iter_molecules wrapper for iter_tag_groups that iterates over molecule tags -sort_by_tags_and_queryname sort bam by given list of zero or more tags, followed by query name -verify_sort verifies whether bam is correctly sorted by given list of tags, then query name - -Classes -------- -SubsetAlignments class to extract reads specific to requested chromosome(s) -Tagger class to add tags to sam/bam records from paired fastq records -AlignmentSortOrder abstract class to represent alignment sort orders -QueryNameSortOrder alignment sort order by query name -TagSortableRecord class to facilitate sorting of pysam.AlignedSegments -SortError error raised when sorting is incorrect - -References ----------- -htslib : https://github.com/samtools/htslib - -""" - -import functools -from functools import partial, reduce -import math -import os -import warnings -from abc import abstractmethod -from typing import ( - Iterator, - Iterable, - Generator, - List, - Set, - Dict, - Union, - Tuple, - Callable, - Any, - Optional, -) - -import pysam -import shutil -import multiprocessing -import uuid - -from . 
import consts - -# File descriptor to write log messages to -STDERR = 2 - - -class SubsetAlignments: - """Wrapper for pysam/htslib that extracts reads corresponding to requested chromosome(s) - - Parameters - ---------- - alignment_file : str - sam or bam file - open_mode : {'r', 'rb', None}, optional - open mode for pysam.AlignmentFile. 'r' indicates a sam file, 'rb' indicates a bam file, - and None attempts to autodetect based on the file suffix (Default = None) - - Methods - ------- - indices_by_chromosome - returns indices to line numbers containing the requested number of reads for a specified - chromosome - - Notes - ----- - samtools is a good general-purpose tool for that is capable of most subsampling tasks. It is a - good idea to check the samtools documentation when approaching these types of tasks. - - References - ---------- - samtools documentation : http://www.htslib.org/doc/samtools.html - - """ - - def __init__(self, alignment_file: str, open_mode: str = None): - if open_mode is None: - if alignment_file.endswith(".bam"): - open_mode = "rb" - elif alignment_file.endswith(".sam"): - open_mode = "r" - else: - raise ValueError( - f"Could not autodetect file type for alignment_file {alignment_file} (detectable suffixes: " - f".sam, .bam)" - ) - self._file: str = alignment_file - self._open_mode: str = open_mode - - def indices_by_chromosome( - self, n_specific: int, chromosome: str, include_other: int = 0 - ) -> Union[List[int], Tuple[List[int], List[int]]]: - """Return the list of first `n_specific` indices of reads aligned to `chromosome`. - - Parameters - ---------- - n_specific : int - Number of aligned reads to return indices for - chromosome : str - Only reads from this chromosome are considered valid - include_other : int, optional - The number of reads to include that are NOT aligned to chromosome. These can be aligned - or unaligned reads (default = 0). 
- - Returns - ------- - chromosome_indices : List[int] - list of indices to reads aligning to `chromosome` - other_indices : List[int], optional - list of indices to reads NOT aligning to chromosome, only returned if include_other is - not 0. - - """ - - # acceptable chromosomes - valid_chromosomes = [str(i) for i in range(1, 23)] + ["M", "MT", "X", "Y"] - valid_chromosomes.extend(["chr" + v for v in valid_chromosomes]) - - # check chromosome - if isinstance(chromosome, int) and chromosome < 23: - chromosome = str(chromosome) # try to convert - if chromosome not in valid_chromosomes: - warnings.warn( - "chromsome %s not in list of expected chromosomes: %r" - % (chromosome, valid_chromosomes) - ) - - with pysam.AlignmentFile(self._file, self._open_mode) as fin: - chromosome = str(chromosome) - chromosome_indices = [] - other_indices = [] - - for i, record in enumerate(fin): - - if not record.is_unmapped: # record is mapped - if chromosome == record.reference_name: - if len(chromosome_indices) < n_specific: - chromosome_indices.append(i) - elif len(other_indices) < include_other: - other_indices.append(i) - elif len(other_indices) < include_other: # record is not mapped - other_indices.append(i) - - # check termination condition (we have the requisite number of reads - if ( - len(chromosome_indices) == n_specific - and len(other_indices) == include_other - ): - break - - if len(chromosome_indices) < n_specific or len(other_indices) < include_other: - warnings.warn( - "Only %d unaligned and %d reads aligned to chromosome %s were found in" - "%s" - % (len(other_indices), len(chromosome_indices), chromosome, self._file) - ) - - if include_other != 0: - return chromosome_indices, other_indices - else: - return chromosome_indices - - -class Tagger: - """Add tags to a bam file from tag generators. - - Parameters - ---------- - bam_file : str - Bam file that tags are to be added to. 
- - Methods - ------- - tag - tag bam records given tag_generators (often generated from paired bam or fastq files) - # todo this should probably be wrapped up in __init__ to make this more function-like - """ - - def __init__(self, bam_file: str) -> None: - if not isinstance(bam_file, str): - raise TypeError( - f'The argument "bam_file" must be of type str, not {type(bam_file)}' - ) - self.bam_file = bam_file - - # todo add type to tag_generators (make sure it doesn't introduce import issues - def tag(self, output_bam_name: str, tag_generators) -> None: - """Add tags to bam_file. - - Given a bam file and tag generators derived from files sharing the same sort order, - adds tags to the .bam file, and writes the resulting file to output_bam_name. - - Parameters - ---------- - output_bam_name : str - Name of output tagged bam. - tag_generators : List[fastq.TagGenerator] - list of generators that yield fastq.Tag objects - - """ - with pysam.AlignmentFile( - self.bam_file, "rb", check_sq=False - ) as inbam, pysam.AlignmentFile( - output_bam_name, "wb", template=inbam - ) as outbam: - - # zip up all the iterators - for *tag_sets, sam_record in zip(*tag_generators, inbam): - for tag_set in tag_sets: - for tag in tag_set: - sam_record.set_tag(*tag) - outbam.write(sam_record) - - -def get_barcodes_from_bam( - in_bam: str, tags: List[str], raise_missing: bool -) -> Set[str]: - """Get all the distinct barcodes from a bam - - :param in_bam: str - Input bam file. - :param tags: List[str] - Tags in the bam that might contain barcodes. - :param raise_missing: bool - Raise an error if no barcodes can be found. 
- :return: set - A set of barcodes found in the bam - This set will not contain a None value - """ - barcodes = set() - # Get all the Barcodes from the BAM - with pysam.AlignmentFile(in_bam, "rb", check_sq=False) as input_alignments: - for alignment in input_alignments: - barcode = get_barcode_for_alignment(alignment, tags, raise_missing) - # If no provided tag was found on the record that had a non-null value - if barcode is not None: - barcodes.add(barcode) - return barcodes - - -def get_barcode_for_alignment( - alignment: pysam.AlignedSegment, tags: List[str], raise_missing: bool -) -> str: - """ Get the barcode for an Alignment - - :param alignment: pysam.AlignedSegment - An Alignment from pysam. - :param tags: List[str] - Tags in the bam that might contain barcodes. If multiple Tags are passed, will - return the contents of the first tag that contains a barcode. - :param raise_missing: bool - Raise an error if no barcodes can be found. - :return: str - A barcode for the alignment, or None if one is not found and raise_missing is False. - """ - alignment_barcode = None - for tag in tags: - # The non-existent barcode should be the exceptional case, so try/except is faster than if/else - try: - alignment_barcode = alignment.get_tag(tag) - break # Got the key, don't bother getting the next tag - except KeyError: - continue # Try to get the next tag - - if raise_missing and alignment_barcode is None: - raise RuntimeError( - "Alignment encountered that is missing {} tag(s).".format(tags) - ) - - return alignment_barcode - - -def write_barcodes_to_bins( - in_bam: str, tags: List[str], barcodes_to_bins: Dict[str, int], raise_missing: bool -) -> List[str]: - """ Write barcodes to appropriate bins as defined by barcodes_to_bins - - :param in_bam: str - The bam file to read. - :param tags: List[str] - Tags in the bam that might contain barcodes. - :param barcodes_to_bins: Dict[str, int] - A Dict from barcode to bin. 
All barcodes of the same type need to be written to the same bin. - These numbered bins are merged after parallelization so that all alignments with the same - barcode are in the same bam. - :param raise_missing: bool - Raise an error if no barcodes can be found. - :return: A list of paths to the written bins. - """ - # Create all the output files - with pysam.AlignmentFile(in_bam, "rb", check_sq=False) as input_alignments: - - # We need a random int appended to the dirname to make sure input bams with the same name don't clash - dirname = ( - os.path.splitext(os.path.basename(in_bam))[0] + "_" + str(uuid.uuid4()) - ) - os.makedirs(dirname) - - files = [] - bins = list(set(barcodes_to_bins.values())) - filepaths = [] - # barcode_to_bins is a dict of barcodes to ints. The ints are contiguous and are used as indices - # in the files array. The files array is an array of open file handles to write to. - for i in range(len(bins)): - out_bam_name = os.path.join(f"{dirname}", f"{dirname}_{i}.bam") - filepaths.append(out_bam_name) - - open_bam = pysam.AlignmentFile(out_bam_name, "w", template=input_alignments) - files.append(open_bam) - - # Loop over input; check each tag in priority order and partition barcodes into files based - # on the highest priority tag that is identified - for alignment in input_alignments: - barcode = get_barcode_for_alignment(alignment, tags, raise_missing) - if barcode is not None: - # Find or set the file associated with the tag and write the record to the correct file - out_file = files[barcodes_to_bins[barcode]] - out_file.write(alignment) - - for file in files: - file.close() - - return filepaths - - -def merge_bams(bams: List[str]) -> str: - """ Merge input bams using samtools. - - This cannot be a local function within `split` because then Python "cannot pickle a local object". - :param bams: Name of the final bam + bams to merge. - Because of how its called using multiprocessing, the bam basename is the first element of the list. 
- :return: The output bam name. - """ - bam_name = os.path.realpath(bams[0] + ".bam") - bams_to_merge = bams[1:] - pysam.merge("-c", "-p", bam_name, *bams_to_merge) - return bam_name - - -def split( - in_bams: List[str], - out_prefix: str, - tags: List[str], - approx_mb_per_split: float = 1000, - raise_missing: bool = True, - num_processes: int = None, -) -> List[str]: - """split `in_bam` by tag into files of `approx_mb_per_split` - - Parameters - ---------- - in_bams : str - Input bam files. - out_prefix : str - Prefix for all output files; output will be named as prefix_n where n is an integer equal - to the chunk number. - tags : List[str] - The bam tags to split on. The tags are checked in order, and sorting is done based on the - first identified tag. Further tags are only checked if the first tag is missing. This is - useful in cases where sorting is executed over a corrected barcode, but some records only - have a raw barcode. - approx_mb_per_split : float - The target file size for each chunk in mb - raise_missing : bool, optional - if True, raise a RuntimeError if a record is encountered without a tag. Else silently - discard the record (default = True) - num_processes : int, optional - The number of processes to parallelize over. If not set, will use all available processes. 
- - Returns - ------- - output_filenames : List[str] - list of filenames of bam chunks - - Raises - ------ - ValueError - when `tags` is empty - RuntimeError - when `raise_missing` is true and any passed read contains no `tags` - - """ - - if len(tags) == 0: - raise ValueError("At least one tag must be passed") - - if num_processes is None: - num_processes = multiprocessing.cpu_count() - - # find correct number of subfiles to spawn - bam_mb = sum(os.path.getsize(b) * 1e-6 for b in in_bams) - n_subfiles = int(math.ceil(bam_mb / approx_mb_per_split)) - if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN: - warnings.warn( - f"Number of requested subfiles ({n_subfiles}) exceeds " - f"{consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN}; this may cause OS errors by exceeding fid limits" - ) - if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE: - raise ValueError( - f"Number of requested subfiles ({n_subfiles}) exceeds " - f"{consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE}; this will usually cause OS errors, " - f"think about increasing max_mb_per_split." 
- ) - - full_pool = multiprocessing.Pool(num_processes) - - # Get all the barcodes over all the bams - os.write(STDERR, b"Retrieving barcodes from bams\n") - result = full_pool.map( - partial(get_barcodes_from_bam, tags=tags, raise_missing=raise_missing), in_bams - ) - - barcodes_list = list(reduce(lambda set1, set2: set1.union(set2), result)) - os.write(STDERR, b"Retrieved barcodes from bams\n") - - # Create the barcodes to bin mapping - os.write(STDERR, b"Allocating bins\n") - barcodes_to_bins_dict = {} - - # barcodes_list will always contain non-None elements from get_barcodes_from_bam - if len(barcodes_list) <= n_subfiles: - for barcode_index in range(len(barcodes_list)): - barcodes_to_bins_dict[barcodes_list[barcode_index]] = barcode_index - else: - for barcode_index in range(len(barcodes_list)): - file_index = barcode_index % n_subfiles - barcodes_to_bins_dict[barcodes_list[barcode_index]] = file_index - - # Split the bams by barcode in parallel - os.write(STDERR, b"Splitting the bams by barcode\n") - # Samtools needs a thread for compression, so we leave half the given processes open. - write_pool_processes = math.ceil(num_processes / 2) if num_processes > 2 else 1 - write_pool = multiprocessing.Pool(write_pool_processes) - scattered_split_result = write_pool.map( - partial( - write_barcodes_to_bins, - tags=list(tags), - raise_missing=raise_missing, - barcodes_to_bins=barcodes_to_bins_dict, - ), - in_bams, - ) - - bin_indices = list(set(barcodes_to_bins_dict.values())) - # Create a list of lists, where the first element of every sub-list is the name of the final output bam - bins = list([f"{out_prefix}_{index}"] for index in bin_indices) - - # A shard is the computation of writing barcodes to bins - # Gather all the files for each bin into the same sub-list. 
- for shard_index in range(len(scattered_split_result)): - shard = scattered_split_result[shard_index] - for file_index in range(len(shard)): - bins[file_index].append(shard[file_index]) - - write_pool.close() - - # Recombine the binned bams - os.write(STDERR, b"Merging temporary bam files\n") - merged_bams = full_pool.map(partial(merge_bams), bins) - - os.write(STDERR, b"deleting temporary files\n") - for paths in scattered_split_result: - shutil.rmtree(os.path.dirname(paths[0])) - - full_pool.close() - - return merged_bams - - -# todo change this to throw away "None" reads instead of appending them if we are filtering them -def iter_tag_groups( - tag: str, bam_iterator: Iterator[pysam.AlignedSegment], filter_null: bool = False -) -> Generator: - """Iterates over reads and yields them grouped by the provided tag value - - Parameters - ---------- - tag : str - BAM tag to group over - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - filter_null : bool, optional - If False, all reads that lack the requested tag are yielded together. Else, all reads - that lack the tag will be discarded (default = False). 
- - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique value of tag - current_tag : str - the tag that reads in the group all share - - """ - - # get first read and tag set - reads = [next(bam_iterator)] - try: - current_tag = reads[0].get_tag(tag) - except KeyError: - current_tag = None # null tag is a category that gets emitted - - # now iterate over alignment sets - for alignment in bam_iterator: - try: - next_tag = alignment.get_tag(tag) - except KeyError: - next_tag = None # null tag is a category that we will emit - if next_tag == current_tag: - reads.append(alignment) - else: - # only yield if the tag is non-null or filter_null is false - if not filter_null or current_tag is not None: - yield iter(reads), current_tag - # reset to next group - reads = [alignment] - current_tag = next_tag - - if not filter_null or current_tag is not None: - yield iter(reads), current_tag - - -def iter_molecule_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: - """Iterate over all the molecules of a bam file sorted by molecule. - - Parameters - ---------- - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique molecule barcode tag - current_tag : str - the molecule barcode that records in the group all share - - """ - return iter_tag_groups( - tag=consts.MOLECULE_BARCODE_TAG_KEY, bam_iterator=bam_iterator - ) - - -def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: - """Iterate over all the cells of a bam file sorted by cell. 
- - Parameters - ---------- - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique cell barcode tag - current_tag : str - the cell barcode that reads in the group all share - - """ - return iter_tag_groups(tag=consts.CELL_BARCODE_TAG_KEY, bam_iterator=bam_iterator) - - -def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: - """Iterate over all the cells of a bam file sorted by gene. - - Parameters - ---------- - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique gene name tag - current_tag : str - the gene id that reads in the group all share - - """ - return iter_tag_groups(tag=consts.GENE_NAME_TAG_KEY, bam_iterator=bam_iterator) - - -def get_tag_or_default( - alignment: pysam.AlignedSegment, tag_key: str, default: Optional[str] = None -) -> Optional[str]: - """Extracts the value associated to `tag_key` from `alignment`, and returns a default value - if the tag is not present.""" - try: - return alignment.get_tag(tag_key) - except KeyError: - return default - - -class AlignmentSortOrder: - """The base class of alignment sort orders.""" - - @property - @abstractmethod - def key_generator(self) -> Callable[[pysam.AlignedSegment], Any]: - """Returns a callable function that calculates a sort key from given pysam.AlignedSegment.""" - raise NotImplementedError - - -class QueryNameSortOrder(AlignmentSortOrder): - """Alignment record sort order by query name.""" - - @staticmethod - def get_sort_key(alignment: pysam.AlignedSegment) -> str: - return alignment.query_name - - @property - def key_generator(self): - return QueryNameSortOrder.get_sort_key - - def __repr__(self) -> str: - return "query_name" - - -@functools.total_ordering -class TagSortableRecord(object): - """Wrapper for 
pysam.AlignedSegment that facilitates sorting by tags and query name.""" - - def __init__( - self, - tag_keys: Iterable[str], - tag_values: Iterable[str], - query_name: str, - record: pysam.AlignedSegment = None, - ) -> None: - self.tag_keys = tag_keys - self.tag_values = tag_values - self.query_name = query_name - self.record = record - - @classmethod - def from_aligned_segment( - cls, record: pysam.AlignedSegment, tag_keys: Iterable[str] - ) -> "TagSortableRecord": - """Create a TagSortableRecord from a pysam.AlignedSegment and list of tag keys""" - assert record is not None - tag_values = [get_tag_or_default(record, key, "") for key in tag_keys] - query_name = record.query_name - return cls(tag_keys, tag_values, query_name, record) - - def __lt__(self, other: object) -> bool: - if not isinstance(other, TagSortableRecord): - return NotImplemented - self.__verify_tag_keys_match(other) - for (self_tag_value, other_tag_value) in zip(self.tag_values, other.tag_values): - if self_tag_value < other_tag_value: - return True - elif self_tag_value > other_tag_value: - return False - return self.query_name < other.query_name - - def __eq__(self, other: object) -> bool: - # TODO: Add more error checking - if not isinstance(other, TagSortableRecord): - return NotImplemented - self.__verify_tag_keys_match(other) - for (self_tag_value, other_tag_value) in zip(self.tag_values, other.tag_values): - if self_tag_value != other_tag_value: - return False - return self.query_name == other.query_name - - def __verify_tag_keys_match(self, other) -> None: - if self.tag_keys != other.tag_keys: - format_str = "Cannot compare records using different tag lists: {0}, {1}" - raise ValueError(format_str.format(self.tag_keys, other.tag_keys)) - - def __str__(self) -> str: - return self.__repr__() - - def __repr__(self) -> str: - format_str = "TagSortableRecord(tags: {0}, tag_values: {1}, query_name: {2}" - return format_str.format(self.tag_keys, self.tag_values, self.query_name) - - -def 
sort_by_tags_and_queryname( - records: Iterable[pysam.AlignedSegment], tag_keys: Iterable[str] -) -> Iterable[pysam.AlignedSegment]: - """Sorts the given bam records by the given tags, followed by query name. - If no tags are given, just sorts by query name. - """ - tag_sortable_records = ( - TagSortableRecord.from_aligned_segment(r, tag_keys) for r in records - ) - sorted_records = sorted(tag_sortable_records) - aligned_segments = (r.record for r in sorted_records) - return aligned_segments - - -def verify_sort(records: Iterable[TagSortableRecord], tag_keys: Iterable[str]) -> None: - """Raise AssertionError if the given records are not correctly sorted by the given tags and query name""" - # Setting tag values and query name to empty string ensures first record will never be less than old_record - old_record = TagSortableRecord( - tag_keys=tag_keys, tag_values=["" for _ in tag_keys], query_name="", record=None - ) - i = 0 - for record in records: - i += 1 - if not record >= old_record: - msg = "Records {0} and {1} are not in correct order:\n{1}:{2} \nis less than \n{0}:{3}" - raise SortError(msg.format(i - 1, i, record, old_record)) - old_record = record - - -class SortError(Exception): - pass diff --git a/tools/scripts/sctools/build/lib/sctools/barcode.py b/tools/scripts/sctools/build/lib/sctools/barcode.py deleted file mode 100644 index f26aac24..00000000 --- a/tools/scripts/sctools/build/lib/sctools/barcode.py +++ /dev/null @@ -1,379 +0,0 @@ -""" -Nucleotide Barcode Manipulation Tools -===================================== - -.. currentmodule:: sctools - -This module contains tools to characterize oligonucleotide barcodes and a simple hamming-base -error-correction approach which corrects barcodes within a specified distance of a "whitelist" of -expected barcodes. 
- -Classes -------- -Barcodes Class to characterize a set of barcodes -ErrorsToCorrectBarcodesMap Class to carry out error correction routines - -""" - -import itertools -from collections import Counter -from typing import Mapping, Iterator, List, Tuple, Iterable - -import numpy as np -import pysam - -from . import consts -from .encodings import TwoBit -from .stats import base4_entropy - - -class Barcodes: - """Container for a set of nucleotide barcodes. - - Contained barcodes are encoded in 2bit representation for fast operations. Instances of this - class can optionally be constructed from an iterable where barcodes can be present multiple - times. In these cases, barcodes are analyzed based on their observed frequencies. - - Parameters - ---------- - barcodes: Mapping[str, int] - dictionary-like mapping barcodes to the number of times they were observed - barcode_length: int - the length of all barcodes in the set. Different-length barcodes are not supported. - - See Also - -------- - sctools.encodings.TwoBit - - """ - - def __init__(self, barcodes: Mapping[str, int], barcode_length: int): - if not isinstance(barcodes, Mapping): - raise TypeError( - 'The argument "barcodes" must be a dict-like object mapping barcodes to counts' - ) - self._mapping: Mapping[str, int] = barcodes - - if not isinstance(barcode_length, int) and barcode_length > 0: - raise ValueError('The argument "barcode_length" must be a positive integer') - self._barcode_length: int = barcode_length - - def __contains__(self, item) -> bool: - return item in self._mapping - - def __iter__(self) -> Iterator[str]: - return iter(self._mapping) - - def __len__(self) -> int: - return len(self._mapping) - - def __getitem__(self, item) -> int: - return self._mapping[item] - - def summarize_hamming_distances(self) -> Mapping[str, float]: - """Returns descriptive statistics on hamming distances between pairs of barcodes. 
- - Returns - ------- - descriptive_statistics : Mapping[str, float] - minimum, 25th percentile, median, 75th percentile, maximum, and average hamming - distance between all pairs of barcodes - - References - ---------- - https://en.wikipedia.org/wiki/Hamming_distance - - """ - distances: List = [] - - for a, b in itertools.combinations(self, 2): - distances.append(TwoBit.hamming_distance(a, b)) - - keys: Tuple = ( - "minimum", - "25th percentile", - "median", - "75th percentile", - "maximum", - "average", - ) - values: List = list(np.percentile(distances, [0, 25, 50, 75, 100])) - values.append(np.mean(distances)) - - return dict(zip(keys, values)) - - def base_frequency(self, weighted=False) -> np.ndarray: - """return the frequency of each base at each position in the barcode set - - Notes - ----- - weighting is currently not supported, and must be set to False or base_frequency will raise - NotImplementedError # todo fix - - Parameters - ---------- - weighted: bool, optional - if True, each barcode is counted once for each time it was observed (default = False) - - Returns - ------- - frequencies : np.array - barcode_length x 4 2d numpy array - - Raises - ------ - NotImplementedError - if weighted is True - - """ - base_counts_by_position: np.ndarray = np.zeros( - (self._barcode_length, 4), dtype=np.uint64 - ) - - keys: np.ndarray = np.fromiter(self._mapping.keys(), dtype=np.uint64) - - for i in reversed(range(self._barcode_length)): - binary_base_representations, counts = np.unique( - keys & 3, return_counts=True - ) - if weighted: - raise NotImplementedError - else: - base_counts_by_position[i, binary_base_representations] = counts - - # finished with this nulceotide, move two bits forward to the next one - keys >>= 2 - - return base_counts_by_position - - def effective_diversity(self, weighted=False) -> np.ndarray: - """Returns the effective base diversity of the barcode set by position. 
- - maximum diversity for each position is 1, and represents a perfect split of 25% per base at - a given position. - - Parameters - ---------- - weighted : bool, optional - if True, each barcode is counted once for each time it was observed (default = False) - - Returns - ------- - effective_diversity : np.array[float] - 1-d array of size barcode_length containing floats in [0, 1] - - """ - return base4_entropy(self.base_frequency(weighted=weighted)) - - @classmethod - def from_whitelist(cls, file_: str, barcode_length: int): - """Creates a barcode set from a whitelist file. - - Parameters - ---------- - file_ : str - location of the whitelist file. Should be formatted one barcode per line. Barcodes - should be encoded in plain text (UTF-8, ASCII), not bit-encoded. Each barcode will be - assigned a count of 1. - barcode_length : int - Length of the barcodes in the file. - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - - """ - tbe = TwoBit(barcode_length) - with open(file_, "rb") as f: - return cls( - Counter(tbe.encode(barcode[:-1]) for barcode in f), barcode_length - ) - - @classmethod - def from_iterable_encoded(cls, iterable: Iterable[int], barcode_length: int): - """Construct an ObservedBarcodeSet from an iterable of encoded barcodes. - - Parameters - ---------- - iterable : Iterable[int] - iterable of barcodes encoded in TwoBit representation - barcode_length : int - the length of the barcodes in `iterable` - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - """ - return cls(Counter(iterable), barcode_length=barcode_length) - - @classmethod - def from_iterable_strings(cls, iterable: Iterable[str], barcode_length: int): - """Construct an ObservedBarcodeSet from an iterable of string barcodes. 
- - Parameters - ---------- - iterable : Iterable[str] - iterable of barcodes encoded in TwoBit representation - barcode_length : int - the length of the barcodes in `iterable` - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - """ - tbe: TwoBit = TwoBit(barcode_length) - return cls( - Counter(tbe.encode(b.encode()) for b in iterable), - barcode_length=barcode_length, - ) - - @classmethod - def from_iterable_bytes(cls, iterable: Iterable[bytes], barcode_length: int): - """Construct an ObservedBarcodeSet from an iterable of bytes barcodes. - - Parameters - ---------- - iterable : Iterable[bytes] - iterable of barcodes in bytes representation - barcode_length : int - the length of the barcodes in `iterable` - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - """ - tbe: TwoBit = TwoBit(barcode_length) - return cls( - Counter(tbe.encode(b) for b in iterable), barcode_length=barcode_length - ) - - -class ErrorsToCorrectBarcodesMap: - """Correct any barcode that is within one hamming distance of a whitelisted barcode - - Parameters - ---------- - errors_to_barcodes : Mapping[str, str] - dict-like mapping 1-base errors to the whitelist barcode that they could be generated from - - Methods - ------- - get_corrected_barcode(barcode: str) - Return a barcode if it is whitelist, or the corrected version if within edit distance 1 - correct_bam(bam_file: str, output_bam_file: str) - correct barcodes in a bam file, given a whitelist - - References - ---------- - https://en.wikipedia.org/wiki/Hamming_distance - - """ - - def __init__(self, errors_to_barcodes: Mapping[str, str]): - if not isinstance(errors_to_barcodes, Mapping): - raise TypeError( - f'The argument "errors_to_barcodes" must be a mapping of erroneous barcodes to correct ' - f"barcodes, not {type(errors_to_barcodes)}" - ) - self._map = errors_to_barcodes - - def get_corrected_barcode(self, barcode: str) -> str: - 
"""Return a barcode if it is whitelist, or the corrected version if within edit distance 1 - - Parameters - ---------- - barcode : str - the barcode to return the corrected version of. If the barcode is in the whitelist, - the input barcode is returned unchanged. - - Returns - ------- - corrected_barcode : str - corrected version of the barcode - - Raises - ------ - KeyError - if the passed barcode is not within 1 hamming distance of any whitelist barcode - - References - ---------- - https://en.wikipedia.org/wiki/Hamming_distance - - """ - return self._map[barcode] - - @staticmethod - def _prepare_single_base_error_hash_table( - barcodes: Iterable[str], - ) -> Mapping[str, str]: - """Generate a map of correct barcodes and single base error codes to whitelist barcodes - - Parameters - ---------- - barcodes : Iterable[str] - :param Iterable barcodes: iterable of string barcodes - :return dict: mapping between erroneous barcodes with single-base mutations and the barcode - they were generated from - """ - error_map = {} - for barcode in barcodes: - - # include correct barcode - error_map[barcode] = barcode - - # include all single-base errors - for i, nucleotide in enumerate(barcode): - errors = set("ACGTN") - errors.discard(nucleotide) - for e in errors: - error_map[barcode[:i] + e + barcode[i + 1 :]] = barcode - return error_map - - @classmethod - def single_hamming_errors_from_whitelist(cls, whitelist_file: str): - """Factory method to generate instance of class from a file containing "correct" barcodes. - - Parameters - ---------- - whitelist_file : str - Text file containing barcode per line. 
- - Returns - ------- - errors_to_barcodes_map : ErrorsToCorrectBarcodesMap - instance of cls, built from whitelist - - """ - with open(whitelist_file, "r") as f: - return cls( - cls._prepare_single_base_error_hash_table((line[:-1] for line in f)) - ) - - def correct_bam(self, bam_file: str, output_bam_file: str) -> None: - """Correct barcodes in a (potentially unaligned) bamfile, given a whitelist. - - Parameters - ---------- - bam_file : str - BAM format file in same order as the fastq files - output_bam_file : str - BAM format file containing cell, umi, and sample tags. - - """ - with pysam.AlignmentFile(bam_file, "rb") as fin, pysam.AlignmentFile( - output_bam_file, "wb", template=fin - ) as fout: - for alignment in fin: - try: - tag = self.get_corrected_barcode(alignment.get_tag("CR")) - except KeyError: # pass through the uncorrected barcode. - tag = alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY) - alignment.set_tag( - tag=consts.CELL_BARCODE_TAG_KEY, value=tag, value_type="Z" - ) - fout.write(alignment) diff --git a/tools/scripts/sctools/build/lib/sctools/consts.py b/tools/scripts/sctools/build/lib/sctools/consts.py deleted file mode 100644 index e07980cb..00000000 --- a/tools/scripts/sctools/build/lib/sctools/consts.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Global constants -================ - -.. currentmodule:: sctools - -This module contains global constants, such as various barcoded BAM tags, and sctools-specific -constants. 
-""" - -# BAM tag constants - -RAW_SAMPLE_BARCODE_TAG_KEY = "SR" -QUALITY_SAMPLE_BARCODE_TAG_KEY = "SY" - -MOLECULE_BARCODE_TAG_KEY = "UB" -RAW_MOLECULE_BARCODE_TAG_KEY = "UR" -QUALITY_MOLECULE_BARCODE_TAG_KEY = "UY" - -CELL_BARCODE_TAG_KEY = "CB" -RAW_CELL_BARCODE_TAG_KEY = "CR" -QUALITY_CELL_BARCODE_TAG_KEY = "CY" - -GENE_NAME_TAG_KEY = "GE" -NUMBER_OF_HITS_TAG_KEY = "NH" - -ALIGNMENT_LOCATION_TAG_KEY = "XF" -INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTRONIC" -CODING_ALIGNMENT_LOCATION_TAG_VALUE = "CODING" -UTR_ALIGNMENT_LOCATION_TAG_VALUE = "UTR" -INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTERGENIC" - -# bam.py constants - -MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500 -MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000 - - -# modes of the count matrix runs -SINGLE_CELL_COUNT_MATRIX = 0 -SINGLE_NUCLEI_COUNT_MATRIX = 1 diff --git a/tools/scripts/sctools/build/lib/sctools/count.py b/tools/scripts/sctools/build/lib/sctools/count.py deleted file mode 100644 index b8d2e740..00000000 --- a/tools/scripts/sctools/build/lib/sctools/count.py +++ /dev/null @@ -1,400 +0,0 @@ -""" -Construct Count Matrices -======================== - -This module defines methods that enable (optionally) distributed construction of count matrices. -This module outputs coordinate sparse matrices that are converted to CSR matrices prior to delivery -for compact storage, and helper functions to convert this format into other commonly used formats. - -Methods -------- -from_sorted_tagged_bam( - bam_file: str, annotation_file: str, cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag: str=consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag: str=consts.GENE_NAME_TAG_KEY, open_mode: str='rb') -from_mtx(matrix_mtx: str, row_index_file: str, col_index_file: str) - - -Notes ------ -Memory usage of this module can be roughly approximated by the chunk_size parameter in Optimus. -The memory usage is equal to approximately 6*8 bytes per molecules in the file. 
-""" - -import itertools -import operator -from typing import List, Dict, Tuple, Set, Optional, Generator - -import numpy as np -import pysam -import scipy.sparse as sp -from scipy.io import mmread - -from sctools import consts, bam - - -class CountMatrix: - def __init__( - self, matrix: sp.csr_matrix, row_index: np.ndarray, col_index: np.ndarray - ): - self._matrix = matrix - self._row_index = row_index - self._col_index = col_index - - @property - def matrix(self): - return self._matrix - - @property - def row_index(self): - return self._row_index - - @property - def col_index(self): - return self._col_index - - @staticmethod - def _get_alignments_grouped_by_query_name_generator( - bam_file: str, - cell_barcode_tag: str, - molecule_barcode_tag: str, - open_mode: str = "rb", - ) -> Generator[ - Tuple[str, Optional[str], Optional[str], List[pysam.AlignedSegment]], None, None - ]: - """Iterates through a query_name-sorted BAM file, groups all alignments with the same query name - - Parameters - ---------- - bam_file : str - input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that - order - cell_barcode_tag : str - Tag that specifies the cell barcode for each read. - molecule_barcode_tag : str - Tag that specifies the molecule barcode for each read. 
- - Returns - ------- - a generator for tuples (query_name, cell_barcode, molecule_barcode, alignments) - """ - with pysam.AlignmentFile(bam_file, mode=open_mode) as bam_records: - for (query_name, grouper) in itertools.groupby( - bam_records, key=lambda record: record.query_name - ): - alignments: List[pysam.AlignedSegment] = list(grouper) - cell_barcode: Optional[str] = bam.get_tag_or_default( - alignments[0], cell_barcode_tag - ) - molecule_barcode: Optional[str] = bam.get_tag_or_default( - alignments[0], molecule_barcode_tag - ) - yield query_name, cell_barcode, molecule_barcode, alignments - - """Looks through a list of gene locations to find the one that the given read_start ovelaps - - Parameters - ---------- - _gene_locations: Array - array with gene start end locations and names - search_start: - index of gene to start searching from - search_end: - index of gene up to which to search to - read_start: - position at which the read starts at - - Returns - ------- - name of gene with overlap or None if no overlap is found - - """ - - @classmethod - def binary_overlap(cls, _gene_locations, search_start, search_end, read_start): - while search_start <= search_end: - current_gene_index = int((search_start + search_end) / 2) - if ( - _gene_locations[current_gene_index][0][0] - < read_start - < _gene_locations[current_gene_index][0][1] - ): - return _gene_locations[current_gene_index][1] - elif _gene_locations[current_gene_index][0][0] < read_start: - search_start = current_gene_index + 1 - else: - search_end = current_gene_index - 1 - return None - - # todo add support for generating a matrix of invalid barcodes - # todo add support for splitting spliced and unspliced reads - # todo add support for generating a map of cell barcodes - # todo add the option for stringent checks on the input (e.g. 
BAM sort order) - # todo once the stringent checks are in place, safely move on to the hashset-free implementation - @classmethod - def from_sorted_tagged_bam( - cls, - bam_file: str, - gene_name_to_index: Dict[str, int], - chromosomes_gene_locations_extended: Dict[str, List[tuple]] = None, - cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag: str = consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag: str = consts.GENE_NAME_TAG_KEY, - open_mode: str = "rb", - ) -> "CountMatrix": - """Generate a count matrix from a sorted, tagged bam file - - Notes - ----- - - Input bam file must be sorted by query name. - - - The sort order of the input BAM file is not strictly checked. If the input BAM file not sorted - by query_name, the output counts will be wrong without any warnings being issued. - - This method returns counts that correspond to both spliced and unspliced reads. - - Description of the algorithm - ---------------------------- - The implemented counting strategy is intended to closely match that of CellRanger 2.1.1 - (see the references). The following pseudo-code describes the counting algorithm: - - for each query_name (i.e. 
unique sequenced read): - - if only a single alignment exists, _consider_ the read - - if multiple alignments exist, - - if a unique gene name is associated to all alignments that have a gene name tag, - _consider_ the read; otherwise, the read is useless and neglect it - - if the read is to be _considered_, - - if the triple (cell barcode, molecule barcode, gene name) is not encountered before, - count it as evidence for a unique transcript; otherwise, consider the read as duplicate - and neglect it - - Parameters - ---------- - bam_file : str - input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that - order - chromosomes_gene_locations_extended : dict - Location of genes by chromosome - (default = None) - cell_barcode_tag : str, optional - Tag that specifies the cell barcode for each read. Reads without this tag will be ignored - (default = consts.CELL_BARCODE_TAG_KEY) - molecule_barcode_tag : str, optional - Tag that specifies the molecule barcode for each read. Reads without this tag will be - ignored (default = consts.MOLECULE_BARCODE_TAG_KEY) - gene_name_tag - Tag that specifies the gene name for each read. Reads without this tag will be ignored - (default = consts.GENE_NAME_TAG_KEY) - gene_name_to_index : dict - A map from gene names to their counts matrix column index - open_mode : {'r', 'rb'}, optional - indicates that the passed file is a bam file ('rb') or sam file ('r') (default = 'rb'). - - Returns - ------- - count_matrix : CountMatrix - cells x genes sparse count matrix in compressed sparse row format (cells are compressed) - - Notes - ----- - All matrices produced by this function called on different BAM chunks that share the same annotation - file can be concatenated using the scipy sparse vstack function, since by definition, the cell barcodes - contained in different BAM chunks are mutually exclusive. 
for example: - - >>> import scipy.sparse as sp - >>> A = sp.coo_matrix([[1, 2], [3, 4]]).tocsr() - >>> B = sp.coo_matrix([[5, 6]]).tocsr() - >>> sp.vstack([A, B]).toarray() - array([[1, 2], - [3, 4], - [5, 6]]) - - See Also - -------- - samtools sort (-t parameter): - C library that can sort files as required. - http://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS - - TagSortBam.CellSortBam: - WDL task that accomplishes the sorting necessary for this module. - https://github.com/HumanCellAtlas/skylab/blob/master/library/tasks/TagSortBam.wdl - - Relevant parmalinks to the counting algorithm in CellRanger: - [1] https://github.com/10XGenomics/cellranger/blob/aba5d379169ff0d4bee60e3d100df35752b90383/mro/stages/counter/ - attach_bcs_and_umis/__init__.py - [2] https://github.com/10XGenomics/cellranger/blob/aba5d379169ff0d4bee60e3d100df35752b90383/lib/rust/ - annotate_reads/src/main.rs - """ - # map the gene from reach record to an index in the sparse matrix - n_genes = len(gene_name_to_index) - - # track which tuples (cell_barcode, molecule_barcode, gene_name) we've encountered so far - observed_cell_molecule_gene_set: Set[Tuple[str, str, str]] = set() - - # COO sparse matrix entries - data: List[int] = [] - cell_indices: List[int] = [] - gene_indices: List[int] = [] - - # track which cells we've seen, and what the current cell number is - n_cells = 0 - cell_barcode_to_index: Dict[str, int] = {} - - grouped_records_generator = cls._get_alignments_grouped_by_query_name_generator( - bam_file, cell_barcode_tag, molecule_barcode_tag, open_mode=open_mode - ) - - for ( - query_name, - cell_barcode, - molecule_barcode, - input_alignments, - ) in grouped_records_generator: - - # modify alignments to include the gene name to the alignments to INTRONIC regions - alignments = input_alignments - - # only keep queries w/ well-formed UMIs - gene_name = None - if cell_barcode is None or molecule_barcode is None: - continue - - if len(alignments) == 1: - primary_alignment = 
alignments[0] - if ( - primary_alignment.has_tag(gene_name_tag) - and primary_alignment.has_tag("XF") - and primary_alignment.get_tag("XF") != "INTERGENIC" - ): - gene_name = primary_alignment.get_tag(gene_name_tag) - # overlaps multiple genes, drop query, and unfortunately there only one - # one alignment for this query - if len(gene_name.split(",")) != 1: - continue - else: - continue # drop query - else: # multi-map - implicated_gene_names: Set[str] = set() - for alignment in alignments: - if ( - alignment.has_tag(gene_name_tag) - and alignment.has_tag("XF") - and alignment.get_tag("XF") != "INTERGENIC" - ): - # consider its gene name only if it has only gene name - gene_name = alignment.get_tag(gene_name_tag) - if len(gene_name.split(",")) == 1: - implicated_gene_names.add(alignment.get_tag(gene_name_tag)) - - if len(implicated_gene_names) == 1: # only one gene - gene_name = implicated_gene_names.__iter__().__next__() - else: - continue # drop query - - if gene_name is None: - continue - - if ( - cell_barcode, - molecule_barcode, - gene_name, - ) in observed_cell_molecule_gene_set: - continue # optical/PCR duplicate -> drop query - else: - observed_cell_molecule_gene_set.add( - (cell_barcode, molecule_barcode, gene_name) - ) - - # find the indices that this molecule should correspond to - gene_index = gene_name_to_index[gene_name] - - # if we've seen this cell before, get its index, else set it - try: - cell_index = cell_barcode_to_index[cell_barcode] - except KeyError: - cell_index = n_cells - cell_barcode_to_index[cell_barcode] = n_cells - n_cells += 1 - - # record the molecule data - data.append(1) # one count of this molecule - cell_indices.append(cell_index) - gene_indices.append(gene_index) - - # convert into coo_matrix - coordinate_matrix = sp.coo_matrix( - (data, (cell_indices, gene_indices)), - shape=(n_cells, n_genes), - dtype=np.uint32, - ) - - # convert to a csr sparse matrix and return - col_index = np.asarray( - [ - k - for k, v in sorted( - 
gene_name_to_index.items(), key=operator.itemgetter(1) - ) - ] - ) - row_index = np.asarray( - [ - k - for k, v in sorted( - cell_barcode_to_index.items(), key=operator.itemgetter(1) - ) - ] - ) - - return cls(coordinate_matrix.tocsr(), row_index, col_index) - - def save(self, prefix: str) -> None: - sp.save_npz(prefix + ".npz", self._matrix, compressed=True) - np.save(prefix + "_row_index.npy", self._row_index) - np.save(prefix + "_col_index.npy", self._col_index) - - @classmethod - def load(cls, prefix: str) -> "CountMatrix": - matrix = sp.load_npz(prefix + ".npz") - row_index = np.load(prefix + "_row_index.npy") - col_index = np.load(prefix + "_col_index.npy") - return cls(matrix, row_index, col_index) - - @classmethod - def merge_matrices(cls, input_prefixes: str) -> "CountMatrix": - col_indices = [np.load(p + "_col_index.npy") for p in input_prefixes] - row_indices = [np.load(p + "_row_index.npy") for p in input_prefixes] - matrices = [sp.load_npz(p + ".npz") for p in input_prefixes] - - matrix: sp.csr_matrix = sp.vstack(matrices, format="csr") - # todo test that col_indices are all same shape - col_index = col_indices[0] - row_index = np.concatenate(row_indices) - return cls(matrix, row_index, col_index) - - @classmethod - def from_mtx( - cls, matrix_mtx: str, row_index_file: str, col_index_file: str - ) -> "CountMatrix": - """ - - Parameters - ---------- - matrix_mtx : str - file containing count matrix in matrix market sparse format - row_index_file : str - newline delimited row index file - col_index_file : str - newline delimited column index file - - Returns - ------- - CountMatrix - instance of class - """ - matrix: sp.csr_matrix = mmread(matrix_mtx).tocsr() - with open(row_index_file, "r") as fin: - row_index = np.array(fin.readlines()) - with open(col_index_file, "r") as fin: - col_index = np.array(fin.readlines()) - return cls(matrix, row_index, col_index) diff --git a/tools/scripts/sctools/build/lib/sctools/encodings.py 
b/tools/scripts/sctools/build/lib/sctools/encodings.py deleted file mode 100644 index 85f1cef8..00000000 --- a/tools/scripts/sctools/build/lib/sctools/encodings.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -Compressed Barcode Encoding Methods -=================================== - -.. currentmodule:: sctools - -This module defines several classes to encode DNA sequences in memory-efficient forms, using 2 bits -to encode bases of a 4-letter DNA alphabet (ACGT) or 3 bits to encode a 5-letter DNA alphabet -that includes the ambiguous call often included by Illumina base calling software (ACGTN). The -classes also contain several methods useful for efficient querying and manipulation of the encoded -sequence. - -Classes -------- -Encoding Encoder base class -ThreeBit Three bit DNA encoder / decoder -TwoBit Two bit DNA encoder / decoder - -""" - -import random -from typing import Mapping, AnyStr, Set - - -class Encoding: - """ - - Attributes - ---------- - encoding_map : TwoBitEncodingMap - Class that mimics a Mapping[bytes, str] where bytes must be a single byte encoded character - (encoder) - decoding_map : Mapping[int, bytes] - Dictionary that maps integers to bytes human-readable representations (decoder) - bits_per_base : int - number of bits used to encode each base - - Methods - ------- - encode(bytes_encoded: bytes) - encode a DNA string in a compressed representation - decode(integer_encoded: int) - decode a compressed DNA string into a human readable bytes format - gc_content(integer_encoded: int) - calculate the GC content of an encoded DNA string - hamming_distance(a: int, b: int) - calculate the hamming distance between two encoded DNA strings - - """ - - encoding_map: Mapping[AnyStr, int] = NotImplemented - decoding_map: Mapping[int, AnyStr] = NotImplemented - bits_per_base: int = NotImplemented - - @classmethod - def encode(cls, bytes_encoded: bytes) -> int: - """Encode a DNA bytes string. 
- - Parameters - ---------- - bytes_encoded : bytes - bytes DNA string - - Returns - ------- - encoded : int - Encoded DNA sequence - - """ - raise NotImplementedError - - def decode(self, integer_encoded: int) -> bytes: - """Decode a DNA bytes string. - - Parameters - ---------- - integer_encoded : bytes - Integer encoded DNA string - - Returns - ------- - decoded : bytes - Bytes decoded DNA sequence - - """ - raise NotImplementedError - - def gc_content(self, integer_encoded: int) -> int: - """Return the number of G or C nucleotides in `integer_encoded` - - Parameters - ---------- - integer_encoded : int - Integer encoded DNA string - - Returns - ------- - gc_content, int - number of bases in `integer_encoded` input that are G or C. - - """ - raise NotImplementedError - - @staticmethod - def hamming_distance(a, b) -> int: - """Calculate the hamming distance between two DNA sequences - - The hamming distance counts the number of bases that are not the same nucleotide - - Parameters - ---------- - a, b : int - integer encoded - - - Returns - ------- - d : int - hamming distance between a and b - """ - raise NotImplementedError - - -class TwoBit(Encoding): - """Encode a DNA sequence using a 2-bit encoding. - - Two-bit encoding uses 0 for an encoded nucleotide. As such, it cannot distinguish between - the end of sequence and trailing A nucleotides, and thus decoding these strings requires - knowledge of their length. Therefore, it is only appropriate for encoding fixed sequence - lengths - - In addition, in order to encode in 2-bit, N-nucleotides must be randomized to one of A, C, - G, and T. 
- - Parameters - ---------- - sequence_length : int - number of nucleotides that are being encoded - - """ - - __doc__ += Encoding.__doc__ - - def __init__(self, sequence_length: int): - self.sequence_length: int = sequence_length - - class TwoBitEncodingMap: - """Dict-like class that maps bytes to 2-bit integer representations - - Generates random nucleotides for ambiguous nucleotides e.g. N - - """ - - map_ = { - ord("A"): 0, - ord("C"): 1, - ord("T"): 2, - ord("G"): 3, - ord("a"): 0, - ord("c"): 1, - ord("t"): 2, - ord("g"): 3, - } - - iupac_ambiguous: Set[int] = {ord(c) for c in "MRWSYKVHDBNmrwsykvhdbn"} - - def __getitem__(self, byte: int) -> int: - try: - return self.map_[byte] - except KeyError: - if byte not in self.iupac_ambiguous: - raise KeyError(f"{chr(byte)} is not a valid IUPAC nucleotide code") - return random.randint(0, 3) - - encoding_map: TwoBitEncodingMap = TwoBitEncodingMap() - decoding_map: Mapping[int, bytes] = {0: b"A", 1: b"C", 2: b"T", 3: b"G"} - bits_per_base: int = 2 - - @classmethod - def encode(cls, bytes_encoded: bytes) -> int: - encoded = 0 - for character in bytes_encoded: - encoded <<= 2 - encoded += cls.encoding_map[character] - return encoded - - def decode(self, integer_encoded: int) -> bytes: - decoded = b"" - for _ in range(self.sequence_length): - decoded = self.decoding_map[integer_encoded & 3] + decoded - integer_encoded >>= 2 - return decoded - - def gc_content(self, integer_encoded: int) -> int: - i = 0 - for _ in range(self.sequence_length): - i += integer_encoded & 1 - integer_encoded >>= 2 - return i - - @staticmethod - def hamming_distance(a: int, b: int) -> int: - difference = a ^ b - d_hamming = 0 - while difference: - if difference & 3: - d_hamming += 1 - difference >>= 2 - return d_hamming - - -class ThreeBit(Encoding): - """Encode a DNA sequence using a 3-bit encoding. 
- - Since no bases are encoded as 0, an empty triplet is interpreted as the end of the encoded - string; Three-bit encoding can be used to encode and decode strings without knowledge of their - length. - - """ - - __doc__ += Encoding.__doc__ - - def __init__(self, *args, **kwargs): - """ - Notes - ----- - args and kwargs are not used, but allow ThreeBit to be initialized the same way as TwoBit, - despite not requiring a sequence length parameter. - - """ - pass - - class ThreeBitEncodingMap: - """Dict-like class that maps bytes to 3-bit integer representations - - All IUPAC ambiguous codes are treated as "N" - - """ - - # C: 1, A: 2, G: 3, T: 4, N: 6; # note, not using 0 - map_ = { - ord("C"): 1, - ord("A"): 2, - ord("G"): 3, - ord("T"): 4, - ord("N"): 6, - ord("c"): 1, - ord("a"): 2, - ord("g"): 3, - ord("t"): 4, - ord("n"): 6, - } - - def __getitem__(self, byte: int) -> int: - try: - return self.map_[byte] - except KeyError: - return 6 # any non-standard nucleotide gets "N" - - encoding_map: ThreeBitEncodingMap = ThreeBitEncodingMap() - decoding_map: Mapping[int, bytes] = {1: b"C", 2: b"A", 3: b"G", 4: b"T", 6: b"N"} - bits_per_base: int = 3 - - @classmethod - def encode(cls, bytes_encoded: bytes) -> int: - encoded = 0 - for character in bytes_encoded: - encoded <<= 3 - encoded += cls.encoding_map[character] - return encoded - - @classmethod - def decode(cls, integer_encoded: int) -> bytes: - decoded = b"" - while integer_encoded: - decoded = cls.decoding_map[integer_encoded & 7] + decoded - integer_encoded >>= 3 - return decoded - - @classmethod - def gc_content(cls, integer_encoded: int) -> int: - i = 0 - while integer_encoded: - i += integer_encoded & 1 - integer_encoded >>= 3 - return i - - @staticmethod - def hamming_distance(a: int, b: int) -> int: - difference = a ^ b - d_hamming = 0 - while difference: - if difference & 7: - d_hamming += 1 - difference >>= 3 - return d_hamming diff --git a/tools/scripts/sctools/build/lib/sctools/fastq.py 
b/tools/scripts/sctools/build/lib/sctools/fastq.py deleted file mode 100644 index c6749de0..00000000 --- a/tools/scripts/sctools/build/lib/sctools/fastq.py +++ /dev/null @@ -1,404 +0,0 @@ -""" -Efficient Fastq Iterators and Representations -============================================= - -.. currentmodule:: sctools - -This module implements classes for representing fastq records, reading and writing them, and -extracting parts of fastq sequence for transformation into bam format tags - -Methods -------- -extract_barcode(record, embedded_barcode) - extract a barcode, defined by `embedded_barcode` from `record` - -Classes -------- -Record Represents fastq records (input as bytes) -StrRecord Represents fastq records (input as str) -Reader Opens and iterates over fastq files -EmbeddedBarcodeGenerator Generates barcodes from a fastq file -BarcodeGeneratorWithCorrectedCellBarcodes Generates (corrected) barcodes from a fastq file - -References ----------- -https://en.wikipedia.org/wiki/FASTQ_format - -""" - -from collections import namedtuple -from typing import Iterable, AnyStr, Iterator, Union, Tuple - -from . import reader, consts -from .barcode import ErrorsToCorrectBarcodesMap - - -# todo the inheritance pattern of this class is a bit confusing, particularly the str vs. bytes -# in the daughter classes -class Record: - """Fastq Record. 
- - Parameters - ---------- - record : Iterable[bytes] - Iterable of 4 bytes strings that comprise a fastq record - - Attributes - ---------- - name : bytes - fastq record name - sequence : bytes - fastq nucleotide sequence - name2 : bytes - second fastq record name field (rarely used) - quality : bytes - base call quality for each nucleotide in sequence - - Methods - ------- - average_quality() - The average quality of the fastq record - - """ - - __slots__ = ["_name", "_sequence", "_name2", "_quality"] - - def __init__(self, record: Iterable[AnyStr]): - # use the setter functions - self.name, self.sequence, self.name2, self.quality = record - - @property - def name(self) -> AnyStr: - return self._name - - @name.setter - def name(self, value): - """fastq record name""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ name must be bytes") - elif not value.startswith(b"@"): - raise ValueError("FASTQ name must start with @") - else: - self._name = value - - @property - def sequence(self) -> AnyStr: - return self._sequence - - @sequence.setter - def sequence(self, value): - """FASTQ nucleotide sequence""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ sequence must be str or bytes") - else: - self._sequence = value - - @property - def name2(self) -> AnyStr: - return self._name2 - - @name2.setter - def name2(self, value): - """second FASTQ record name field (rarely used)""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ name2 must be str or bytes") - else: - self._name2 = value - - @property - def quality(self) -> AnyStr: - return self._quality - - @quality.setter - def quality(self, value): - """FASTQ record base call quality scores""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ quality must be str or bytes") - else: - self._quality = value - - def __bytes__(self): - return b"".join((self.name, self.sequence, self.name2, self.quality)) - - def __str__(self): - return b"".join((self.name, 
self.sequence, self.name2, self.quality)).decode() - - def __repr__(self): - return "Name: %s\nSequence: %s\nName2: %s\nQuality: %s\n" % ( - self.name, - self.sequence, - self.name2, - self.quality, - ) - - def __len__(self): - return len(self.sequence) - - def average_quality(self) -> float: - """return the average quality of this record""" - # -33 due to solexa/illumina phred conversion - return sum(c for c in self.quality[:-1]) / (len(self.quality) - 1) - 33 - - -class StrRecord(Record): - """Fastq Record. - - Parameters - ---------- - record : Iterable[str] - Iterable of 4 bytes strings that comprise a FASTQ record - - Attributes - ---------- - name : str - FASTQ record name - sequence : str - FASTQ nucleotide sequence - name2 : str - second FASTQ record name field (rarely used) - quality : str - base call quality for each nucleotide in sequence - - Methods - ------- - average_quality() - The average quality of the FASTQ record - - """ - - def __bytes__(self): - return "".join((self.name, self.sequence, self.name2, self.quality)).encode() - - def __str__(self): - return "".join((self.name, self.sequence, self.name2, self.quality)) - - # todo is this method necessary? - @property - def name(self) -> str: - return self._name - - @name.setter - def name(self, value): - """FASTQ record name""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ name must be str or bytes") - if not value.startswith("@"): - raise ValueError("FASTQ name must start with @") - else: - self._name = value - - def average_quality(self) -> float: - """return the average quality of this record""" - b = self.quality[:-1].encode() - return ( - sum(c for c in b) / len(b) - 33 - ) # -33 due to solexa/illumina phred conversion - - -class Reader(reader.Reader): - """Fastq Reader that defines some special methods for reading and summarizing FASTQ data. 
- - Simple reader class that exposes an __iter__ and __len__ method - - Examples - -------- - #todo add examples - - See Also - -------- - sctools.reader.Reader - - References - ---------- - https://en.wikipedia.org/wiki/FASTQ_format - - """ - - @staticmethod - def _record_grouper(iterable): - """Groups contents of an iterator, yielding 4 objects at a time instead of one - - This is a somewhat complex python function. It creates 4 iterators on the same iterable; - each moves the pointer to the position in the iterable forward when called, yielding 4 - objects at a time - - Returns - ------- - grouped_iterator : Iterator[Str], Iterator[Str], Iterator[Str], Iterator[Str] - - """ - args = [iter(iterable)] * 4 - return zip(*args) - - def __iter__(self) -> Iterator[Tuple[str]]: - """Iterate over a FASTQ file, returning records - - Yields - ------ - fastq_record : Tuple[str] - tuple of length 4 containing the name, sequence, name2, and quality for a FASTQ record - - """ - record_type = StrRecord if self._mode == "r" else Record - for record in self._record_grouper(super().__iter__()): - yield record_type(record) - - -# namedtuple that defines the start and end position of a barcode sequence and provides the name -# for both a quality and sequence tag -EmbeddedBarcode = namedtuple("Tag", ["start", "end", "sequence_tag", "quality_tag"]) - - -def extract_barcode( - record, embedded_barcode -) -> Tuple[Tuple[str, str, str], Tuple[str, str, str]]: - """Extracts barcodes from a FASTQ record at positions defined by an EmbeddedBarcode object. 
- - Parameters - ---------- - record : FastqRecord - Record to extract from - embedded_barcode : EmbeddedBarcode - Defines the barcode start and end positions and the tag name for the sequence and quality - tags - - Returns - ------- - sequence_tag : Tuple[str, str, 'Z'] - sequence tag identifier, sequence, SAM tag type ('Z' implies a string tag) - quality_tag : Tuple[str, str, 'Z'] - quality tag identifier, quality, SAM tag type ('Z' implies a string tag) - - """ - seq = record.sequence[embedded_barcode.start : embedded_barcode.end] - qual = record.quality[embedded_barcode.start : embedded_barcode.end] - return ( - (embedded_barcode.sequence_tag, seq, "Z"), - (embedded_barcode.quality_tag, qual, "Z"), - ) - - -# todo the reader subclasses need better docs -class EmbeddedBarcodeGenerator(Reader): - """Generate barcodes from a FASTQ file(s) from positions defined by EmbeddedBarcode(s) - - Extracted barcode objects are produced in a form that is consumable by pysam's bam and sam - set_tag methods. - - Parameters - ---------- - embedded_barcodes : Iterable[EmbeddedBarcode] - tag objects defining start and end of the sequence containing the tag, and the tag - identifiers for sequence and quality tags - fastq_files : str | List, optional - FASTQ file or files to be read. (default = sys.stdin) - mode : {'r', 'rb'}, optional - open mode for FASTQ files. If 'r', return string. If 'rb', return bytes (default = 'r') - - """ - - def __init__(self, fastq_files, embedded_barcodes, *args, **kwargs): - super().__init__(files=fastq_files, *args, **kwargs) - self.embedded_barcodes = embedded_barcodes - - def __iter__(self): - """iterates over barcodes extracted from FASTQ""" - for record in super().__iter__(): # iterates records; we extract barcodes. 
- barcodes = [] - for barcode in self.embedded_barcodes: - barcodes.extend(extract_barcode(record, barcode)) - yield barcodes - - -# todo the reader subclasses need better docs -class BarcodeGeneratorWithCorrectedCellBarcodes(Reader): - """Generate barcodes from FASTQ file(s) from positions defined by EmbeddedBarcode(s) - - Extracted barcode objects are produced in a form that is consumable by pysam's bam and sam - set_tag methods. In this class, one EmbeddedBarcode must be defined as an - `embedded_cell_barcode`, which is checked against a whitelist and error corrected during - generation - - Parameters - ---------- - fastq_files : str | List, optional - FASTQ file or files to be read. (default = sys.stdin) - mode : {'r', 'rb'}, optional - open mode for fastq files. If 'r', return string. If 'rb', return bytes (default = 'r') - whitelist : str - whitelist file containing "correct" cell barcodes for an experiment - embedded_cell_barcodes : EmbeddedBarcode - EmbeddedBarcode containing information about the position and names of cell barcode tags - other_embedded_barcodes : Iterable[EmbeddedBarcode], optional - tag objects defining start and end of the sequence containing the tag, and the tag - identifiers for sequence and quality tags (default = None) - - Methods - ------- - extract_cell_barcode(record: Record, cb: str) - - """ - - def __init__( - self, - fastq_files: Union[str, Iterable[str]], - embedded_cell_barcode: EmbeddedBarcode, - whitelist: str, - other_embedded_barcodes: Iterable[EmbeddedBarcode] = tuple(), - *args, - **kwargs - ): - - super().__init__(files=fastq_files, *args, **kwargs) - if isinstance(other_embedded_barcodes, (list, tuple)): - self.embedded_barcodes = other_embedded_barcodes - else: - raise TypeError( - "if passed, other_embedded_barcodes must be a list or tuple" - ) - - self._error_mapping = ErrorsToCorrectBarcodesMap.single_hamming_errors_from_whitelist( - whitelist - ) - self.embedded_cell_barcode = embedded_cell_barcode - - def 
__iter__(self): - """iterates over barcodes extracted from fastq""" - for record in super().__iter__(): # iterates records; we extract barcodes. - barcodes = [] - - barcodes.extend( - self.extract_cell_barcode(record, self.embedded_cell_barcode) - ) - for barcode in self.embedded_barcodes: - barcodes.extend(extract_barcode(record, barcode)) - - yield barcodes - - def extract_cell_barcode(self, record: Tuple[str], cb: EmbeddedBarcode): - """Extract a cell barcode from a fastq record - - Parameters - ---------- - record : Tuple[str] - fastq record comprised of four strings: name, sequence, name2, and quality - cb : EmbeddedBarcode - defines the position and tag identifier for a call barcode - - Returns - ------- - sequence_tag : Tuple[str, str, 'Z'] - raw sequence tag identifier, sequence, SAM tag type ('Z' implies a string tag) - quality_tag : Tuple[str, str, 'Z'] - quality tag identifier, quality, SAM tag type ('Z' implies a string tag) - corrected_tag : Optional[Tuple[str, str, 'Z']] - Whitelist verified sequence tag. Only present if the raw sequence tag is in the - whitelist or within 1 hamming distance of one of its barcodes - - """ - seq_tag, qual_tag = extract_barcode(record, cb) - try: - corrected_cb = self._error_mapping.get_corrected_barcode(seq_tag[1]) - return seq_tag, qual_tag, (consts.CELL_BARCODE_TAG_KEY, corrected_cb, "Z") - except KeyError: - return seq_tag, qual_tag diff --git a/tools/scripts/sctools/build/lib/sctools/groups.py b/tools/scripts/sctools/build/lib/sctools/groups.py deleted file mode 100644 index 2a3592f2..00000000 --- a/tools/scripts/sctools/build/lib/sctools/groups.py +++ /dev/null @@ -1,195 +0,0 @@ -""" -Group QC outputs - -""" - -from crimson import picard -import os -import pandas as pd - - -def write_aggregated_picard_metrics_by_row(file_names, output_name): - """Command line entrypoint to parse, aggreagete and write Picard row metrics. - Parameters - ---------- - args: - file_names: array of files. 
the basename of inputs should be formated - as 'samplename_qc',such as - "samplename_qc.alignment_summary_metrics.txt" and "samplename_qc.insert_size_metrics.txt" - output_name: prefix of output file name without extension. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - # initial output - metrics = {} - d = pd.DataFrame() - for file_name in file_names: - cell_id = os.path.basename(file_name).split("_qc")[0] - metrics[cell_id] = {} - parsed = picard.parse(file_name) - class_name = parsed["metrics"]["class"].split(".")[2] - # Alignment metrics return multiple lines, - # but only output PAIRED-READS/third line - contents = parsed["metrics"]["contents"] - if class_name == "AlignmentSummaryMetrics": - # parse out PE, R1 and R2. If the reads are unpaired, the contents - # will be a single dict rather than a list of dicts. - if isinstance(contents, dict): - contents = [contents] - rows = {} - for m in contents: - cat = m["CATEGORY"] - rows.update( - { - k + "." + cat: v - for k, v in m.items() - if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"] - } - ) - # sometimes(very rare), insertion metrics also return multiple lines - # results to include TANDEM repeats. but we only output the first line. - elif class_name == "InsertSizeMetrics": - # if the element counts is less than 21, - # it means insertion metrics returns multiple line results. - if len(contents) < 21: - rows = contents[0] - else: - rows = contents - else: - # other metrics(so far) only return one line results. - rows = contents - metrics[cell_id].update( - { - k: rows[k] - for k in rows - if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"] - } - ) - df = pd.DataFrame.from_dict(metrics, orient="columns") - df.insert(0, "Class", class_name) - d = d.append(df) - d_T = d.T - d_T.to_csv(output_name + ".csv") - - -def write_aggregated_picard_metrics_by_table(file_names, output_name): - """Command line entrypoint to parse and write Picard table metrics. 
- Parameters - ---------- - args: - file_names: array of files.the basename of inputs should be formated as 'samplename_qc' - output_name: prefix of output file name. the basename of outputs - includes the Picard metrics class name. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - for file_name in file_names: - cell_id = os.path.basename(file_name).split("_qc")[0] - class_name = os.path.basename(file_name).split(".")[1] - parsed = picard.parse(file_name) - dat = pd.DataFrame.from_dict(parsed["metrics"]["contents"]) - dat.insert(0, "Sample", cell_id) - dat.to_csv(output_name + "_" + class_name + ".csv", index=False) - - -def write_aggregated_qc_metrics(file_names, output_name): - """Command line entrypoint to merge Picard metrics along with RSEM and HISAT2 log - Parameters - ---------- - args: - file_names: array of files,such as Picard row metric, hisat2 metrics. - output_name: prefix of output file name. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - df = pd.DataFrame() - for file_name in file_names: - dat = pd.read_csv(file_name, index_col=0) - print(dat.index) - print(df.head()) - df = pd.concat([df, dat], axis=1, join="outer") - df.to_csv(output_name + ".csv", index=True) - - -def parse_hisat2_log(file_names, output_name): - """Command line entrypoint parse, aggreagete and write HISAT2 logs - Parameters - ---------- - args: - file_names: array of HISAT2 log files. Basename of file indicates - the alignment references 'samplename_qc.log' indicates the genome reference and - 'samplename_rsem.log' indicates the transcriptome reference alignment. - output_name: prefix of output file name. - Returns - ---------- - return: 0 - return if the program completes successfully. 
- """ - metrics = {} - tag = "NONE" - for file_name in file_names: - if "_qc" in file_name: - cell_id = os.path.basename(file_name).split("_qc")[0] - tag = "HISAT2G" - elif "_rsem" in file_name: - cell_id = os.path.basename(file_name).split("_rsem")[0] - tag = "HISAT2T" - with open(file_name) as f: - dat = f.readlines() - d = [x.strip().split(":") for x in dat] - # remove the first row of each section. - d.pop(0) - metrics[cell_id] = {x[0]: x[1].strip().split(" ")[0] for x in d} - df = pd.DataFrame.from_dict(metrics, orient="columns") - df.insert(0, "Class", tag) - df_T = df.T - df_T.to_csv(output_name + ".csv") - - -def parse_rsem_cnt(file_names, output_name): - """Command line entrypoint parse, aggreagete and write RSEM cnt - Parameters - ---------- - args: - file_names: array of RSEM cnt files. The basename of inputs should be - 'samplename_rsem.cnt' - output_name: prefix of output file name. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - metrics = {} - for file_name in file_names: - cell_id = os.path.basename(file_name).split("_rsem")[0] - i = 0 - with open(file_name) as f: - while i < 3: - if i == 0: - [N0, N1, N2, N_tot] = f.readline().strip().split(" ") - elif i == 1: - [n_unique, n_multi, n_uncertain] = f.readline().strip().split(" ") - elif i == 2: - [n_hits, read_type] = f.readline().strip().split(" ") - i = i + 1 - metrics[cell_id] = { - "unalignable reads": N0, - "alignable reads": N1, - "filtered reads": N2, - "total reads": N_tot, - "unique aligned": n_unique, - "multiple mapped": n_multi, - "total alignments": n_hits, - "strand": read_type, - "uncertain reads": n_uncertain, - } - df = pd.DataFrame.from_dict(metrics, orient="columns") - df.insert(0, "Class", "RSEM") - df_T = df.T - df_T.to_csv(output_name + ".csv") diff --git a/tools/scripts/sctools/build/lib/sctools/gtf.py b/tools/scripts/sctools/build/lib/sctools/gtf.py deleted file mode 100644 index 7f574a9e..00000000 --- 
a/tools/scripts/sctools/build/lib/sctools/gtf.py +++ /dev/null @@ -1,446 +0,0 @@ -""" -GTF Records and Iterators -========================= - -.. currentmodule:: sctools - -This module defines a GTF record class and a Reader class to iterate over GTF-format files - -Classes -------- -Record Data class that exposes GTF record fields by name -Reader GTF file reader that yields GTF Records - -References ----------- -https://useast.ensembl.org/info/website/upload/gff.html -""" - -import logging -import string -import re -from typing import List, Dict, Generator, Iterable, Union, Set - -from . import reader - -_logger = logging.getLogger(__name__) - - -class GTFRecord: - """Data class for storing and interacting with GTF records - - Subclassed to produce exon, transcript, and gene-specific record types. - A GTF record has 8 fixed fields which are followed by optional fields separated by ;\t, which - are stored by this class in the attributes field and accessible by get_attribute. Fixed fields - are accessible by name. - - Parameters - ---------- - record : str - an unparsed GTF record - - Attributes - ---------- - seqname : str - The name of the sequence (often chromosome) this record is found on. - chromosome : str - Synonym for seqname. - source : str - The group responsible for generating this annotation. - feature : str - The type of record (e.g. gene, exon, ...). - start : str - The start position of this feature relative to the beginning of seqname. - end : str - The end position of this feature relative to the beginning of seqname.... - score : str - The annotation score. Rarely used. 
- strand : {'+', '-'} - The strand of seqname that this annotation is found on - frame : {'0', '1', '2'} - '0' indicates that the first base of the feature is the first base of a codon, - '1' that the second base is the first base of a codon, and so on - size : int - the number of nucleotides spanned by this feature - - Methods - ------- - get_attribute(key: str) - attempt to retrieve a variable field with name equal to `key` - set_attribute(key: str, value: str) - set variable field `key` equal to `value`. Overwrites `key` if already present. - - """ - - __slots__ = ["_fields", "_attributes"] - - _del_letters: str = string.ascii_letters - _del_non_letters: str = "".join( - set(string.printable).difference(string.ascii_letters) - ) - - def __init__(self, record: str): - fields: List[str] = record.strip(";\n").split("\t") - - self._fields: List[str] = fields[:8] - - self._attributes: Dict[str, str] = {} - for field in fields[8].split(";"): - try: - key, _, value = field.strip().partition(" ") - self._attributes[key] = value.strip('"') - except Exception: - raise RuntimeError( - f'Error parsing field "{field}" of GTF record "{record}"' - ) - - def __repr__(self): - return "" % self.__str__() - - def __bytes__(self): - return self.__str__().encode() - - def __str__(self): - return "\t".join(self._fields) + self._format_attribute() + "\n" - - def __hash__(self) -> int: - return hash(self.__str__()) - - def _format_attribute(self): - return " ".join('%s "%s";' % (k, v) for k, v in self._attributes.items()) - - @property - def seqname(self) -> str: - return self._fields[0] - - @property - def chromosome(self) -> str: - return self._fields[0] # synonym for seqname - - @property - def source(self) -> str: - return self._fields[1] - - @property - def feature(self) -> str: - return self._fields[2] - - @property - def start(self) -> int: - return int(self._fields[3]) - - @property - def end(self) -> int: - return int(self._fields[4]) - - @property - def score(self) -> str: - 
return self._fields[5] - - @property - def strand(self) -> str: - return self._fields[6] - - @property - def frame(self) -> str: - return self._fields[7] - - @property - def size(self) -> int: - size = self.end - self.start - if size < 0: - raise ValueError(f"Invalid record: negative size {size} (start > end)") - else: - return size - - def get_attribute(self, key) -> str: - """access an item from the attribute field of a GTF file. - - Parameters - ---------- - key : str - Item to retrieve - - Returns - ------- - value : str - Contents of variable attribute `key` - - Raises - ------ - KeyError - if there is no variable attribute `key` associated with this record - - """ - return self._attributes.get(key) - - def set_attribute(self, key, value) -> None: - """Set variable attribute `key` equal to `value` - - If attribute `key` is already set for this record, its contents are overwritten by `value` - - Parameters - ---------- - key : str - attribute name - value : str - attribute content - - """ - self._attributes[key] = value - - def __eq__(self, other): - return hash(self) == hash(other) - - def __ne__(self, other): - return not self.__eq__(other) - - -class Reader(reader.Reader): - """GTF file iterator - - Parameters - ---------- - files : Union[str, List], optional - File(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r'). - header_comment_char : str, optional - lines beginning with this character are skipped (default = '#') - - Methods - ------- - filter(retain_types: Iterable[str]) - Iterate over a GTF file, only yielding records in `retain_types`. 
- __iter__() - iterate over GTF records in file, yielding `Record` objects - - See Also - -------- - sctools.reader.Reader - - """ - - def __init__(self, files="-", mode="r", header_comment_char="#"): - super().__init__( - files, mode, header_comment_char - ) # has different default args from super - - def __iter__(self): - for line in super().__iter__(): - yield GTFRecord(line) - - def filter(self, retain_types: Iterable[str]) -> Generator: - """Iterate over a GTF file, returning only record whose feature type is in retain_types. - - Features are stored in GTF field 2. - - Parameters - ---------- - retain_types : Iterable[str] - Record feature types to retain. - - Yields - ------ - gtf_record : Record - gtf `Record` object - - """ - retain_types = set(retain_types) - for record in self: - if record.feature in retain_types: - yield record - - -# todo this lenient behavior is deemed to change in the future (warning -> exception) -def _resolve_multiple_gene_names(gene_name: str): - _logger.warning( - f'Multiple entries encountered for "{gene_name}". Please validate the input GTF file(s). ' - f"Skipping the record for now; in the future, this will be considered as a " - f"malformed GTF file." - ) - - -def get_mitochondrial_gene_names( - files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#" -) -> Set[str]: - """Extract mitocholdrial gene names from GTF file(s) and returns a set of mitochondrial - gene id occurrence in the given file(s). - - Parameters - ---------- - files : Union[str, List], optional - File(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r'). 
- header_comment_char : str, optional - lines beginning with this character are skipped (default = '#') - - Returns - ------- - Set(str) - A set of the mitochondrial gene ids - """ - - mitochondrial_gene_ids: Set[str] = set() - for record in Reader(files, mode, header_comment_char).filter( - retain_types=["gene"] - ): - gene_name = record.get_attribute("gene_name") - gene_id = record.get_attribute("gene_id") - - if gene_name is None: - raise ValueError( - f"Malformed GTF file detected. Record is of type gene but does not have a " - f'"gene_name" field: {record}' - ) - if re.match("^mt-", gene_name, re.IGNORECASE): - if gene_id not in mitochondrial_gene_ids: - mitochondrial_gene_ids.add(gene_id) - - return mitochondrial_gene_ids - - -def extract_gene_names( - files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#" -) -> Dict[str, int]: - """Extract gene names from GTF file(s) and returns a map from gene names to their corresponding - occurrence orders in the given file(s). - - Parameters - ---------- - files : Union[str, List], optional - File(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r'). - header_comment_char : str, optional - lines beginning with this character are skipped (default = '#') - - Returns - ------- - Dict[str, int] - A map from gene names to their linear index - """ - gene_name_to_index: Dict[str, int] = dict() - gene_index = 0 - for record in Reader(files, mode, header_comment_char).filter( - retain_types=["gene"] - ): - gene_name = record.get_attribute("gene_name") - if gene_name is None: - raise ValueError( - f"Malformed GTF file detected. 
Record is of type gene but does not have a " - f'"gene_name" field: {record}' - ) - if gene_name in gene_name_to_index: - _resolve_multiple_gene_names(gene_name) - continue - gene_name_to_index[gene_name] = gene_index - gene_index += 1 - return gene_name_to_index - - -def extract_extended_gene_names( - files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#" -) -> Dict[str, List[tuple]]: - """Extract extended gene names from GTF file(s) and returns a map from gene names to their corresponding - occurrence locations the given file(s). - - Parameters - ---------- - files : Union[str, List], optional - File(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r'). - header_comment_char : str, optional - lines beginning with this character are skipped (default = '#') - - Returns - ------- - Dict[str, List[tuple]] - A dictionary of chromosome names mapping to a List of tuples, each containing - a range as the the first element and a gene name as the second. - Dict[str, List(Tuple((start,end), gene))) - """ - gene_name_to_start_end = dict() - for record in Reader(files, mode, header_comment_char).filter( - retain_types=["gene"] - ): - gene_name = record.get_attribute("gene_name") - if gene_name is None: - raise ValueError( - f"Malformed GTF file detected. 
Record is of type gene but does not have a " - f'"gene_name" field: {record}' - ) - # find gene collisions - if gene_name in gene_name_to_start_end: - _resolve_multiple_gene_names(gene_name) - continue - if record.chromosome not in gene_name_to_start_end: - gene_name_to_start_end[record.chromosome] = dict() - gene_name_to_start_end[record.chromosome][gene_name] = ( - record.start, - record.end, - ) - gene_locations = dict() - # For each chromosome invert the map to be in List[( (start,end), genename )] and sort it by start - for chromosome in gene_name_to_start_end: - gene_locations[chromosome] = [ - (locs, key) for key, locs in gene_name_to_start_end[chromosome].items() - ] - # Sort by starting location - gene_locations[chromosome].sort(key=lambda x: x[0]) - return gene_locations - - -def extract_gene_exons( - files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#" -) -> Dict[str, List[tuple]]: - """Extract extended gene names from GTF file(s) and returns a map from gene names to the the - list of exons in the ascending order of the start positions file(s). - - Parameters - ---------- - files : Union[str, List], optional - File(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r'). - header_comment_char : str, optional - lines beginning with this character are skipped (default = '#') - - Returns - ------- - Dict[str, List[tuple]] - A dictionary of chromosome names mapping to a List of tuples, each containing - a the exons in the ascending order of the start positions. - Dict[str, List(Tuple((start,end), gene))) - """ - gene_name_to_start_end = dict() - for record in Reader(files, mode, header_comment_char).filter( - retain_types=["exon"] - ): - gene_name = record.get_attribute("gene_name") - if gene_name is None: - raise ValueError( - f"Malformed GTF file detected. 
Record is of type gene but does not have a " - f'"gene_name" field: {record}' - ) - if record.chromosome not in gene_name_to_start_end: - gene_name_to_start_end[record.chromosome] = dict() - - if gene_name not in gene_name_to_start_end[record.chromosome]: - gene_name_to_start_end[record.chromosome][gene_name] = [] - - gene_name_to_start_end[record.chromosome][gene_name].append( - (record.start, record.end) - ) - - gene_locations_exons = dict() - # For each chromosome invert the map to be in List[( (start,end), genename )] and sort it by start - for chromosome in gene_name_to_start_end: - gene_locations_exons[chromosome] = [ - (locs, key) for key, locs in gene_name_to_start_end[chromosome].items() - ] - # Sort by starting location - gene_locations_exons[chromosome].sort(key=lambda x: x[0]) - return gene_locations_exons diff --git a/tools/scripts/sctools/build/lib/sctools/metrics/README.md b/tools/scripts/sctools/build/lib/sctools/metrics/README.md deleted file mode 100644 index 8ee554ae..00000000 --- a/tools/scripts/sctools/build/lib/sctools/metrics/README.md +++ /dev/null @@ -1,59 +0,0 @@ -## Metric Processing -This module implements a metric suite that generates information on data quality at the level of -both cells and genes. This QC information aligns with the cells and genes that make up the -expression matrix, providing easy access to information that the user can examine to make decisions -about which cells or genes are of adequate quality to include in downstream processing. - -Metric processing in sctools can be run on large individual files, but also implements a map-reduce -architecture execution at production scale. Specifically, the workflow is as follows: - -1. Chunk the input bam file using `SplitBam`, which generates several chunks, each of which is -guaranteed to contain all data for any cell it contains -2. 
Sort each chunk by cell, gene, and molecule tags to ensure that all the reads associated with -a molecule are stored sequentially by cell (`CalculateCellMetrics`) or by gene -(`CalculateGeneMetrics`) -3. For each cell or gene, parse the information by molecule, which typically loads fewer than -10,000 records into memory at a time. -4. Merge data across chunks using `MergeCellMetrics` or `MergeGeneMetrics`. - -This map-reduce approach is currently implemented by the -[HCA 3' pipeline](https://github.com/HumanCellAtlas/skylab/blob/master/pipelines/optimus/Optimus.wdl), -but an abbreviated WDL could be made in the future which would contain: - -``` -1. SplitBamByCellBarcode -2. scatter[CalculateMetrics] -3. MergeMetrics -``` - -## Implementation Details: - -This module implements 4 base classes that carry out metric processing. These are: - -``` -MetricAggregator: - - CellMetricAggregator - - GeneMetricAggregator - -MetricGatherer: - - CellMetricGatherer - - GeneMetricGatherer - -MetricCSVWriter - -MergeMetrics: - - MergeCellMetrics - - MergeGeneMetrics -``` -MetricGatherer defines generator functions to group records into molecules, the bam parsing pattern -necessary to process data iteratively. - -MetricAggregator stores the information for a unit of the relevant data (cell, gene), -and processses all the records with the `.parse_records()` method. - -When all records of a single unit (cell, gene) have been processed, `.finalize()` is called to -calculate any higher-order metrics (for example, the variance in quality scores across reads of the -cell or gene), and it is written to file by `MetricSCVWriter`. - -MergeMetrics merges multiple metric outputs from the scattered chunks. This is a trivial -concatenation in the case of cell metrics, and a more complex merge in the case of gene metrics. 
diff --git a/tools/scripts/sctools/build/lib/sctools/metrics/__init__.py b/tools/scripts/sctools/build/lib/sctools/metrics/__init__.py deleted file mode 100644 index 9ba20677..00000000 --- a/tools/scripts/sctools/build/lib/sctools/metrics/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa -from . import aggregator -from . import gatherer -from . import merge diff --git a/tools/scripts/sctools/build/lib/sctools/metrics/aggregator.py b/tools/scripts/sctools/build/lib/sctools/metrics/aggregator.py deleted file mode 100644 index 2d85199d..00000000 --- a/tools/scripts/sctools/build/lib/sctools/metrics/aggregator.py +++ /dev/null @@ -1,595 +0,0 @@ -""" -Sequence Metric Aggregators -=========================== - -.. currentmodule:: sctools.metrics - -This module provides classes useful for aggregating metric information for individual cells or -genes. These classes consume BAM files that have been pre-sorted such that all sequencing reads -that correspond to the molecules of a cell (CellMetrics) or the molecules of a gene (GeneMetrics) -are yielded sequentially. - -Classes -------- - -.. autosummary:: - :toctree: generated/ - - MetricAggregatorBase Aggregator Base Class - GeneMetrics Class to iteratively calculate metrics for a gene (by molecule) - CellMetrics Class to iteratively calculate metrics for a cell (by molecule) - -Notes ------ -This module can be rewritten with dataclass when python 3.7 stabilizes, see -https://www.python.org/dev/peps/pep-0557/ - - -See Also --------- -sctools.metrics.gatherer -sctools.metrics.merge -sctools.metrics.writer - -""" - - -from typing import Iterable, Tuple, Counter, List, Sequence - -import numpy as np -import pysam - -from sctools import consts -from sctools.stats import OnlineGaussianSufficientStatistic - - -class MetricAggregator: - """Metric Aggregator Base Class - - The ``MetricAggregator`` class defines a set of metrics that can be extracted from an - aligned bam file. 
It defines all the metrics that are general across genes and cells. This - class is subclassed by ``GeneMetrics`` and ``CellMetrics``, which define data-specific metrics - in the ``parse_extra_fields`` method. An instance of ``GeneMetrics`` or ``CellMetrics`` is - instantiated for each gene or molecule in a bam file, respectively. - - Attributes - ---------- - n_reads : int - The number of reads associated with this entity - noise_reads : int, NotImplemented - Number of reads that are categorized by 10x genomics cellranger as "noise". Refers to - long polymers, or reads with high numbers of N (ambiguous) nucleotides - perfect_molecule_barcodes : int - The number of reads with molecule barcodes that have no errors (cell barcode tag == raw barcode tag) - reads_mapped_exonic : int - The number of reads for this entity that are mapped to exons - reads_mapped_intronic : int - The number of reads for this entity that are mapped to introns - reads_mapped_utr : int - The number of reads for this entity that are mapped to 3' untranslated regions (UTRs) - reads_mapped_uniquely : int - The number of reads mapped to a single unambiguous location in the genome - reads_mapped_multiple : int - The number of reads mapped to multiple genomic positions with equal confidence - # todo make sure equal confidence is accurate - duplicate_reads : int - The number of reads that are duplicates (see README.md for defition of a duplicate) - spliced_reads : int - The number of reads that overlap splicing junctions - antisense_reads : int - The number of reads that are mapped to the antisense strand instead of the transcribed - strand - molecule_barcode_fraction_bases_above_30_mean : float - The average fraction of bases in molecule barcodes that receive quality scores greater than - 30 across the reads of this entity - molecule_barcode_fraction_bases_above_30_variance : float - The variance in the fraction of bases in molecule barcodes that receive quality scores - greater than 30 across the 
reads of this entity - genomic_reads_fraction_bases_quality_above_30_mean : float - The average fraction of bases in the genomic read that receive quality scores greater than - 30 across the reads of this entity (included for 10x cell ranger count comparison) - genomic_reads_fraction_bases_quality_above_30_variance : float - The variance in the fraction of bases in the genomic read that receive quality scores - greater than 30 across the reads of this entity (included for 10x cell ranger count - comparison) - genomic_read_quality_mean : float - Average quality of Illumina base calls in the genomic reads corresponding to this entity - genomic_read_quality_variance : float - Variance in quality of Illumina base calls in the genomic reads corresponding to this - entity - n_molecules : float - Number of molecules corresponding to this entity. See README.md for the definition of a - Molecule - n_fragments : float - Number of fragments corresponding to this entity. See README.md for the definition of a - Fragment - reads_per_molecule : float - The average number of reads associated with each molecule in this entity - reads_per_fragment : float - The average number of reads associated with each fragment in this entity - fragments_per_molecule : float - The average number of fragments associated with each molecule in this entity - fragments_with_single_read_evidence : int - The number of fragments associated with this entity that are observed by only one read - molecules_with_single_read_evidence : int - The number of molecules associated with this entity that are observed by only one read - - Methods - ------- - parse_extra_fields(tags, record), NotImplemented - Abstract method that must be implemented by subclasses. Called by ``parse_molecule()`` - to gather information for subclass-specific metrics - parse_molecule(tags, record) - Extract information from a set of sequencing reads that correspond to a molecule and store - the data in the MetricAggregator class. 
- finalize() - Some metrics cannot be calculated until all the information for an entity has been - aggregated, for example, the number of `fragments_per_molecule`. Finalize calculates all - such higher-order metrics - - """ - - def __init__(self): - - # type definitions - Chromosome: int - Strand: bool # reverse = True, see pysam.AlignedSegment.is_reverse - Position: int - Fragment: Tuple[Chromosome, Position, Strand] # noqa: F821 - - # count information - self.n_reads: int = 0 - self.noise_reads: int = 0 # long polymers, N-sequences; NotImplemented - self._fragment_histogram: Counter[Fragment] = Counter() # noqa: F821 - self._molecule_histogram: Counter[str] = Counter() - - # molecule information - self._molecule_barcode_fraction_bases_above_30 = ( - OnlineGaussianSufficientStatistic() - ) - self.perfect_molecule_barcodes = 0 - - self._genomic_reads_fraction_bases_quality_above_30 = ( - OnlineGaussianSufficientStatistic() - ) - self._genomic_read_quality = OnlineGaussianSufficientStatistic() - - # alignment location information - self.reads_mapped_exonic = 0 - self.reads_mapped_intronic = 0 - self.reads_mapped_utr = 0 - - # todo implement this once we have a gene model - # self.reads_mapped_outside_window = 0 # reads should be within 1000 bases of UTR - # self._read_distance_from_termination_site = OnlineGaussianSufficientStatistic() - - # alignment uniqueness information - self.reads_mapped_uniquely = 0 - self.reads_mapped_multiple = 0 - self.duplicate_reads = 0 - - # alignment splicing information - self.spliced_reads = 0 - self.antisense_reads = 0 - self._plus_strand_reads = 0 # strand balance # todo implement property here - - # higher-order methods, filled in by finalize() when all data is extracted - self.molecule_barcode_fraction_bases_above_30_mean: float = None - self.molecule_barcode_fraction_bases_above_30_variance: float = None - self.genomic_reads_fraction_bases_quality_above_30_mean: float = None - 
self.genomic_reads_fraction_bases_quality_above_30_variance: float = None - self.genomic_read_quality_mean: float = None - self.genomic_read_quality_variance: float = None - self.n_molecules: float = None - self.n_fragments: float = None - self.reads_per_molecule: float = None - self.reads_per_fragment: float = None - self.fragments_per_molecule: float = None - self.fragments_with_single_read_evidence: int = None - self.molecules_with_single_read_evidence: int = None - - @staticmethod - def _quality_string_to_numeric(quality_sequence: Iterable[str]) -> List[int]: - """Convert an HTSlib ASCII quality string to an integer representation. - - Parameters - ---------- - quality_sequence : Iterable[str] - An iterable of Illumina base call qualities in ASCII encoding - - Returns - ------- - numeric_qualities : List[int] - A list of Illumina base call qualities converted to integers - - """ - return [ - ord(c) - 33 for c in quality_sequence - ] # todo look up if this is accurate - - @staticmethod - def _quality_above_threshold( - threshold: int, quality_sequence: Sequence[int] - ) -> float: - """Calculate the fraction of bases called with a quality above ``threshold``. - - Parameters - ---------- - threshold: int - The quality threshold - quality_sequence: Sequence[int] - A sequence of Illumina base qualities - - Returns - ------- - fraction : float - The fraction of bases in ``quality_sequence`` with quality greater than ``threshold`` - - """ - return sum(1 for base in quality_sequence if base > threshold) / len( - quality_sequence - ) - - def _is_noise(self, record: pysam.AlignedSegment) -> bool: - return NotImplemented # todo required because 10x measures this - - def parse_molecule( - self, tags: Sequence[str], records: Iterable[pysam.AlignedSegment] - ) -> None: - """Parse information from all records of a molecule. - - The parsed information is stored in the MetricAggregator in-place. 
- - Parameters - ---------- - tags : Sequence[str] - all the tags that define this molecule. one of {[CB, GE, UB], [GE, CB, UB]} - records : Iterable[pysam.AlignedSegment] - the sam records associated with the molecule - - """ - for record in records: - - # todo think about how I could use the duplicate tag to reduce computation; duplicates - # should normally come in order in a sorted file - - # extract sub-class-specific information - self.parse_extra_fields(tags=tags, record=record) - - self.n_reads += 1 - # self.noise_reads += self.is_noise(record) # todo implement me - - # the tags passed to this function define a molecule, this increments the counter, - # identifying a new molecule only if a new tag combination is observed - self._molecule_histogram[tags] += 1 - - self._molecule_barcode_fraction_bases_above_30.update( - self._quality_above_threshold( - 30, - self._quality_string_to_numeric( - record.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY) - ), - ) - ) - - # we should be tolerant and handle it if the pysam.AlignedSegment.get_tag - # cannot retrieve the data by a tag since it's not a fatal error - try: - self.perfect_molecule_barcodes += record.get_tag( - consts.RAW_MOLECULE_BARCODE_TAG_KEY - ) == record.get_tag(consts.MOLECULE_BARCODE_TAG_KEY) - except KeyError: - # An error occurred while retrieving the data from the optional alighment section, which - # indicates that the read did not have a corrected UMI sequence. In the future we would like to - # keep track of these reads. 
- pass - - self._genomic_reads_fraction_bases_quality_above_30.update( - self._quality_above_threshold(30, record.query_alignment_qualities) - ) - - mean_alignment_quality: float = np.mean(record.query_alignment_qualities) - self._genomic_read_quality.update(mean_alignment_quality) - - # the remaining portions deal with aligned reads, so if the read is not mapped, we are - # done with it - if record.is_unmapped: - continue - - # get components that define a unique sequence fragment and increment the histogram - position: int = record.pos - strand: bool = record.is_reverse - reference: int = record.reference_id - self._fragment_histogram[reference, position, strand, tags] += 1 - - alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY) - if alignment_location == consts.CODING_ALIGNMENT_LOCATION_TAG_VALUE: - self.reads_mapped_exonic += 1 - elif alignment_location == consts.INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE: - self.reads_mapped_intronic += 1 - elif alignment_location == consts.UTR_ALIGNMENT_LOCATION_TAG_VALUE: - self.reads_mapped_utr += 1 - - # todo check if read maps outside window (needs gene model) - # todo create distances from terminate side (needs gene model) - - # uniqueness - number_mappings = record.get_tag(consts.NUMBER_OF_HITS_TAG_KEY) - if number_mappings == 1: - self.reads_mapped_uniquely += 1 - else: - self.reads_mapped_multiple += ( - 1 # todo without multi-mapping, this number is zero! 
- ) - - if record.is_duplicate: - self.duplicate_reads += 1 - - # cigar N field (3) indicates a read is spliced if the value is non-zero - cigar_stats, num_blocks = record.get_cigar_stats() - if cigar_stats[3]: - self.spliced_reads += 1 - - # todo figure out antisense and make this notation clearer; info likely in dropseqtools - self._plus_strand_reads += not record.is_reverse - - def parse_extra_fields( - self, tags: Sequence[str], record: pysam.AlignedSegment - ) -> None: - """Defined by subclasses to extract class-specific information from molecules""" - raise NotImplementedError - - def finalize(self) -> None: - """Calculate metrics that require information from all molecules of an entity - - ``finalize()`` replaces attributes in-place that were initialized by the constructor as - ``None`` with a value calculated across all molecule data that has been aggregated. - - """ - - self.molecule_barcode_fraction_bases_above_30_mean: float = self._molecule_barcode_fraction_bases_above_30.mean - - self.molecule_barcode_fraction_bases_above_30_variance: float = self._molecule_barcode_fraction_bases_above_30.calculate_variance() - - self.genomic_reads_fraction_bases_quality_above_30_mean: float = self._genomic_reads_fraction_bases_quality_above_30.mean - - self.genomic_reads_fraction_bases_quality_above_30_variance: float = self._genomic_reads_fraction_bases_quality_above_30.calculate_variance() - - self.genomic_read_quality_mean: float = self._genomic_read_quality.mean - - self.genomic_read_quality_variance: float = self._genomic_read_quality.calculate_variance() - - self.n_molecules: int = len(self._molecule_histogram.keys()) - - self.n_fragments: int = len(self._fragment_histogram.keys()) - - try: - self.reads_per_molecule: float = self.n_reads / self.n_molecules - except ZeroDivisionError: - self.reads_per_molecule: float = float("nan") - - try: - self.reads_per_fragment: float = self.n_reads / self.n_fragments - except ZeroDivisionError: - self.reads_per_fragment: 
float = float("nan") - - try: - self.fragments_per_molecule: float = self.n_fragments / self.n_molecules - except ZeroDivisionError: - self.fragments_per_molecule: float = float("nan") - - self.fragments_with_single_read_evidence: int = sum( - 1 for v in self._fragment_histogram.values() if v == 1 - ) - - self.molecules_with_single_read_evidence: int = sum( - 1 for v in self._molecule_histogram.values() if v == 1 - ) - - -class CellMetrics(MetricAggregator): - """Cell Metric Aggregator - - Aggregator that captures metric information about a cell by parsing all of the molecules in - an experiment that were annotated with a specific cell barcode, as recorded in the ``CB`` tag. - - Attributes - ---------- - perfect_cell_barcodes : int - The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``) - reads_mapped_intergenic : int - The number of reads mapped to an intergenic region for this cell - reads_mapped_too_many_loci : int - The number of reads that were mapped to too many loci across the genome and as a - consequence, are reported unmapped by the aligner - cell_barcode_fraction_bases_above_30_variance : float - The variance of the fraction of Illumina base calls for the cell barcode sequence that - are greater than 30, across molecules - cell_barcode_fraction_bases_above_30_mean : float - The average fraction of Illumina base calls for the cell barcode sequence that - are greater than 30, across molecules - n_genes : int - The number of genes detected by this cell - genes_detected_multiple_observations : int - The number of genes that are observed by more than one read in this cell - n_mitochondrial_genes: int - The number of mitochondrial genes detected by this cell - n_mitochondrial_molecules: int - The number of molecules from mitochondrial genes detected for this cell - pct_mitochondrial_molecules: int - The percentage of molecules from mitochondrial genes detected for this cell - - """ - - extra_docs = """ - Examples - -------- - # todo 
implement me - - See Also - -------- - GeneMetrics - - """ - - __doc__ += MetricAggregator.__doc__ + extra_docs - - def __init__(self): - super().__init__() - - # barcode quality data - self._cell_barcode_fraction_bases_above_30 = OnlineGaussianSufficientStatistic() - self.perfect_cell_barcodes = 0 # inv: fraction cells with errors - - # track non-transcriptomic reads - self.reads_mapped_intergenic = 0 - self.reads_unmapped = 0 - self.reads_mapped_too_many_loci = 0 - - self._genes_histogram = Counter() - - # todo think about whether we can build molecule models that map to things that aren't genes - # i.e. to integentic regions or intronic regions. This could be a part of multi-mapping - # self.molecules_mapped_intergenic = 0 - - self.cell_barcode_fraction_bases_above_30_variance: float = None - self.cell_barcode_fraction_bases_above_30_mean: float = None - self.n_genes: int = None - self.genes_detected_multiple_observations: int = None - self.n_mitochondrial_genes: int = None - self.n_mitochondrial_molecules: int = None - self.pct_mitochondrial_molecules: float = None - - def finalize(self, mitochondrial_genes=set()): - super().finalize() - - self.cell_barcode_fraction_bases_above_30_mean: float = self._cell_barcode_fraction_bases_above_30.mean - - self.cell_barcode_fraction_bases_above_30_variance: float = self._cell_barcode_fraction_bases_above_30.calculate_variance() - - self.n_genes: int = len(self._genes_histogram.keys()) - - self.genes_detected_multiple_observations: int = sum( - 1 for v in self._genes_histogram.values() if v > 1 - ) - - self.n_mitochondrial_genes: int = sum( - 1 for g in self._genes_histogram.keys() if g in mitochondrial_genes - ) - - self.n_mitochondrial_molecules: int = sum( - c for g, c in self._genes_histogram.items() if g in mitochondrial_genes - ) - - if self.n_mitochondrial_molecules: - tot_molecules = sum(self._genes_histogram.values()) - self.pct_mitochondrial_molecules = ( - self.n_mitochondrial_molecules / tot_molecules * 100.0 - 
) - else: - self.pct_mitochondrial_molecules = 0.00 - - def parse_extra_fields( - self, tags: Sequence[str], record: pysam.AlignedSegment - ) -> None: - """Parses a record to extract gene-specific information - - Gene-specific metric data is stored in-place in the MetricAggregator - - Parameters - ---------- - tags : Sequence[str] - The GE, UB and CB tags that define this molecule - record : pysam.AlignedSegment - SAM record to be parsed - - """ - self._cell_barcode_fraction_bases_above_30.update( - self._quality_above_threshold( - 30, - self._quality_string_to_numeric( - record.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY) - ), - ) - ) - - # Exclude reads that do not have a CB tag from the perfect_cell_barcodes count - if record.has_tag(consts.CELL_BARCODE_TAG_KEY): - raw_cell_barcode_tag = record.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY) - cell_barcode_tag = record.get_tag(consts.CELL_BARCODE_TAG_KEY) - self.perfect_cell_barcodes += raw_cell_barcode_tag == cell_barcode_tag - - try: - alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY) - if alignment_location == consts.INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE: - self.reads_mapped_intergenic += 1 - except KeyError: - self.reads_unmapped += 1 - - # todo track reads_mapped_too_many_loci after multi-alignment is done - self._genes_histogram[tags[2]] += 1 # note that no gene == None - - -class GeneMetrics(MetricAggregator): - """Gene Metric Aggregator - - Aggregator that captures metric information about a gene by parsing all of the molecules in - an experiment that were annotated with a specific gene ID, as recorded in the ``GE`` tag. 
- - Attributes - ---------- - number_cells_detected_multiple : int - The number of cells which observe more than one read of this gene - number_cells_expressing : int - The number of cells that detect this gene - - """ - - extra_docs = """ - Examples - -------- - # todo implement me - - See Also - -------- - CellMetrics - - """ - - __doc__ += MetricAggregator.__doc__ + extra_docs - - def __init__(self): - super().__init__() - - self._cells_histogram = Counter() - # todo we don't tag exon right now. Not sure if we want to or not - # self._exon_histogram = Counter() - - self.number_cells_detected_multiple: int = None - self.number_cells_expressing: int = None - - def finalize(self): - super().finalize() - - self.number_cells_expressing: int = len(self._cells_histogram.keys()) - - self.number_cells_detected_multiple: int = sum( - 1 for c in self._cells_histogram.values() if c > 1 - ) - - def parse_extra_fields( - self, tags: Sequence[str], record: pysam.AlignedSegment - ) -> None: - """Parses a record to extract cell-specific information - - Cell-specific metric data is stored in-place in the MetricAggregator - - Parameters - ---------- - tags : Sequence[str] - The CB, UB and GE tags that define this molecule - record : pysam.AlignedSegment - SAM record to be parsed - - """ - self._cells_histogram[tags[1]] += 1 diff --git a/tools/scripts/sctools/build/lib/sctools/metrics/gatherer.py b/tools/scripts/sctools/build/lib/sctools/metrics/gatherer.py deleted file mode 100644 index 91f7287f..00000000 --- a/tools/scripts/sctools/build/lib/sctools/metrics/gatherer.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -Sequence Metric Gatherers -========================= - -..currentmodule:: sctools.metrics - -This module defines classes to gather metrics across the cells or genes of an experiment and write -them to gzip-compressed csv files - -Classes -------- - -.. 
autosummary:: - :toctree: generated/ - - MetricGatherer Gatherer Base Class - GatherCellMetrics Class to gather metrics on all cells in an experiment - GatherGeneMetrics Class to gather metrics on all genes in an experiment - -See Also --------- -sctools.metrics.aggregator -sctools.metrics.merge -sctools.metrics.writer - -""" - -from contextlib import closing - -import pysam -from typing import Set - -from sctools.bam import iter_cell_barcodes, iter_genes, iter_molecule_barcodes -from sctools.metrics.aggregator import CellMetrics, GeneMetrics -from sctools.metrics.writer import MetricCSVWriter - - -class MetricGatherer: - """Gathers Metrics from an experiment - - Because molecules tend to have relatively small numbers of reads, the memory footprint of - this method is typically small (tens of megabytes). - - Parameters - ---------- - bam_file : str - the bam file containing the reads that metrics should be calculated from. Can be a chunk - of cells or an entire experiment - output_stem : str - the file stem for the gzipped csv output - - Methods - ------- - extract_metrics - extracts metrics from ``bam_file`` and writes them to output_stem.csv.gz - - """ - - def __init__( - self, - bam_file: str, - output_stem: str, - mitochondrial_gene_ids: Set[str] = set(), - compress: bool = True, - ): - self._bam_file = bam_file - self._output_stem = output_stem - self._compress = compress - self._mitochondrial_gene_ids = mitochondrial_gene_ids - - @property - def bam_file(self) -> str: - """the bam file that metrics are generated from""" - return self._bam_file - - def extract_metrics(self, mode="rb") -> None: - """extract metrics from the provided bam file and write the results to csv. - - Parameters - ---------- - mode : {'r', 'rb'}, default 'rb' - the open mode for pysam.AlignmentFile. 'r' indicates the input is a sam file, and 'rb' - indicates a bam file. 
- - """ - raise NotImplementedError - - -class GatherCellMetrics(MetricGatherer): - - extra_docs = """ - Notes - ----- - ``bam_file`` must be sorted by gene (``GE``), molecule (``UB``), and cell (``CB``), where gene - varies fastest. - - Examples - -------- - >>> from sctools.metrics.gatherer import GatherCellMetrics - >>> import os, tempfile - - >>> # example data - >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam' - >>> temp_dir = tempfile.mkdtemp() - >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True) - >>> g.extract_metrics() - - See Also - -------- - GatherGeneMetrics - - """ - - __doc__ += extra_docs - - def extract_metrics(self, mode: str = "rb") -> None: - """Extract cell metrics from self.bam_file - - Parameters - ---------- - mode : str, optional - Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb'). - - """ - # open the files - with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing( - MetricCSVWriter(self._output_stem, self._compress) - ) as cell_metrics_output: - - # write the header - cell_metrics_output.write_header(vars(CellMetrics())) - - # break up the bam file into sub-iterators over cell barcodes - for cell_iterator, cell_tag in iter_cell_barcodes( - bam_iterator=bam_iterator - ): - metric_aggregator = CellMetrics() - - # break up cell barcodes by molecule barcodes - for molecule_iterator, molecule_tag in iter_molecule_barcodes( - bam_iterator=cell_iterator - ): - - # break up molecule barcodes by gene ids - for gene_iterator, gene_tag in iter_genes( - bam_iterator=molecule_iterator - ): - - # process the data - metric_aggregator.parse_molecule( - tags=(cell_tag, molecule_tag, gene_tag), - records=gene_iterator, - ) - - # write a record for each cell - metric_aggregator.finalize( - mitochondrial_genes=self._mitochondrial_gene_ids - ) - cell_metrics_output.write(cell_tag, vars(metric_aggregator)) - - -class GatherGeneMetrics(MetricGatherer): - - extra_docs = 
""" - Notes - ----- - ``bam_file`` must be sorted by molecule (``UB``), cell (``CB``), and gene (``GE``), where - molecule varies fastest. - - Examples - -------- - >>> from sctools.metrics.gatherer import GatherCellMetrics - >>> import os, tempfile - - >>> # example data - >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam' - >>> temp_dir = tempfile.mkdtemp() - >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True) - >>> g.extract_metrics() - - See Also - -------- - GatherGeneMetrics - - """ - - __doc__ += extra_docs - - def extract_metrics(self, mode: str = "rb") -> None: - """Extract gene metrics from self.bam_file - - Parameters - ---------- - mode : str, optional - Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb'). - - """ - # open the files - with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing( - MetricCSVWriter(self._output_stem, self._compress) - ) as gene_metrics_output: - - # write the header - gene_metrics_output.write_header(vars(GeneMetrics())) - - # break up the bam file into sub-iterators over gene ids - for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator): - metric_aggregator = GeneMetrics() - - # in case of multi-genes ignore as in the counting stage - if gene_tag and len(gene_tag.split(",")) > 1: - continue - - # break up gene ids by cell barcodes - for cell_iterator, cell_tag in iter_cell_barcodes( - bam_iterator=gene_iterator - ): - - # break up cell barcodes by molecular barcodes - for molecule_iterator, molecule_tag in iter_molecule_barcodes( - bam_iterator=cell_iterator - ): - - # process the data - metric_aggregator.parse_molecule( - tags=(gene_tag, cell_tag, molecule_tag), - records=molecule_iterator, - ) - - # write a record for each gene id - metric_aggregator.finalize() - gene_metrics_output.write(gene_tag, vars(metric_aggregator)) diff --git a/tools/scripts/sctools/build/lib/sctools/metrics/merge.py 
b/tools/scripts/sctools/build/lib/sctools/metrics/merge.py deleted file mode 100644 index aa4d4831..00000000 --- a/tools/scripts/sctools/build/lib/sctools/metrics/merge.py +++ /dev/null @@ -1,191 +0,0 @@ -""" -Merge Sequence Metrics -====================== - -..currentmodule:: sctools.metrics - -This module defines classes to merge multiple metrics files that have been gathered from bam files -containing disjoint sets of cells. This is a common use pattern, as sequencing datasets are often -chunked to enable horizontal scaling using scatter-gather patterns. - -Classes -------- -MergeMetrics Merge Metrics base class -MergeCellMetrics Class to merge cell metrics -MergeGeneMetrics Class to merge gene metrics - -See Also --------- -sctools.metrics.gatherer -sctools.metrics.aggregator -sctools.metrics.writer - -""" - -from typing import List, Sequence - -import pandas as pd -import numpy as np - - -class MergeMetrics: - """Merges multiple metrics files into a single gzip compressed csv file - - Parameters - ---------- - metric_files : Sequence[str] - metrics files to merge - output_file : str - file name for the merged output - - Methods - ------- - execute - merge metrics files - # todo this should probably be wrapped into __init__ to make this more like a function - - """ - - def __init__(self, metric_files: Sequence[str], output_file: str): - self._metric_files = metric_files - if not output_file.endswith(".csv.gz"): - output_file += ".csv.gz" - self._output_file = output_file - - def execute(self) -> None: - raise NotImplementedError # merge the metrics - - -class MergeCellMetrics(MergeMetrics): - def execute(self) -> None: - """Concatenate input cell metric files - - Since bam files that metrics are calculated from contain disjoint sets of cells, cell - metrics can simply be concatenated together. 
- - """ - metric_dataframes: List[pd.DataFrame] = [ - pd.read_csv(f, index_col=0) for f in self._metric_files - ] - concatenated_frame: pd.DataFrame = pd.concat(metric_dataframes, axis=0) - concatenated_frame.to_csv(self._output_file, compression="gzip") - - -class MergeGeneMetrics(MergeMetrics): - def execute(self) -> None: - """Merge input gene metric files - - The bam files that metrics are calculated from contain disjoint sets of cells, each - of which can measure the same genes. - As a result, the metric values must be summed (count based metrics) averaged over - (fractional, averge, or variance metrics) or recalculated (metrics that depend on other - metrics). - - """ - - count_data_to_sum = [ - "n_reads", - "noise_reads", - "perfect_molecule_barcodes", - "reads_mapped_exonic", - "reads_mapped_intronic", - "reads_mapped_utr", - "reads_mapped_uniquely", - "reads_mapped_multiple", - "duplicate_reads", - "spliced_reads", - "antisense_reads", - "n_molecules", - "n_fragments", - "fragments_with_single_read_evidence", - "molecules_with_single_read_evidence", - "number_cells_detected_multiple", - "number_cells_expressing", - ] - - sum_operations = {c: "sum" for c in count_data_to_sum} - - def weighted_average(data_frame: pd.DataFrame) -> pd.Series: - """Calculate the average of each metric, weighted by number of reads per chunk - - Parameters - ---------- - data_frame : pd.DataFrame - chunks x metrics data frame - - Returns - ------- - weighted_average_metrics : pd.Series - The average of each metric across chunks, weighted by the number of reads per chunk - - """ - weights = data_frame["n_reads"].values - - columns_to_average_by_read = [ - "molecule_barcode_fraction_bases_above_30_mean", - "molecule_barcode_fraction_bases_above_30_variance", - "genomic_reads_fraction_bases_quality_above_30_mean", - "genomic_reads_fraction_bases_quality_above_30_variance", - "genomic_read_quality_mean", - "genomic_read_quality_variance", - ] - - return pd.Series( - { - c: 
np.average(data_frame[c], weights=weights) - for c in columns_to_average_by_read - } - ) - - def recalculate_operation(data_frame) -> pd.DataFrame: - """Recalculate metrics that are dependent on other metric values - - Other metrics should be merged before this function is executed - - Parameters - ---------- - data_frame : pd.DataFrame - chunks x metrics data frame - - Returns - ------- - recalculated_metrics : pd.DataFrame - data frame containing recalculated metrics - - """ - return pd.DataFrame( - data={ - "reads_per_molecule": data_frame["n_reads"] - / data_frame["n_molecules"], - "fragments_per_molecule": data_frame["n_fragments"] - / data_frame["n_molecules"], - "reads_per_fragment": data_frame["n_reads"] - / data_frame["n_fragments"], - } - ) - - # pick one file as a nucleus and merge each subsequent dataframe into it - nucleus = pd.read_csv(self._metric_files[0], index_col=0) - for filename in self._metric_files[1:]: - leaf = pd.read_csv(filename, index_col=0) - - # concatenate this leaf with the nucleus metrics file - concatenated = pd.concat([nucleus, leaf], axis=0) - - # group all duplicate gene names together - grouped = concatenated.groupby(level=0, axis=0) - - # execute the merging operations - summed_columns = grouped.agg(sum_operations) - averaged_columns = grouped.apply(weighted_average) - - # stitch the columns back together, add the metrics that need to be recalculated - merged = pd.concat([summed_columns, averaged_columns], axis=1) - recalculated_columns = recalculate_operation(merged) - merged = pd.concat([merged, recalculated_columns], axis=1) - - # set as nucleus and continue - nucleus = merged - - # write the data - nucleus.to_csv(self._output_file, compression="gzip") diff --git a/tools/scripts/sctools/build/lib/sctools/metrics/writer.py b/tools/scripts/sctools/build/lib/sctools/metrics/writer.py deleted file mode 100644 index 2379418c..00000000 --- a/tools/scripts/sctools/build/lib/sctools/metrics/writer.py +++ /dev/null @@ -1,107 +0,0 @@ 
-""" -Metric Writers -============== - -..currentmodule:: sctools.metrics - -This module defines a class to write metrics to csv as the data is generated, cell by cell or gene -by gene. This strategy keeps memory usage low, as no more than a single molecule's worth of sam -records and one cell or gene's worth of metric data are in-memory at a time. - -Classes -------- -MetricCSVWriter Class to write metrics to file - -See Also --------- -sctools.metrics.gatherer -sctools.metrics.aggregator -sctools.metrics.merge - -""" -from typing import TextIO, List, Mapping, Any -from numbers import Number -import gzip - - -class MetricCSVWriter: - """Writes metric information iteratively to (optionally compressed) csv. - - Parameters - ---------- - output_stem : str - File stem for the output file. - compress : bool, optional - Whether or not to compress the output file (default = True). - - Methods - ------- - write_header - Write the metric header to file. - write - Write an array of cell or gene metrics to file. - close - Close the metric file. - - """ - - def __init__(self, output_stem: str, compress=True): - - # check and fix extension: - if compress: - if not output_stem.endswith(".csv.gz"): - output_stem += ".csv.gz" - else: - if not output_stem.endswith(".csv"): - output_stem += ".csv" - self._filename: str = output_stem - - # open the file - if compress: - self._open_fid: TextIO = gzip.open(self._filename, "wt") - else: - self._open_fid: TextIO = open(self._filename, "w") - self._header: List[str] = None - - @property - def filename(self) -> str: - """filename with correct suffix added""" - return self._filename - - def write_header(self, record: Mapping[str, Any]) -> None: - """Write the metric keys to file, producing the header line of the csv file. - - Parameters - ---------- - record : Mapping[str, Any] - Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance, - producing a dictionary of keys to metric values. 
- - """ - self._header = list(key for key in record.keys() if not key.startswith("_")) - self._open_fid.write("," + ",".join(self._header) + "\n") - - def write(self, index: str, record: Mapping[str, Number]) -> None: - """Write the array of metric values for a cell or gene to file. - - Parameters - ---------- - index : str - The name of the cell or gene that these metrics summarize - record : Mapping[str, Number] - Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance, - producing a dictionary of keys to metric values. - - """ - ordered_fields = [str(record[k]) for k in self._header] - - # genes and cells can be None, call repr to convert to string when this induces a TypeError - try: - self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n") - except TypeError: - index = repr(index) - self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n") - - def close(self) -> None: - """Close the metrics file.""" - self._open_fid.close() diff --git a/tools/scripts/sctools/build/lib/sctools/platform.py b/tools/scripts/sctools/build/lib/sctools/platform.py deleted file mode 100644 index 460f26ac..00000000 --- a/tools/scripts/sctools/build/lib/sctools/platform.py +++ /dev/null @@ -1,1126 +0,0 @@ -""" -Command Line Interface for SC Tools: -==================================== - -.. currentmodule:: sctools - -This module defines the command line interface for SC Tools. Tools are separated into those that -are specific to particular chemistries (e.g. Smart-seq 2) or experimental platforms (e.g. 10x -Genomics v2) and those that are general across any sequencing experiment. 
- -Currently, only general modules and those used for 10x v2 are implemented - -Classes -------- -GenericPlatform Class containing all general command line utilities -TenXV2 Class containing 10x v2 specific command line utilities - -""" - -import argparse -from typing import Iterable, List, Dict, Set, Optional, Sequence -from itertools import chain - -import pysam -from sctools import fastq, bam, metrics, count, consts, gtf, groups - - -class GenericPlatform: - """Platform-agnostic command line functions available in SC Tools. - - Platform-Agnostic Methods - ------------------------- - tag_sort_bam(): - sort a bam file by zero or more tags and then by queryname - verify_bam_sort(): - verifies whether bam file is correctly sorted by given list of zero or more tags, then queryname - split_bam() - split a bam file into subfiles of equal size - calculate_gene_metrics() - calculate information about genes captured by a sequencing experiment - calculate_cell_metrics() - calculate information about cells captured by a sequencing experiment - merge_gene_metrics() - merge multiple gene metrics files into a single output - merge_cell_metrics() - merge multiple cell metrics files into a single output - bam_to_count() - construct a compressed sparse row count file from a tagged, aligned bam file - merge_count_matrices() - merge multiple csr-format count matrices into a single csr matrix - group_qc_outputs() - aggregate Picard, HISAT2 and RSME QC statisitics - """ - - @classmethod - def tag_sort_bam(cls, args: Iterable = None) -> int: - """Command line entrypoint for sorting a bam file by zero or more tags, followed by queryname. - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). 
The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - description = "Sorts bam by list of zero or more tags, followed by query name" - parser = argparse.ArgumentParser(description=description) - parser.add_argument("-i", "--input_bam", required=True, help="input bamfile") - parser.add_argument("-o", "--output_bam", required=True, help="output bamfile") - parser.add_argument( - "-t", - "--tags", - nargs="+", - action="append", - help="tag(s) to sort by, separated by space, e.g. -t CB GE UB", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - tags = cls.get_tags(args.tags) - with pysam.AlignmentFile(args.input_bam, "rb") as f: - header = f.header - records = f.fetch(until_eof=True) - sorted_records = bam.sort_by_tags_and_queryname(records, tags) - with pysam.AlignmentFile(args.output_bam, "wb", header=header) as f: - for record in sorted_records: - f.write(record) - - return 0 - - @classmethod - def verify_bam_sort(cls, args: Iterable = None) -> int: - """Command line entrypoint for verifying bam is properly sorted by zero or more tags, followed by queryname. - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - description = "Verifies whether bam is sorted by the list of zero or more tags, followed by query name" - parser = argparse.ArgumentParser(description=description) - parser.add_argument("-i", "--input_bam", required=True, help="input bamfile") - parser.add_argument( - "-t", - "--tags", - nargs="+", - action="append", - help="tag(s) to use to verify sorting, separated by space, e.g. 
-t CB GE UB", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - tags = cls.get_tags(args.tags) - with pysam.AlignmentFile(args.input_bam, "rb") as f: - aligned_segments = f.fetch(until_eof=True) - sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(r, tags) - for r in aligned_segments - ) - bam.verify_sort(sortable_records, tags) - - print( - "{0} is correctly sorted by {1} and query name".format(args.input_bam, tags) - ) - return 0 - - @classmethod - def get_tags(cls, raw_tags: Optional[Sequence[str]]) -> Iterable[str]: - if raw_tags is None: - raw_tags = [] - # Flattens into single list when tags specified like -t A -t B -t C - return [t for t in chain.from_iterable(raw_tags)] - - @classmethod - def split_bam(cls, args: Iterable = None) -> int: - """Command line entrypoint for splitting a bamfile into subfiles of equal size. - - prints filenames of chunks to stdout - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-b", "--bamfile", nargs="+", required=True, help="input bamfile" - ) - parser.add_argument( - "-p", "--output-prefix", required=True, help="prefix for output chunks" - ) - parser.add_argument( - "-s", - "--subfile-size", - required=False, - default=1000, - type=float, - help="approximate size target for each subfile (in MB)", - ) - parser.add_argument( - "--num-processes", - required=False, - default=None, - type=int, - help="Number of processes to parallelize over", - ) - parser.add_argument( - "-t", - "--tags", - nargs="+", - help="tag(s) to split bamfile over. 
Tags are checked sequentially, " - "and tags after the first are only checked if the first tag is " - "not present.", - ) - parser.set_defaults(raise_missing=True) - parser.add_argument( - "--drop-missing", - action="store_false", - help="drop records without tag specified by -t/--tag (default " - "behavior is to raise an exception", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - filenames = bam.split( - args.bamfile, - args.output_prefix, - args.tags, - approx_mb_per_split=args.subfile_size, - raise_missing=args.drop_missing, - num_processes=args.num_processes, - ) - - print(" ".join(filenames)) - return 0 - - @classmethod - def calculate_gene_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for calculating gene metrics from a sorted bamfile. - - Writes metrics to .csv - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input-bam", required=True, help="Input bam file name." - ) - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - gene_metric_gatherer = metrics.gatherer.GatherGeneMetrics( - args.input_bam, args.output_filestem - ) - gene_metric_gatherer.extract_metrics() - return 0 - - @classmethod - def calculate_cell_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for calculating cell metrics from a sorted bamfile. 
- - Writes metrics to .csv - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input-bam", required=True, help="Input bam file name." - ) - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - parser.add_argument( - "-a", - "--gtf-annotation-file", - required=False, - default=None, - help="gtf annotation file that bam_file was aligned against", - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - # load mitochondrial gene ids from the annotation file - mitochondrial_gene_ids: Set(str) = set() - if args.gtf_annotation_file: - mitochondrial_gene_ids = gtf.get_mitochondrial_gene_names( - args.gtf_annotation_file - ) - - cell_metric_gatherer = metrics.gatherer.GatherCellMetrics( - args.input_bam, args.output_filestem, mitochondrial_gene_ids - ) - cell_metric_gatherer.extract_metrics() - return 0 - - @classmethod - def merge_gene_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for merging multiple gene metrics files. - - Merges multiple metrics inputs into a single metrics file that matches the shape and - order of the generated count matrix. - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). 
The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument("metric_files", nargs="+", help="Input metric files") - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - merge = metrics.merge.MergeGeneMetrics(args.metric_files, args.output_filestem) - merge.execute() - return 0 - - @classmethod - def merge_cell_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for merging multiple cell metrics files. - - Merges multiple metrics inputs into a single metrics file that matches the shape and - order of the generated count matrix. - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument("metric_files", nargs="+", help="Input metric files") - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - merge = metrics.merge.MergeCellMetrics(args.metric_files, args.output_filestem) - merge.execute() - return 0 - - @classmethod - def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for constructing a count matrix from a tagged bam file. - - Constructs a count matrix from an aligned bam file sorted by cell barcode, molecule - barcode, and gene id. 
- - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.set_defaults( - cell_barcode_tag=consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag=consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag=consts.GENE_NAME_TAG_KEY, - sn_rna_seq_mode=False, - ) - parser.add_argument("-b", "--bam-file", help="input_bam_file", required=True) - parser.add_argument( - "-o", "--output-prefix", help="file stem for count matrix", required=True - ) - parser.add_argument( - "-a", - "--gtf-annotation-file", - required=True, - help="gtf annotation file that bam_file was aligned against", - ) - parser.add_argument( - "-c", - "--cell-barcode-tag", - help=f"tag that identifies the cell barcode (default = {consts.CELL_BARCODE_TAG_KEY})", - ) - parser.add_argument( - "-m", - "--molecule-barcode-tag", - help=f"tag that identifies the molecule barcode (default = {consts.MOLECULE_BARCODE_TAG_KEY})", - ) - parser.add_argument( - "-g", - "--gene-id-tag", - help=f"tag that identifies the gene name (default = {consts.GENE_NAME_TAG_KEY})", - ) - - parser.add_argument( - "-n", - "--sn-rna-seq-mode", - action="store_true", - help=f"snRNA Seq mode (default = False)", - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - # assume bam file unless the file explicitly has a sam suffix - open_mode = "r" if args.bam_file.endswith(".sam") else "rb" - - # load gene names from the annotation file - gene_name_to_index: Dict[str, int] = gtf.extract_gene_names( - args.gtf_annotation_file - ) - - # For snRNA-seq we need the extended gene information - if args.sn_rna_seq_mode: - gene_locations = gtf.extract_extended_gene_names(args.gtf_annotation_file) 
- else: - gene_locations = None - - matrix = count.CountMatrix.from_sorted_tagged_bam( - bam_file=args.bam_file, - gene_name_to_index=gene_name_to_index, - chromosomes_gene_locations_extended=gene_locations, - cell_barcode_tag=args.cell_barcode_tag, - molecule_barcode_tag=args.molecule_barcode_tag, - gene_name_tag=args.gene_id_tag, - open_mode=open_mode, - ) - matrix.save(args.output_prefix) - - return 0 - - @classmethod - def merge_count_matrices(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for constructing a count matrix from a tagged bam file. - - Constructs a count matrix from an aligned bam file sorted by cell barcode, molecule - barcode, and gene id. - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--input-prefixes", - nargs="+", - help="prefix for count matrices to be concatenated. e.g. test_counts " - "for test_counts.npz, test_counts_col_index.npy, and test_counts_" - "row_index.npy", - ) - parser.add_argument( - "-o", "--output-stem", help="file stem for merged csr matrix", required=True - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - count_matrix = count.CountMatrix.merge_matrices(args.input_prefixes) - count_matrix.save(args.output_stem) - - return 0 - - @classmethod - def group_qc_outputs(cls, args: Iterable[str] = None) -> int: - """Commandline entrypoint for parsing picard metrics files, hisat2 and rsem statistics log files. - Parameters - ---------- - args: - file_names: array of files - output_name: prefix of output file name. - metrics_type: Picard, PicardTable, HISAT2, RSEM and Core. 
- Returns - ---------- - return: 0 - return if the program completes successfully. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", - "--file_names", - dest="file_names", - nargs="+", - required=True, - help="a list of files to be parsed out.", - ) - parser.add_argument( - "-o", - "--output_name", - dest="output_name", - required=True, - help="The output file name", - ) - parser.add_argument( - "-t", - "--metrics_type", - dest="metrics_type", - choices=["Picard", "PicardTable", "Core", "HISAT2", "RSEM"], - required=True, - help="a list of string to represent metrics types,such Picard, PicardTable, HISAT2,RSEM, Core", - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - if args.metrics_type == "Picard": - groups.write_aggregated_picard_metrics_by_row( - args.file_names, args.output_name - ) - elif args.metrics_type == "PicardTable": - groups.write_aggregated_picard_metrics_by_table( - args.file_names, args.output_name - ) - elif args.metrics_type == "Core": - groups.write_aggregated_qc_metrics(args.file_names, args.output_name) - elif args.metrics_type == "HISAT2": - groups.parse_hisat2_log(args.file_names, args.output_name) - elif args.metrics_type == "RSEM": - groups.parse_rsem_cnt(args.file_names, args.output_name) - return 0 - - -class TenXV2(GenericPlatform): - """Command Line Interface for 10x Genomics v2 RNA-sequencing programs - - This class defines several methods that are created as CLI tools when sctools is installed - (see setup.py) - - Attributes - ---------- - cell_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the cell barcode and the tags to - assign the sequence and quality of the cell barcode - molecule_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the molecule barcode and the tags - to assign the sequence and quality of the molecule barcode - sample_barcode : fastq.EmbeddedBarcode - A 
data class that defines the start and end position of the sample barcode and the tags - to assign the sequence and quality of the sample barcode - - Methods - ------- - attach_barcodes() - Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse - (r2) bam file - - """ - - # 10x contains three barcodes embedded within sequencing reads. The below objects define the - # start and end points of those barcodes relative to the start of the sequence, and the - # GA4GH standard tags that the extracted barcodes should be labeled with in the BAM file. - cell_barcode = fastq.EmbeddedBarcode( - start=0, - end=16, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - molecule_barcode = fastq.EmbeddedBarcode( - start=16, - end=26, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - sample_barcode = fastq.EmbeddedBarcode( - start=0, - end=8, - quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY, - ) - - @classmethod - def _tag_bamfile( - cls, - input_bamfile_name: str, - output_bamfile_name: str, - tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator], - ) -> None: - """Adds tags from fastq file(s) to a bam file. - - Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from - `input_bamfile_name`, and writes the result to `output_bamfile_name` - - Parameters - ---------- - input_bamfile_name : str - input bam - output_bamfile_name : str - output bam - tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] - Iterable of generators that yield barcodes from fastq files - - """ - bam_tagger = bam.Tagger(input_bamfile_name) - bam_tagger.tag(output_bamfile_name, tag_generators) - - @classmethod - def _make_tag_generators( - cls, r1, i1=None, whitelist=None - ) -> List[fastq.EmbeddedBarcodeGenerator]: - """Create tag generators from fastq files. 
- - Tag generators are iterators that run over fastq records, they extract and yield all of the - barcodes embedded in each fastq record. For 10x, this means extracting the cell, umi, and - optionally, the sample barcode. - - Parameters - ---------- - r1 : str - forward fastq file - i1 : str, optional - index fastq file - whitelist : str, optional - A file that contains a list of acceptable cell barcodes - - Returns - ------- - tag_generators, List[EmbeddedBarcodeGenerator] - EmbeddedBarcodeGenerators containing barcodes from 10x fastq records - - """ - tag_generators = [] - - # generator for cell and molecule barcodes - if whitelist is not None: - tag_generators.append( - fastq.BarcodeGeneratorWithCorrectedCellBarcodes( - fastq_files=r1, - embedded_cell_barcode=cls.cell_barcode, - whitelist=whitelist, - other_embedded_barcodes=[cls.molecule_barcode], - ) - ) - else: - tag_generators.append( - fastq.EmbeddedBarcodeGenerator( - fastq_files=r1, - embedded_barcodes=[cls.cell_barcode, cls.molecule_barcode], - ) - ) - - # generator for sample barcodes - if i1 is not None: - tag_generators.append( - fastq.EmbeddedBarcodeGenerator( - fastq_files=i1, embedded_barcodes=[cls.sample_barcode] - ) - ) - return tag_generators - - @classmethod - def attach_barcodes(cls, args=None): - """Command line entrypoint for attaching barcodes to a bamfile. - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--r1", - required=True, - help="read 1 fastq file for a 10x genomics v2 experiment", - ) - parser.add_argument( - "--u2", - required=True, - help="unaligned bam containing cDNA fragments. 
Can be converted from fastq read 2" - "using picard FastqToSam", - ) - parser.add_argument( - "--i1", - default=None, - help="(optional) i7 index fastq file for a 10x genomics experiment", - ) - parser.add_argument( - "-o", "--output-bamfile", required=True, help="filename for tagged bam" - ) - parser.add_argument( - "-w", - "--whitelist", - default=None, - help="optional cell barcode whitelist. If provided, corrected barcodes " - "will also be output when barcodes are observed within 1ED of a " - "whitelisted barcode", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) - cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) - - return 0 - - -class BarcodePlatform(GenericPlatform): - """Command Line Interface for extracting and attaching barcodes with specified positions - generalizing TenXV2 attach barcodes - - Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the - corresponding barcode's start position and and length are provided. 
The sample barcode is extracted - from the index i7 fastq file and the cell and molecule barcode are extracted from the r1 fastq file - - This class defines several methods that are created as CLI tools when sctools is installed - (see setup.py) - - Attributes - ---------- - cell_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the cell barcode and the tags to - assign the sequence and quality of the cell barcode - molecule_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the molecule barcode and the tags - to assign the sequence and quality of the molecule barcode - sample_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the sample barcode and the tags - to assign the sequence and quality of the sample barcode - - Methods - ------- - attach_barcodes() - Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse - (r2) bam file - - """ - - cell_barcode = None - molecule_barcode = None - sample_barcode = None - - @classmethod - def _validate_barcode_args(cls, args): - """Validates that the barcode start position is greater than 0 - - Parameters - ---------- - args : object - arguments list, The default value of None, when passed to `parser.parse_args` - causes the parser to read `sys.argv` - - Returns - ------- - args : object - return arguments list if valid - - """ - # check that if a barcode start position is provided, its length is also (and vice versa) - cls._validate_barcode_length_and_position( - args.cell_barcode_start_pos, args.cell_barcode_length - ) - cls._validate_barcode_length_and_position( - args.molecule_barcode_start_pos, args.molecule_barcode_length - ) - cls._validate_barcode_length_and_position( - args.sample_barcode_start_pos, args.sample_barcode_length - ) - - # check that an index fastq is provided sample barcode length and position are given - if args.i1 is None and 
args.sample_barcode_length: - raise argparse.ArgumentError( - "An i7 index fastq file must be given to attach a sample barcode" - ) - - # check that cell and molecule barcodes don't overlap - if args.cell_barcode_length and args.molecule_barcode_length: - cls._validate_barcode_input( - args.molecule_barcode_start_pos, - args.cell_barcode_start_pos + args.cell_barcode_length, - ) - - return args - - @classmethod - def _validate_barcode_length_and_position( - cls, barcode_start_position, barcode_length - ): - """Checks that either that both barcode length and position are given or that neither are given as arguments - - Parameters - ---------- - barcode_start_position : int - the user defined start position (base pairs) of the barcode - - barcode_length : int - the user defined length (base pairs) of the barcode - - Returns - ------- - given_value : int - return given value if valid - - """ - barcode_start_pos_exists = bool(barcode_start_position) or ( - barcode_start_position == 0 - ) - barcode_length_exists = bool(barcode_length) - # (XOR boolean logic) - if barcode_start_pos_exists != barcode_length_exists: - raise argparse.ArgumentError( - "Invalid position/length, both position and length must be provided by the user together" - ) - - @classmethod - def _validate_barcode_input(cls, given_value, min_value): - """Validates that the barcode input is greater than a min value - - Parameters - ---------- - given_value : int - the given value that must be greater than the min_value, - (barcode length or barcode starting position) - - min_value : int - the min value that the given_value must be greater than - - Returns - ------- - given_value : int - return given value if valid - - """ - if given_value < min_value: - raise argparse.ArgumentTypeError("Invalid barcode length/position") - return given_value - - @classmethod - def _validate_barcode_start_pos(cls, given_value): - """Validates that the barcode start position is greater than 0 - - Parameters - ---------- - 
given_value : Union[int, str] - the given start position of the barcode to validate - - Returns - ------- - given_value : int - returns the start position if it is valid - - """ - return cls._validate_barcode_input(int(given_value), 0) - - @classmethod - def _validate_barcode_length(cls, given_value): - """Validates that the barcode length is greater than 1 - - Parameters - ---------- - given_value : Union[int, str] - the given length of the barcode to validate - - Returns - ------- - given_value : int - returns the length if it is valid - - """ - return cls._validate_barcode_input(int(given_value), 1) - - @classmethod - def _tag_bamfile( - cls, - input_bamfile_name: str, - output_bamfile_name: str, - tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator], - ) -> None: - """Adds tags from fastq file(s) to a bam file. - - Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from - `input_bamfile_name`, and writes the result to `output_bamfile_name` - - Parameters - ---------- - input_bamfile_name : str - input bam - output_bamfile_name : str - output bam - tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] - Iterable of generators that yield barcodes from fastq files - - """ - bam_tagger = bam.Tagger(input_bamfile_name) - bam_tagger.tag(output_bamfile_name, tag_generators) - - @classmethod - def _make_tag_generators( - cls, r1, i1=None, whitelist=None - ) -> List[fastq.EmbeddedBarcodeGenerator]: - """Create tag generators from fastq files. - - Tag generators are iterators that run over fastq records, they extract and yield all of the - barcodes embedded in each fastq record. This means extracting the cell, umi, and/or the sample barcode. 
- - Parameters - ---------- - r1 : str - forward fastq file, where possibly the cell and/or molecule barcode is found - i1 : str, optional - index fastq file, where the sample barcode is found - whitelist : str, optional - A file that contains a list of acceptable cell barcodes - - Returns - ------- - tag_generators : List[EmbeddedBarcodeGenerator] - EmbeddedBarcodeGenerators containing barcodes from the given fastq - - """ - tag_generators = [] - barcode_args = {"fastq_files": r1} - - if i1: - sample_barcode_args = dict(barcode_args) - sample_barcode_args["embedded_barcodes"] = [cls.sample_barcode] - tag_generators.append(fastq.EmbeddedBarcodeGenerator(**sample_barcode_args)) - - if whitelist: - barcode_args["whitelist"] = whitelist - if cls.cell_barcode: - barcode_args["embedded_cell_barcode"] = cls.cell_barcode - if cls.molecule_barcode: - barcode_args["other_embedded_barcodes"] = [cls.molecule_barcode] - tag_generators.append( - fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args) - ) - - else: - # for all the barcodes that have a length and starting position specified - barcode_args["embedded_barcodes"] = [ - barcode - for barcode in [cls.cell_barcode, cls.molecule_barcode] - if barcode - ] - tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) - - return tag_generators - - @classmethod - def attach_barcodes(cls, args=None): - """Command line entrypoint for attaching barcodes to a bamfile. 
- - Parameters - ---------- - args : Iterable[str], optional - arguments list, The default value of None, when passed to `parser.parse_args` - causes the parser to read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--r1", - required=True, - help="read 1 fastq file, where the cell and molecule barcode is found", - ) - parser.add_argument( - "--u2", - required=True, - help="unaligned bam, can be converted from fastq read 2" - "using picard FastqToSam", - ) - parser.add_argument( - "-o", "--output-bamfile", required=True, help="filename for tagged bam" - ) - parser.add_argument( - "-w", - "--whitelist", - default=None, - help="optional cell barcode whitelist. If provided, corrected barcodes " - "will also be output when barcodes are observed within 1ED of a " - "whitelisted barcode", - ) - parser.add_argument( - "--i1", - default=None, - help="(optional) i7 index fastq file, where the sample barcode is found", - ) - parser.add_argument( - "--sample-barcode-start-position", - dest="sample_barcode_start_pos", - default=None, - help="the user defined start position (base pairs) of the sample barcode", - type=cls._validate_barcode_start_pos, - ) - parser.add_argument( - "--sample-barcode-length", - dest="sample_barcode_length", - default=None, - help="the user defined length (base pairs) of the sample barcode", - type=cls._validate_barcode_length, - ) - parser.add_argument( - "--cell-barcode-start-position", - dest="cell_barcode_start_pos", - default=None, - help="the user defined start position, in base pairs, of the cell barcode", - type=cls._validate_barcode_start_pos, - ) - parser.add_argument( - "--cell-barcode-length", - dest="cell_barcode_length", - default=None, - help="the user defined length, in base pairs, of the cell barcode", - type=cls._validate_barcode_length, - ) - parser.add_argument( - "--molecule-barcode-start-position", - 
dest="molecule_barcode_start_pos", - default=None, - help="the user defined start position, in base pairs, of the molecule barcode " - "(must be not overlap cell barcode if cell barcode is provided)", - type=cls._validate_barcode_start_pos, - ) - parser.add_argument( - "--molecule-barcode-length", - dest="molecule_barcode_length", - default=None, - help="the user defined length, in base pairs, of the molecule barcode", - type=cls._validate_barcode_length, - ) - - # parse and validate the args - if args: - args = parser.parse_args(args) - else: - args = parser.parse_args() - cls._validate_barcode_args(args) - - # if the length and there for the start pos have been given as args - # get the appropriate barcodes - if args.cell_barcode_length: - cls.cell_barcode = fastq.EmbeddedBarcode( - start=args.cell_barcode_start_pos, - end=args.cell_barcode_start_pos + args.cell_barcode_length, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - if args.molecule_barcode_length: - cls.molecule_barcode = fastq.EmbeddedBarcode( - start=args.molecule_barcode_start_pos, - end=args.molecule_barcode_start_pos + args.molecule_barcode_length, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - if args.sample_barcode_length: - cls.sample_barcode = fastq.EmbeddedBarcode( - start=args.sample_barcode_start_pos, - end=args.sample_barcode_start_pos + args.sample_barcode_length, - quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY, - ) - - # make the tags and attach the barcodes - tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) - cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) - - return 0 diff --git a/tools/scripts/sctools/build/lib/sctools/reader.py b/tools/scripts/sctools/build/lib/sctools/reader.py deleted file mode 100644 index bc26f1cf..00000000 --- 
a/tools/scripts/sctools/build/lib/sctools/reader.py +++ /dev/null @@ -1,204 +0,0 @@ -""" -Sequence File Iterators -======================= - -.. currentmodule:: sctools - -This module defines a general iterator and some helper functions for iterating over files -that contain sequencing data - -Methods -------- -infer_open(file_: str, mode: str) - helper function that determines the compression type of a file without relying on its extension -zip_readers(*readers, indices=None) - helper function that iterates over one or more readers, optionally extracting only the records - that correspond to indices - -Classes -------- -Reader Basic reader that loops over one or more input files. - -See Also --------- -sctools.gtf.Reader -sctools.fastq.Reader - -""" - -import os -import gzip -import bz2 -from copy import copy -from functools import partial -from typing import Callable, Iterable, Generator, Set, List - - -def infer_open(file_: str, mode: str) -> Callable: - """Helper function to infer the correct compression type of an input file - - Identifies files that are .gz or .bz2 compressed without requiring file extensions - - Parameters - ---------- - file_ : str - the file to open - mode : {'r', 'rb'} - the mode to open the file in. 
'r' returns strings, 'rb' returns bytes - - Returns - ------- - open_function : Callable - the correct open function for the file's compression with mode pre-set through functools - partial - - """ - with open(file_, "rb") as f: - data: bytes = f.read(3) - - # gz and bzip treat 'r' = bytes, 'rt' = string - if data[:2] == b"\x1f\x8b": # gzip magic number - inferred_openhook: Callable = gzip.open - inferred_mode: str = "rt" if mode == "r" else mode - - elif data == b"BZh": # bz2 magic number - inferred_openhook: Callable = bz2.open - inferred_mode: str = "rt" if mode == "r" else mode - - else: - inferred_openhook: Callable = open - inferred_mode: str = mode - - return partial(inferred_openhook, mode=inferred_mode) - - -class Reader: - """Basic reader object that seamlessly loops over multiple input files. - - Is subclassed to create readers for specific file types (e.g. fastq, gtf, etc.) - - Parameters - ---------- - files : Union[str, List], optional - The file(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - The open mode for files. If 'r', yield string data, if 'rb', yield bytes data - (default = 'r'). - header_comment_char : str, optional - If not None, skip lines beginning with this character (default = None). 
- - """ - - def __init__(self, files="-", mode="r", header_comment_char=None): - if isinstance(files, str): - self._files = [files] - elif isinstance(files, Iterable): # test items of iterable - files = list(files) - if all(isinstance(f, str) for f in files): - self._files = files - else: - raise TypeError("All passed files must be type str") - else: - raise TypeError("Files must be a string filename or a list of such names.") - - # set open mode: - if mode not in {"r", "rb"}: - raise ValueError("Mode must be one of 'r', 'rb'") - self._mode = mode - - if isinstance(header_comment_char, str) and mode == "rb": - self._header_comment_char = header_comment_char.encode() - else: - self._header_comment_char = header_comment_char - - @property - def filenames(self) -> List[str]: - return self._files - - def __len__(self): - """Return the length of the Reader object. - - Notes - ----- - This function requires reading the complete file, and should typically not be - used with sys.stdin, as it will consume the input. - - """ - return sum(1 for _ in self) - - def __iter__(self): - for file_ in self._files: - - f = infer_open(file_, self._mode)(file_) - - # iterate over the file, dropping header lines if requested - try: - file_iterator = iter(f) - if self._header_comment_char is not None: - first_record = next(file_iterator) - while first_record.startswith(self._header_comment_char): - first_record = next(file_iterator) - - yield first_record # avoid loss of first non-comment line - - for record in file_iterator: # now, run to exhaustion - yield record - finally: # clean up - f.close() - - @property - def size(self) -> int: - """return the collective size of all files being read in bytes""" - return sum(os.stat(f).st_size for f in self._files) - - def select_record_indices(self, indices: Set) -> Generator: - """Iterate over provided indices only, skipping other records. 
- - Parameters - ---------- - indices : Set[int] - indices to include in the output - - Yields - ------ - record, str - records from file corresponding to indices - - """ - indices = copy( - indices - ) # passed indices is a reference, need own copy to modify - for idx, record in enumerate(self): - if idx in indices: - yield record - indices.remove(idx) - - # stopping condition - if not indices: - break - - -def zip_readers(*readers, indices=None) -> Generator: - """Zip together multiple reader objects, yielding records simultaneously. - - If indices is passed, only return lines in file that correspond to indices - - Parameters - ---------- - *readers : List[Reader] - Reader objects to simultaneously iterate over - indices : Set[int], optional - indices to include in the output - - Yields - ------ - records : Tuple[str] - one record per reader passed - - """ - if indices: - iterators = zip(*(r.select_record_indices(indices) for r in readers)) - else: - iterators = zip(*readers) - for record_tuple in iterators: - yield record_tuple diff --git a/tools/scripts/sctools/build/lib/sctools/stats.py b/tools/scripts/sctools/build/lib/sctools/stats.py deleted file mode 100644 index a303f5fd..00000000 --- a/tools/scripts/sctools/build/lib/sctools/stats.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Statistics Functions for Sequence Data Analysis -=============================================== - -.. 
currentmodule:: sctools - -This module implements statistical modules for sequence analysis - -Methods -------- -base4_entropy(x: np.array, axis: int=1) - calculate the entropy of a 4 x sequence length base frequency matrix - -Classes -------- -OnlineGaussianSuficientStatistic Empirical (online) calculation of mean and variance - -""" - -from typing import Tuple -import numpy as np - - -def base4_entropy(x, axis=1): - """Calculate entropy in base four of a data matrix x - - Useful for measuring DNA entropy (with 4 nucleotides) as the output is restricted to [0, 1] - - Parameters - ---------- - x : np.ndarray - array of dimension one or more containing numeric types - axis : int, optional - axis to calculate entropy across. Values in this axis are treated as observation frequencies - - Returns - ------- - entropy : np.ndarray - array of input dimension - 1 containin entropy values bounded in [0, 1] - - """ - - # convert to probabilities - if axis == 1: - x = np.divide(x, np.sum(x, axis=axis)[:, None]) - else: - x = np.divide(x, np.sum(x, axis=axis)) - - with np.errstate(divide="ignore"): - r = np.log(x) / np.log(4) - - # convention: 0 * log(0) = 0, != -INF. 
- r[np.isinf(r)] = 0 - - return np.abs(-1 * np.sum(x * r, axis=axis)) - - -class OnlineGaussianSufficientStatistic: - """ - Implementation of Welford's online mean and variance algorithm - - Methods - ------- - update(new_value: float) - incorporate new_value into the online estimate of mean and variance - mean() - return the mean value - calculate_variance() - calculate and return the variance - mean_and_variance() - return both mean and variance - - """ - - __slots__ = ["_count", "_mean", "_mean_squared_error"] - - def __init__(self): - self._mean_squared_error: float = 0.0 - self._mean: float = 0.0 - self._count: int = 0 - - def update(self, new_value: float) -> None: - self._count += 1 - delta = new_value - self._mean - self._mean += delta / self._count - delta2 = new_value - self._mean - self._mean_squared_error += delta * delta2 - - @property - def mean(self) -> float: - """return the mean value""" - return self._mean - - def calculate_variance(self): - """calculate and return the variance""" - if self._count < 2: - return float("nan") - else: - return self._mean_squared_error / (self._count - 1) - - def mean_and_variance(self) -> Tuple[float, float]: - """calculate and return the mean and variance""" - return self.mean, self.calculate_variance() diff --git a/tools/scripts/sctools/build/lib/sctools/test/__init__.py b/tools/scripts/sctools/build/lib/sctools/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tools/scripts/sctools/build/lib/sctools/test/characterize-cell-testing-data.ipynb b/tools/scripts/sctools/build/lib/sctools/test/characterize-cell-testing-data.ipynb deleted file mode 100644 index 37fc8747..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/characterize-cell-testing-data.ipynb +++ /dev/null @@ -1,1057 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load Testing Data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "run_control": { 
- "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import pysam\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:24: DeprecationWarning: tostring() is deprecated. Use tobytes() instead.\n" - ] - } - ], - "source": [ - "def parse_record(record):\n", - " \"\"\"line parser to build dataframe, supports missing tags in test data\"\"\"\n", - " data = {\n", - " 'qname': record.query_name,\n", - " 'flag': record.flag,\n", - " 'reference': record.reference_id,\n", - " 'position': record.pos,\n", - " 'mapq': record.query_alignment_qualities,\n", - " 'cigar': record.cigarstring,\n", - " 'rnext': record.rnext, \n", - " 'pnext': record.pnext,\n", - " 'tlen': record.tlen, \n", - " 'sequence': record.seq,\n", - " 'quality': record.qual,\n", - " }\n", - " for name, tag in record.get_tags():\n", - " data[name] = tag\n", - " return pd.Series(data)\n", - "\n", - "input_sam_file = 'data/small-cell-sorted.bam'\n", - "with pysam.AlignmentFile(input_sam_file, 'rb') as f:\n", - " records = []\n", - " for record in f:\n", - " records.append(parse_record(record))\n", - "\n", - "data = pd.concat(records, axis=1).T\n", - "\n", - "results_scalar = {} # will hold the calculations we make\n", - "\n", - "# add a strand field\n", - "data['strand'] = [f & 16 for f in data['flag']]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build Expectations for Testing Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Reads" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "656\n" - ] - } - ], - "source": [ - "results_scalar['n_reads'] = len(data)\n", - "print(results_scalar['n_reads'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Genes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "11\n" - ] - } - ], - "source": [ - "results_scalar['n_genes'] = len(data.groupby(['GE']))\n", - "print(results_scalar['n_genes'])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.9827586206896552\n" - ] - } - ], - "source": [ - "mean_n_genes = data.groupby(['CB']).apply(lambda x: len(set(x['GE']))).mean()\n", - "print(mean_n_genes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Gene table should have 8 entries plus a header for a total of 9 lines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Molecules\n", - "\n", - "Molecules are defined as a unique triplet of CB, UB, and GE" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "249\n" - ] - } - ], - "source": [ - "results_scalar['n_molecules'] = len(data.groupby(['CB', 'UB', 'GE']))\n", - "print(results_scalar['n_molecules'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Fragments\n", - "\n", - "Fragments are defined as molecules are (CB, UB, GE) but must additionally have a unique position" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - 
"run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "499\n" - ] - } - ], - "source": [ - "results_scalar['n_fragments'] = len(data.groupby(['CB', 'UB', 'GE', 'position']))\n", - "print(results_scalar['n_fragments'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Most Abundant Gene\n", - "\n", - "Based on the above, at least one of the genes has to be observed more than once. Which is it? " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MTATP6P1 300\n" - ] - } - ], - "source": [ - "results_scalar['most_abundant'] = data.groupby(['GE']).size().idxmax()\n", - "results_scalar['most_abundant_gene_n_observations'] = data.groupby(['GE']).size().max()\n", - "print(results_scalar['most_abundant'], results_scalar['most_abundant_gene_n_observations'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cell with most reads" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "94" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.groupby(['CB']).apply(lambda x: len(x)).max()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## perfect molecule barcodes" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['perfect_molecule_barcodes'] = 0\n", - "for c, r in zip(data['UB'], data['UR']):\n", - " if c == r:\n", - " results_scalar['perfect_molecule_barcodes'] += 1" - ] - }, - { - 
"cell_type": "code", - "execution_count": 11, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "655" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_scalar['perfect_molecule_barcodes']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the alignment metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_exonic'] = sum(data['XF'] == 'CODING')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_intronic'] = sum(data['XF'] == 'INTRONIC')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_utr'] = sum(data['XF'] == 'UTR')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_uniquely'] = sum(data['NH'] == 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['duplicate_reads'] = sum((data['flag'] & 1024).astype(bool))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['spliced_reads'] = sum(1 for v in data['cigar'] if 'N' in v)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 18, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicate_reads': 107,\n", - " 'most_abundant': 'MTATP6P1',\n", - " 'most_abundant_gene_n_observations': 300,\n", - " 'n_fragments': 499,\n", - " 'n_genes': 11,\n", - " 'n_molecules': 249,\n", - " 'n_reads': 656,\n", - " 'perfect_molecule_barcodes': 655,\n", - " 'reads_mapped_exonic': 609,\n", - " 'reads_mapped_intronic': 28,\n", - " 'reads_mapped_uniquely': 656,\n", - " 'reads_mapped_utr': 19,\n", - " 'spliced_reads': 2}" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_scalar" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the higher-order metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "calc_func_fraction_from_acii = lambda x: sum(1 for c in x if ord(c) > 63) / len(x)\n", - "calc_func_fraction = lambda x: sum(1 for c in x if c > 30) / len(x)\n", - "calc_func_mean = lambda x: np.mean([c for c in x])\n", - "\n", - "data['num_UY_qual_fraction'] = data['UY'].apply(calc_func_fraction_from_acii)\n", - "\n", - "data['num_base_qual_fraction'] = data['mapq'].apply(calc_func_fraction)\n", - "data['num_base_qual_mean'] = data['mapq'].apply(calc_func_mean)\n", - "\n", - "grouped_by_cell = data.groupby(['CB'])" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_series = {}" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# vector values\n", - "# I changed these to retain the index to make merging into a 
dataframe easier, and guarantee same order. \n", - "results_series['molecule_barcode_fraction_bases_above_30_mean'] = grouped_by_cell.mean()['num_UY_qual_fraction']\n", - "results_series['molecule_barcode_fraction_bases_above_30_variance'] = grouped_by_cell.var()['num_UY_qual_fraction']\n", - "\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_mean'] = grouped_by_cell.mean()['num_base_qual_fraction']\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_variance'] = grouped_by_cell.var()['num_base_qual_fraction']\n", - "results_series['genomic_read_quality_mean'] = grouped_by_cell.mean()['num_base_qual_mean']\n", - "results_series['genomic_read_quality_variance'] = grouped_by_cell.var()['num_base_qual_mean']\n", - "\n", - "reads_per_cell = data.groupby(['CB']).size()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AS 96\n", - "CB AAACCTGAGAAACCTA\n", - "CR AAACCTGAGAAACCTA\n", - "CY AAFFFJJJJJJJJJJJ\n", - "GE NaN\n", - "GS NaN\n", - "HI 1\n", - "MD 98\n", - "NH 1\n", - "NM 0\n", - "RG A\n", - "SR GTAATTGC\n", - "SY AAAFFJ\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# compare two numpy arrays that are slightly different\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0meps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m11\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1e-8\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mallclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompare_me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcompare_me\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0meps\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - 
"\u001b[0;31mValueError\u001b[0m: operands could not be broadcast together with shapes (58,) (11,) " - ] - } - ], - "source": [ - "# compare two numpy arrays that are slightly different\n", - "eps = np.random.rand(11) * 1e-8\n", - "np.allclose(compare_me, compare_me + eps)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# it is actually discriminative, though\n", - "np.allclose(compare_me, np.arange(11))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Look at the metrics output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "cell_metrics = pd.read_csv('data/cell_metrics.csv', index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "cell_metrics['n_genes']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "!cat data/cell_metrics.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "hide_input": false, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "nav_menu": {}, - "toc": { - "navigate_menu": true, - "number_sections": true, - 
"sideBar": true, - "threshold": 6, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tools/scripts/sctools/build/lib/sctools/test/characterize-gene-testing-data.ipynb b/tools/scripts/sctools/build/lib/sctools/test/characterize-gene-testing-data.ipynb deleted file mode 100644 index a6a31002..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/characterize-gene-testing-data.ipynb +++ /dev/null @@ -1,1159 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load Testing Data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import pysam\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:24: DeprecationWarning: tostring() is deprecated. 
Use tobytes() instead.\n" - ] - } - ], - "source": [ - "def parse_record(record):\n", - " \"\"\"line parser to build dataframe, supports missing tags in test data\"\"\"\n", - " data = {\n", - " 'qname': record.query_name,\n", - " 'flag': record.flag,\n", - " 'reference': record.reference_id,\n", - " 'position': record.pos,\n", - " 'mapq': record.query_alignment_qualities,\n", - " 'cigar': record.cigarstring,\n", - " 'rnext': record.rnext, \n", - " 'pnext': record.pnext,\n", - " 'tlen': record.tlen, \n", - " 'sequence': record.seq,\n", - " 'quality': record.qual,\n", - " }\n", - " for name, tag in record.get_tags():\n", - " data[name] = tag\n", - " return pd.Series(data)\n", - "\n", - "input_sam_file = 'data/small-gene-sorted.bam'\n", - "with pysam.AlignmentFile(input_sam_file, 'rb') as f:\n", - " records = []\n", - " for record in f:\n", - " records.append(parse_record(record))\n", - "\n", - "data = pd.concat(records, axis=1).T\n", - "\n", - "results_scalar = {} # will hold the calculations we make" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build Expectations for Testing Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Reads" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "300\n" - ] - } - ], - "source": [ - "results_scalar['n_reads'] = len(data)\n", - "print(results_scalar['n_reads'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Genes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8\n" - ] - } - ], - "source": [ - "results_scalar['n_genes'] = len(data.groupby(['GE']))\n", - 
"print(results_scalar['n_genes'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Gene table should have 8 entries plus a header for a total of 9 lines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Molecules\n", - "\n", - "Molecules are defined as a unique triplet of CB, UB, and GE" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "88\n" - ] - } - ], - "source": [ - "results_scalar['n_molecules'] = len(data.groupby(['CB', 'UB', 'GE']))\n", - "print(results_scalar['n_molecules'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Fragments\n", - "\n", - "Fragments are defined as molecules are (CB, UB, GE) but must additionally have a unique position" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "217\n" - ] - } - ], - "source": [ - "results_scalar['n_fragments'] = len(data.groupby(['CB', 'UB', 'GE', 'position']))\n", - "print(results_scalar['n_fragments'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Most Abundant Gene\n", - "\n", - "Based on the above, at least one of the genes has to be observed more than once. Which is it? 
" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "AL627309.7 245\n" - ] - } - ], - "source": [ - "results_scalar['most_abundant'] = data.groupby(['GE']).size().idxmax()\n", - "results_scalar['most_abundant_gene_n_observations'] = data.groupby(['GE']).size().max()\n", - "print(results_scalar['most_abundant'], results_scalar['most_abundant_gene_n_observations'])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['perfect_molecule_barcodes'] = 0\n", - "for c, r in zip(data['UB'], data['UR']):\n", - " if c == r:\n", - " results_scalar['perfect_molecule_barcodes'] += 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the alignment metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'most_abundant': 'AL627309.7',\n", - " 'most_abundant_gene_n_observations': 245,\n", - " 'n_fragments': 217,\n", - " 'n_genes': 8,\n", - " 'n_molecules': 88,\n", - " 'n_reads': 300,\n", - " 'perfect_molecule_barcodes': 300}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_scalar" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_exonic'] = sum(data['XF'] == 'CODING')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - 
"results_scalar['reads_mapped_intronic'] = sum(data['XF'] == 'INTRONIC')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_utr'] = sum(data['XF'] == 'UTR')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_uniquely'] = sum(data['NH'] == 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['duplicate_reads'] = sum((data['flag'] & 1024).astype(bool))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['spliced_reads'] = sum(1 for v in data['cigar'] if 'N' in v)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the higher-order metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "calc_func_fraction_from_acii = lambda x: sum(1 for c in x if ord(c) > 63) / len(x)\n", - "calc_func_fraction = lambda x: sum(1 for c in x if c > 30) / len(x)\n", - "calc_func_mean = lambda x: np.mean([c for c in x])\n", - "\n", - "data['num_UY_qual_fraction'] = data['UY'].apply(calc_func_fraction_from_acii)\n", - "\n", - "data['num_base_qual_fraction'] = data['mapq'].apply(calc_func_fraction)\n", - "data['num_base_qual_mean'] = data['mapq'].apply(calc_func_mean)\n", - "\n", - "grouped_by_gene = data.groupby(['GE'])" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "run_control": { - "frozen": false, - 
"read_only": false - } - }, - "outputs": [], - "source": [ - "results_series = {}" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# vector values\n", - "# I changed these to retain the index to make merging into a dataframe easier, and guarantee same order. \n", - "results_series['molecule_barcode_fraction_bases_above_30_mean'] = grouped_by_gene.mean()['num_UY_qual_fraction']\n", - "results_series['molecule_barcode_fraction_bases_above_30_variance'] = grouped_by_gene.var()['num_UY_qual_fraction']\n", - "\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_mean'] = grouped_by_gene.mean()['num_base_qual_fraction']\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_variance'] = grouped_by_gene.var()['num_base_qual_fraction']\n", - "results_series['genomic_read_quality_mean'] = grouped_by_gene.mean()['num_base_qual_mean']\n", - "results_series['genomic_read_quality_variance'] = grouped_by_gene.var()['num_base_qual_mean']\n", - "\n", - "reads_per_gene = data.groupby(['GE']).size()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "molecules_per_gene = grouped_by_gene.apply(lambda x: len(x.groupby(['UB', 'CB']).size()))\n", - "fragments_per_gene = grouped_by_gene.apply(lambda x: len(x.groupby(['UB', 'CB', 'position']).size()))\n", - "reads_per_molecule = reads_per_gene / molecules_per_gene\n", - "reads_per_fragment = reads_per_gene / fragments_per_gene\n", - "fragments_per_molecule = fragments_per_gene / molecules_per_gene\n", - "results_series['reads_per_molecule'] = reads_per_molecule\n", - "results_series['reads_per_fragment'] = reads_per_fragment\n", - "results_series['fragments_per_molecule'] = fragments_per_molecule\n", - "\n", - "# scalar values\n", - 
"results_scalar['fragments_with_single_read_evidence'] = np.sum(data.groupby(['CB', 'UB', 'GE', 'position']).size() == 1)\n", - "results_scalar['molecules_with_single_read_evidence'] = np.sum(data.groupby(['CB', 'UB', 'GE']).size() == 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "fragments_per_molecule np.array([1.0000, 1.0000, 1.0000, 1.8750, 2.9831, 1.2500, 1.0000, 1.3077])\n", - "genomic_read_quality_mean np.array([36.2143, 24.8469, 25.4792, 35.3664, 34.0956, 33.0364, 20.7423, 27.3078])\n", - "genomic_read_quality_variance np.array([nan, nan, nan, 18.4553, 21.6745, 33.6572, nan, 53.5457])\n", - "genomic_reads_fraction_bases_quality_above_30_mean np.array([0.8878, 0.3980, 0.4271, 0.8148, 0.7681, 0.7216, 0.1546, 0.5089])\n", - "genomic_reads_fraction_bases_quality_above_30_variance np.array([nan, nan, nan, 0.0282, 0.0346, 0.0537, nan, 0.0849])\n", - "molecule_barcode_fraction_bases_above_30_mean np.array([1.0000, 1.0000, 0.8000, 0.9885, 0.9833, 0.9857, 0.7000, 0.9444])\n", - "molecule_barcode_fraction_bases_above_30_variance np.array([nan, nan, nan, 0.0011, 0.0051, 0.0014, nan, 0.0120])\n", - "reads_per_fragment np.array([1.0000, 1.0000, 1.0000, 1.7333, 1.3920, 1.4000, 1.0000, 1.0588])\n", - "reads_per_molecule np.array([1.0000, 1.0000, 1.0000, 3.2500, 4.1525, 1.7500, 1.0000, 1.3846])\n" - ] - } - ], - "source": [ - "# write out the array information for the testing file\n", - "for k, vals in pd.DataFrame(results_series).iteritems():\n", - " print(k, 'np.array([' + ', '.join('{:.4f}'.format(i) for i in vals.values) + '])')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Write Results to File for Automated Testing" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - 
}, - "outputs": [], - "source": [ - "pd.Series(results_scalar).to_csv('%s_testing_knowledge_scalar.csv' % input_sam_file.replace('.bam', ''))\n", - "pd.DataFrame(results_series).to_csv('%s_testing_knowledge_series.csv' % input_sam_file.replace('.bam', ''))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# do a comparison of the whole 2d dataframe at once\n", - "np.allclose(\n", - " pd.DataFrame(results_series).fillna(0).values, # fill nans with zero, call values to get the numpy array the dataframe is based on\n", - " pd.read_csv('data/small-gene-sorted_testing_knowledge_series.csv', index_col=0, header=0).fillna(0).values\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# to get most_abundant alone: " - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "test_read_scalar = pd.read_csv('data/small-gene-sorted_testing_knowledge_scalar.csv', index_col=0, header=None, squeeze=True)\n", - "\n", - "# extract this, we're going to drop it from the array to do some conversion to numeric\n", - "most_abundant = test_read_scalar['most_abundant'] \n", - "\n", - "# drop most abundant, convert to float, fill any NaN values with 0, and call .values to get the numpy array pandas objects are based on.\n", - "for_comparison = test_read_scalar.drop('most_abundant').astype(float).fillna(0).values\n", - "\n", - "\n", - "# note, 
have to drop the string value and convert to float before this works. \n", - "np.allclose(\n", - " pd.Series(results_scalar).drop('most_abundant').fillna(0).values, # do the same thing as above to the one in memory\n", - " for_comparison\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "GE\n", - "ACAP3 36.214286\n", - "AGRN 24.846939\n", - "AL627309.1 25.479167\n", - "AL627309.5 35.366414\n", - "AL627309.7 34.095625\n", - "AL645608.2 33.036443\n", - "AL645608.3 20.742268\n", - "AL645608.4 27.307758\n", - "Name: genomic_read_quality_mean, dtype: float64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# get a metric from a dataframe: \n", - "df = pd.DataFrame(results_series)\n", - "df['genomic_read_quality_mean']" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# get a numpy array from the dataframe\n", - "compare_me = df['genomic_read_quality_mean'].values" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# compare two numpy arrays that are slightly different\n", - "eps = np.random.rand(8) * 1e-8\n", - "np.allclose(compare_me, compare_me + eps)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 
it is actually discriminative, though\n", - "np.allclose(compare_me, np.arange(8))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Look at the metrics output" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "gene_metrics = pd.read_csv('data/gene_metrics.csv', index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - }, - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - "
n_readsnoise_readsperfect_molecule_barcodesreads_mapped_exonicreads_mapped_intronicreads_mapped_utrreads_mapped_uniquelyreads_mapped_multipleduplicate_readsspliced_reads...genomic_read_quality_variancen_moleculesn_fragmentsreads_per_moleculereads_per_fragmentfragments_per_moleculefragments_with_single_read_evidencemolecules_with_single_read_evidencenumber_cells_detected_multiplenumber_cells_expressing
ACAP31011001001...NaN111.0000001.0000001.0000001101
AGRN1011001001...NaN111.0000001.0000001.0000001101
AL627309.11011001001...NaN111.0000001.0000001.0000001101
AL627309.52602626002601126...18.4552938153.2500001.7333331.8750007268
AL627309.72450245245002450760...21.674500591764.1525421.3920452.983051124223857
AL645608.27077007020...33.657186451.7500001.4000001.2500004224
AL645608.31011001000...NaN111.0000001.0000001.0000001101
AL645608.418018180018010...53.54574013171.3846151.0588241.3076921612113
\n", - "

8 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " n_reads noise_reads perfect_molecule_barcodes \\\n", - "ACAP3 1 0 1 \n", - "AGRN 1 0 1 \n", - "AL627309.1 1 0 1 \n", - "AL627309.5 26 0 26 \n", - "AL627309.7 245 0 245 \n", - "AL645608.2 7 0 7 \n", - "AL645608.3 1 0 1 \n", - "AL645608.4 18 0 18 \n", - "\n", - " reads_mapped_exonic reads_mapped_intronic reads_mapped_utr \\\n", - "ACAP3 1 0 0 \n", - "AGRN 1 0 0 \n", - "AL627309.1 1 0 0 \n", - "AL627309.5 26 0 0 \n", - "AL627309.7 245 0 0 \n", - "AL645608.2 7 0 0 \n", - "AL645608.3 1 0 0 \n", - "AL645608.4 18 0 0 \n", - "\n", - " reads_mapped_uniquely reads_mapped_multiple duplicate_reads \\\n", - "ACAP3 1 0 0 \n", - "AGRN 1 0 0 \n", - "AL627309.1 1 0 0 \n", - "AL627309.5 26 0 11 \n", - "AL627309.7 245 0 76 \n", - "AL645608.2 7 0 2 \n", - "AL645608.3 1 0 0 \n", - "AL645608.4 18 0 1 \n", - "\n", - " spliced_reads ... \\\n", - "ACAP3 1 ... \n", - "AGRN 1 ... \n", - "AL627309.1 1 ... \n", - "AL627309.5 26 ... \n", - "AL627309.7 0 ... \n", - "AL645608.2 0 ... \n", - "AL645608.3 0 ... \n", - "AL645608.4 0 ... 
\n", - "\n", - " genomic_read_quality_variance n_molecules n_fragments \\\n", - "ACAP3 NaN 1 1 \n", - "AGRN NaN 1 1 \n", - "AL627309.1 NaN 1 1 \n", - "AL627309.5 18.455293 8 15 \n", - "AL627309.7 21.674500 59 176 \n", - "AL645608.2 33.657186 4 5 \n", - "AL645608.3 NaN 1 1 \n", - "AL645608.4 53.545740 13 17 \n", - "\n", - " reads_per_molecule reads_per_fragment fragments_per_molecule \\\n", - "ACAP3 1.000000 1.000000 1.000000 \n", - "AGRN 1.000000 1.000000 1.000000 \n", - "AL627309.1 1.000000 1.000000 1.000000 \n", - "AL627309.5 3.250000 1.733333 1.875000 \n", - "AL627309.7 4.152542 1.392045 2.983051 \n", - "AL645608.2 1.750000 1.400000 1.250000 \n", - "AL645608.3 1.000000 1.000000 1.000000 \n", - "AL645608.4 1.384615 1.058824 1.307692 \n", - "\n", - " fragments_with_single_read_evidence \\\n", - "ACAP3 1 \n", - "AGRN 1 \n", - "AL627309.1 1 \n", - "AL627309.5 7 \n", - "AL627309.7 124 \n", - "AL645608.2 4 \n", - "AL645608.3 1 \n", - "AL645608.4 16 \n", - "\n", - " molecules_with_single_read_evidence \\\n", - "ACAP3 1 \n", - "AGRN 1 \n", - "AL627309.1 1 \n", - "AL627309.5 2 \n", - "AL627309.7 22 \n", - "AL645608.2 2 \n", - "AL645608.3 1 \n", - "AL645608.4 12 \n", - "\n", - " number_cells_detected_multiple number_cells_expressing \n", - "ACAP3 0 1 \n", - "AGRN 0 1 \n", - "AL627309.1 0 1 \n", - "AL627309.5 6 8 \n", - "AL627309.7 38 57 \n", - "AL645608.2 2 4 \n", - "AL645608.3 0 1 \n", - "AL645608.4 1 13 \n", - "\n", - "[8 rows x 26 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gene_metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
",n_reads,noise_reads,perfect_molecule_barcodes,reads_mapped_exonic,reads_mapped_intronic,reads_mapped_utr,reads_mapped_uniquely,reads_mapped_multiple,duplicate_reads,spliced_reads,antisense_reads,molecule_barcode_fraction_bases_above_30_mean,molecule_barcode_fraction_bases_above_30_variance,genomic_reads_fraction_bases_quality_above_30_mean,genomic_reads_fraction_bases_quality_above_30_variance,genomic_read_quality_mean,genomic_read_quality_variance,n_molecules,n_fragments,reads_per_molecule,reads_per_fragment,fragments_per_molecule,fragments_with_single_read_evidence,molecules_with_single_read_evidence,number_cells_detected_multiple,number_cells_expressing\n", - "ACAP3,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.8877551020408163,nan,36.214285714285715,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AGRN,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.3979591836734694,nan,24.846938775510203,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AL627309.1,1,0,1,1,0,0,1,0,0,1,0,0.8,nan,0.4270833333333333,nan,25.479166666666668,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AL627309.5,26,0,26,26,0,0,26,0,11,26,0,0.9884615384615385,0.0010615384615384619,0.8148357472599155,0.02818637889146239,35.36641405113152,18.45529287710208,8,15,3.25,1.7333333333333334,1.875,7,2,6,8\n", - "AL627309.7,245,0,245,245,0,0,245,0,76,0,0,0.9832653061224491,0.005087654734024759,0.7681442526176698,0.03459077695708153,34.09562493869249,21.67450015630017,59,176,4.1525423728813555,1.3920454545454546,2.983050847457627,124,22,38,57\n", - "AL645608.2,7,0,7,7,0,0,7,0,2,0,0,0.9857142857142857,0.00142857142857143,0.7215743440233235,0.05371769699133296,33.03644314868805,33.65718648975626,4,5,1.75,1.4,1.25,4,2,2,4\n", - "AL645608.3,1,0,1,1,0,0,1,0,0,0,0,0.7,nan,0.15463917525773196,nan,20.742268041237114,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AL645608.4,18,0,18,18,0,0,18,0,1,0,0,0.9444444444444444,0.012026143790849672,0.5089380971044231,0.08488064356706926,27.307757608823714,53.545739760471115,13,17,1.3846153846153846,1.0588235294117647,1.3076923076923077,16,12,1,13\n" - ] - } 
- ], - "source": [ - "!cat data/gene_metrics.csv" - ] - } - ], - "metadata": { - "hide_input": false, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "nav_menu": {}, - "toc": { - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 6, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/1k-august-2016.txt b/tools/scripts/sctools/build/lib/sctools/test/data/1k-august-2016.txt deleted file mode 100644 index 54b0b83b..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/1k-august-2016.txt +++ /dev/null @@ -1,1001 +0,0 @@ -AAACCTGAGAAACCAT -AAACCTGAGAAACCGC -AAACCTGAGAAACCTA -AAACCTGAGAAACGAG -AAACCTGAGAAACGCC -AAACCTGAGAAAGTGG -AAACCTGAGAACAACT -AAACCTGAGAACAATC -AAACCTGAGAACTCGG -AAACCTGAGAACTGTA -AAACCTGAGAAGAAGC -AAACCTGAGAAGATTC -AAACCTGAGAAGCCCA -AAACCTGAGAAGGACA -AAACCTGAGAAGGCCT -AAACCTGAGAAGGGTA -AAACCTGAGAAGGTGA -AAACCTGAGAAGGTTT -AAACCTGAGAATAGGG -AAACCTGAGAATCTCC -AAACCTGAGAATGTGT -AAACCTGAGAATGTTG -AAACCTGAGAATTCCC -AAACCTGAGAATTGTG -AAACCTGAGACAAAGG -AAACCTGAGACAAGCC -AAACCTGAGACAATAC -AAACCTGAGACACGAC -AAACCTGAGACACTAA -AAACCTGAGACAGACC -AAACCTGAGACAGAGA -AAACCTGAGACAGGCT -AAACCTGAGACATAAC -AAACCTGAGACCACGA -AAACCTGAGACCCACC -AAACCTGAGACCGGAT -AAACCTGAGACCTAGG -AAACCTGAGACCTTTG -AAACCTGAGACGACGT -AAACCTGAGACGCAAC -AAACCTGAGACGCACA -AAACCTGAGACGCTTT -AAACCTGAGACTAAGT -AAACCTGAGACTACAA -AAACCTGAGACTAGAT -AAACCTGAGACTAGGC -AAACCTGAGACTCGGA -AAACCTGAGACTGGGT -AAACCTGAGACTGTAA -AAACCTGAGACTTGAA -AAACCTGAGACTTTCG -AAACCTGAGAGAACAG -AAACCTGAGAGACGAA -AAACCTGAGAGACTAT 
-AAACCTGAGAGACTTA -AAACCTGAGAGAGCTC -AAACCTGAGAGATGAG -AAACCTGAGAGCAATT -AAACCTGAGAGCCCAA -AAACCTGAGAGCCTAG -AAACCTGAGAGCTATA -AAACCTGAGAGCTGCA -AAACCTGAGAGCTGGT -AAACCTGAGAGCTTCT -AAACCTGAGAGGACGG -AAACCTGAGAGGGATA -AAACCTGAGAGGGCTT -AAACCTGAGAGGTACC -AAACCTGAGAGGTAGA -AAACCTGAGAGGTTAT -AAACCTGAGAGGTTGC -AAACCTGAGAGTAAGG -AAACCTGAGAGTAATC -AAACCTGAGAGTACAT -AAACCTGAGAGTACCG -AAACCTGAGAGTCGGT -AAACCTGAGAGTCTGG -AAACCTGAGAGTGACC -AAACCTGAGAGTGAGA -AAACCTGAGAGTTGGC -AAACCTGAGATACACA -AAACCTGAGATAGCAT -AAACCTGAGATAGGAG -AAACCTGAGATAGTCA -AAACCTGAGATATACG -AAACCTGAGATATGCA -AAACCTGAGATATGGT -AAACCTGAGATCACGG -AAACCTGAGATCCCAT -AAACCTGAGATCCCGC -AAACCTGAGATCCGAG -AAACCTGAGATCCTGT -AAACCTGAGATCGATA -AAACCTGAGATCGGGT -AAACCTGAGATCTGAA -AAACCTGAGATCTGCT -AAACCTGAGATGAGAG -AAACCTGAGATGCCAG -AAACCTGAGATGCCTT -AAACCTGAGATGCGAC -AAACCTGAGATGGCGT -AAACCTGAGATGGGTC -AAACCTGAGATGTAAC -AAACCTGAGATGTCGG -AAACCTGAGATGTGGC -AAACCTGAGATGTGTA -AAACCTGAGATGTTAG -AAACCTGAGATTACCC -AAACCTGAGCAAATCA -AAACCTGAGCAACGGT -AAACCTGAGCAATATG -AAACCTGAGCAATCTC -AAACCTGAGCACACAG -AAACCTGAGCACAGGT -AAACCTGAGCACCGCT -AAACCTGAGCACCGTC -AAACCTGAGCACGCCT -AAACCTGAGCAGACTG -AAACCTGAGCAGATCG -AAACCTGAGCAGCCTC -AAACCTGAGCAGCGTA -AAACCTGAGCAGGCTA -AAACCTGAGCAGGTCA -AAACCTGAGCATCATC -AAACCTGAGCATGGCA -AAACCTGAGCCAACAG -AAACCTGAGCCACCTG -AAACCTGAGCCACGCT -AAACCTGAGCCACGTC -AAACCTGAGCCACTAT -AAACCTGAGCCAGAAC -AAACCTGAGCCAGGAT -AAACCTGAGCCAGTAG -AAACCTGAGCCAGTTT -AAACCTGAGCCATCGC -AAACCTGAGCCCAACC -AAACCTGAGCCCAATT -AAACCTGAGCCCAGCT -AAACCTGAGCCCGAAA -AAACCTGAGCCCTAAT -AAACCTGAGCCGATTT -AAACCTGAGCCGCCTA -AAACCTGAGCCGGTAA -AAACCTGAGCCGTCGT -AAACCTGAGCCTATGT -AAACCTGAGCCTCGTG -AAACCTGAGCCTTGAT -AAACCTGAGCGAAGGG -AAACCTGAGCGACGTA -AAACCTGAGCGAGAAA -AAACCTGAGCGATAGC -AAACCTGAGCGATATA -AAACCTGAGCGATCCC -AAACCTGAGCGATGAC -AAACCTGAGCGATTCT -AAACCTGAGCGCCTCA -AAACCTGAGCGCCTTG -AAACCTGAGCGCTCCA -AAACCTGAGCGCTTAT -AAACCTGAGCGGATCA -AAACCTGAGCGGCTTC -AAACCTGAGCGTAATA -AAACCTGAGCGTAGTG -AAACCTGAGCGTCAAG -AAACCTGAGCGTCTAT 
-AAACCTGAGCGTGAAC -AAACCTGAGCGTGAGT -AAACCTGAGCGTGTCC -AAACCTGAGCGTTCCG -AAACCTGAGCGTTGCC -AAACCTGAGCGTTTAC -AAACCTGAGCTAAACA -AAACCTGAGCTAACAA -AAACCTGAGCTAACTC -AAACCTGAGCTAAGAT -AAACCTGAGCTACCGC -AAACCTGAGCTACCTA -AAACCTGAGCTAGCCC -AAACCTGAGCTAGGCA -AAACCTGAGCTAGTCT -AAACCTGAGCTAGTGG -AAACCTGAGCTAGTTC -AAACCTGAGCTATGCT -AAACCTGAGCTCAACT -AAACCTGAGCTCCCAG -AAACCTGAGCTCCTCT -AAACCTGAGCTCCTTC -AAACCTGAGCTCTCGG -AAACCTGAGCTGAAAT -AAACCTGAGCTGAACG -AAACCTGAGCTGATAA -AAACCTGAGCTGCAAG -AAACCTGAGCTGCCCA -AAACCTGAGCTGCGAA -AAACCTGAGCTGGAAC -AAACCTGAGCTGTCTA -AAACCTGAGCTGTTCA -AAACCTGAGCTTATCG -AAACCTGAGCTTCGCG -AAACCTGAGCTTTGGT -AAACCTGAGGAACTGC -AAACCTGAGGAATCGC -AAACCTGAGGAATGGA -AAACCTGAGGAATTAC -AAACCTGAGGACACCA -AAACCTGAGGACAGAA -AAACCTGAGGACAGCT -AAACCTGAGGACATTA -AAACCTGAGGACCACA -AAACCTGAGGACGAAA -AAACCTGAGGACTGGT -AAACCTGAGGAGCGAG -AAACCTGAGGAGCGTT -AAACCTGAGGAGTACC -AAACCTGAGGAGTAGA -AAACCTGAGGAGTCTG -AAACCTGAGGAGTTGC -AAACCTGAGGAGTTTA -AAACCTGAGGATATAC -AAACCTGAGGATCGCA -AAACCTGAGGATGCGT -AAACCTGAGGATGGAA -AAACCTGAGGATGGTC -AAACCTGAGGATGTAT -AAACCTGAGGATTCGG -AAACCTGAGGCAAAGA -AAACCTGAGGCAATTA -AAACCTGAGGCACATG -AAACCTGAGGCAGGTT -AAACCTGAGGCAGTCA -AAACCTGAGGCATGGT -AAACCTGAGGCATGTG -AAACCTGAGGCATTGG -AAACCTGAGGCCATAG -AAACCTGAGGCCCGTT -AAACCTGAGGCCCTCA -AAACCTGAGGCCCTTG -AAACCTGAGGCCGAAT -AAACCTGAGGCGACAT -AAACCTGAGGCGATAC -AAACCTGAGGCGCTCT -AAACCTGAGGCGTACA -AAACCTGAGGCTACGA -AAACCTGAGGCTAGAC -AAACCTGAGGCTAGCA -AAACCTGAGGCTAGGT -AAACCTGAGGCTATCT -AAACCTGAGGCTCAGA -AAACCTGAGGCTCATT -AAACCTGAGGCTCTTA -AAACCTGAGGGAAACA -AAACCTGAGGGAACGG -AAACCTGAGGGAGTAA -AAACCTGAGGGATACC -AAACCTGAGGGATCTG -AAACCTGAGGGATGGG -AAACCTGAGGGCACTA -AAACCTGAGGGCATGT -AAACCTGAGGGCTCTC -AAACCTGAGGGCTTCC -AAACCTGAGGGCTTGA -AAACCTGAGGGTATCG -AAACCTGAGGGTCGAT -AAACCTGAGGGTCTCC -AAACCTGAGGGTGTGT -AAACCTGAGGGTGTTG -AAACCTGAGGGTTCCC -AAACCTGAGGGTTTCT -AAACCTGAGGTAAACT -AAACCTGAGGTACTCT -AAACCTGAGGTAGCCA -AAACCTGAGGTAGCTG -AAACCTGAGGTCATCT -AAACCTGAGGTCGGAT -AAACCTGAGGTGACCA -AAACCTGAGGTGATAT 
-AAACCTGAGGTGATTA -AAACCTGAGGTGCAAC -AAACCTGAGGTGCACA -AAACCTGAGGTGCTAG -AAACCTGAGGTGCTTT -AAACCTGAGGTGGGTT -AAACCTGAGGTGTGGT -AAACCTGAGGTGTTAA -AAACCTGAGGTTACCT -AAACCTGAGGTTCCTA -AAACCTGAGTAACCCT -AAACCTGAGTAAGTAC -AAACCTGAGTAATCCC -AAACCTGAGTACACCT -AAACCTGAGTACATGA -AAACCTGAGTACCGGA -AAACCTGAGTACGACG -AAACCTGAGTACGATA -AAACCTGAGTACGCCC -AAACCTGAGTACGCGA -AAACCTGAGTACGTAA -AAACCTGAGTACGTTC -AAACCTGAGTACTTGC -AAACCTGAGTAGATGT -AAACCTGAGTAGCCGA -AAACCTGAGTAGCGGT -AAACCTGAGTAGGCCA -AAACCTGAGTAGGTGC -AAACCTGAGTAGTGCG -AAACCTGAGTATCGAA -AAACCTGAGTATCTCG -AAACCTGAGTATGACA -AAACCTGAGTATTGGA -AAACCTGAGTCAAGCG -AAACCTGAGTCAAGGC -AAACCTGAGTCAATAG -AAACCTGAGTCACGCC -AAACCTGAGTCATCCA -AAACCTGAGTCATGCT -AAACCTGAGTCCAGGA -AAACCTGAGTCCATAC -AAACCTGAGTCCCACG -AAACCTGAGTCCGGTC -AAACCTGAGTCCGTAT -AAACCTGAGTCCTCCT -AAACCTGAGTCGAGTG -AAACCTGAGTCGATAA -AAACCTGAGTCGCCGT -AAACCTGAGTCGTACT -AAACCTGAGTCGTTTG -AAACCTGAGTCTCAAC -AAACCTGAGTCTCCTC -AAACCTGAGTCTCGGC -AAACCTGAGTCTTGCA -AAACCTGAGTGAACAT -AAACCTGAGTGAACGC -AAACCTGAGTGAAGAG -AAACCTGAGTGAAGTT -AAACCTGAGTGAATTG -AAACCTGAGTGACATA -AAACCTGAGTGACTCT -AAACCTGAGTGATCGG -AAACCTGAGTGCAAGC -AAACCTGAGTGCCAGA -AAACCTGAGTGCCATT -AAACCTGAGTGCGATG -AAACCTGAGTGCGTGA -AAACCTGAGTGCTGCC -AAACCTGAGTGGACGT -AAACCTGAGTGGAGAA -AAACCTGAGTGGAGTC -AAACCTGAGTGGCACA -AAACCTGAGTGGGATC -AAACCTGAGTGGGCTA -AAACCTGAGTGGGTTG -AAACCTGAGTGGTAAT -AAACCTGAGTGGTAGC -AAACCTGAGTGGTCCC -AAACCTGAGTGTACCT -AAACCTGAGTGTACGG -AAACCTGAGTGTACTC -AAACCTGAGTGTCCAT -AAACCTGAGTGTCCCG -AAACCTGAGTGTCTCA -AAACCTGAGTGTGAAT -AAACCTGAGTGTGGCA -AAACCTGAGTGTTAGA -AAACCTGAGTGTTGAA -AAACCTGAGTGTTTGC -AAACCTGAGTTAACGA -AAACCTGAGTTAAGTG -AAACCTGAGTTACCCA -AAACCTGAGTTACGGG -AAACCTGAGTTAGCGG -AAACCTGAGTTAGGTA -AAACCTGAGTTATCGC -AAACCTGAGTTCCACA -AAACCTGAGTTCGATC -AAACCTGAGTTCGCAT -AAACCTGAGTTCGCGC -AAACCTGAGTTGAGAT -AAACCTGAGTTGAGTA -AAACCTGAGTTGCAGG -AAACCTGAGTTGTAGA -AAACCTGAGTTGTCGT -AAACCTGAGTTTAGGA -AAACCTGAGTTTCCTT -AAACCTGAGTTTGCGT -AAACCTGCAAACAACA -AAACCTGCAAACCCAT -AAACCTGCAAACCTAC 
-AAACCTGCAAACGCGA -AAACCTGCAAACGTGG -AAACCTGCAAACTGCT -AAACCTGCAAACTGTC -AAACCTGCAAAGAATC -AAACCTGCAAAGCAAT -AAACCTGCAAAGCGGT -AAACCTGCAAAGGAAG -AAACCTGCAAAGGCGT -AAACCTGCAAAGGTGC -AAACCTGCAAAGTCAA -AAACCTGCAAAGTGCG -AAACCTGCAAATACAG -AAACCTGCAAATCCGT -AAACCTGCAAATTGCC -AAACCTGCAACAACCT -AAACCTGCAACACCCG -AAACCTGCAACACCTA -AAACCTGCAACACGCC -AAACCTGCAACCGCCA -AAACCTGCAACGATCT -AAACCTGCAACGATGG -AAACCTGCAACGCACC -AAACCTGCAACTGCGC -AAACCTGCAACTGCTA -AAACCTGCAACTGGCC -AAACCTGCAACTTGAC -AAACCTGCAAGAAAGG -AAACCTGCAAGAAGAG -AAACCTGCAAGACACG -AAACCTGCAAGACGTG -AAACCTGCAAGAGGCT -AAACCTGCAAGAGTCG -AAACCTGCAAGCCATT -AAACCTGCAAGCCCAC -AAACCTGCAAGCCGCT -AAACCTGCAAGCCGTC -AAACCTGCAAGCCTAT -AAACCTGCAAGCGAGT -AAACCTGCAAGCGATG -AAACCTGCAAGCGCTC -AAACCTGCAAGCGTAG -AAACCTGCAAGCTGAG -AAACCTGCAAGCTGGA -AAACCTGCAAGCTGTT -AAACCTGCAAGGACAC -AAACCTGCAAGGACTG -AAACCTGCAAGGCTCC -AAACCTGCAAGGGTCA -AAACCTGCAAGGTGTG -AAACCTGCAAGGTTCT -AAACCTGCAAGGTTTC -AAACCTGCAAGTAATG -AAACCTGCAAGTACCT -AAACCTGCAAGTAGTA -AAACCTGCAAGTCATC -AAACCTGCAAGTCTAC -AAACCTGCAAGTCTGT -AAACCTGCAAGTTAAG -AAACCTGCAAGTTCTG -AAACCTGCAAGTTGTC -AAACCTGCAATAACGA -AAACCTGCAATAAGCA -AAACCTGCAATACGCT -AAACCTGCAATAGAGT -AAACCTGCAATAGCAA -AAACCTGCAATAGCGG -AAACCTGCAATCACAC -AAACCTGCAATCAGAA -AAACCTGCAATCCAAC -AAACCTGCAATCCGAT -AAACCTGCAATCGAAA -AAACCTGCAATCGGTT -AAACCTGCAATCTACG -AAACCTGCAATCTGCA -AAACCTGCAATGAAAC -AAACCTGCAATGAATG -AAACCTGCAATGACCT -AAACCTGCAATGCCAT -AAACCTGCAATGGAAT -AAACCTGCAATGGACG -AAACCTGCAATGGAGC -AAACCTGCAATGGATA -AAACCTGCAATGGTCT -AAACCTGCAATGTAAG -AAACCTGCAATGTTGC -AAACCTGCAATTCCTT -AAACCTGCAATTGCTG -AAACCTGCACAACGCC -AAACCTGCACAACGTT -AAACCTGCACAACTGT -AAACCTGCACAAGACG -AAACCTGCACAAGCCC -AAACCTGCACAAGTAA -AAACCTGCACACAGAG -AAACCTGCACACATGT -AAACCTGCACACCGAC -AAACCTGCACACCGCA -AAACCTGCACACGCTG -AAACCTGCACACTGCG -AAACCTGCACAGACAG -AAACCTGCACAGACTT -AAACCTGCACAGAGGT -AAACCTGCACAGATTC -AAACCTGCACAGCCCA -AAACCTGCACAGCGTC -AAACCTGCACAGGAGT -AAACCTGCACAGGCCT -AAACCTGCACAGGTTT -AAACCTGCACAGTCGC -AAACCTGCACATAACC 
-AAACCTGCACATCCAA -AAACCTGCACATCCGG -AAACCTGCACATCTTT -AAACCTGCACATGACT -AAACCTGCACATGGGA -AAACCTGCACATGTGT -AAACCTGCACATTAGC -AAACCTGCACATTCGA -AAACCTGCACATTTCT -AAACCTGCACCAACCG -AAACCTGCACCACCAG -AAACCTGCACCACGTG -AAACCTGCACCAGATT -AAACCTGCACCAGCAC -AAACCTGCACCAGGCT -AAACCTGCACCAGGTC -AAACCTGCACCAGTTA -AAACCTGCACCATCCT -AAACCTGCACCATGTA -AAACCTGCACCCAGTG -AAACCTGCACCCATGG -AAACCTGCACCCATTC -AAACCTGCACCCTATC -AAACCTGCACCGAAAG -AAACCTGCACCGAATT -AAACCTGCACCGATAT -AAACCTGCACCGCTAG -AAACCTGCACCGGAAA -AAACCTGCACCGTTGG -AAACCTGCACCTATCC -AAACCTGCACCTCGGA -AAACCTGCACCTCGTT -AAACCTGCACCTGGTG -AAACCTGCACCTTGTC -AAACCTGCACGAAACG -AAACCTGCACGAAAGC -AAACCTGCACGAAATA -AAACCTGCACGAAGCA -AAACCTGCACGACGAA -AAACCTGCACGACTCG -AAACCTGCACGAGAGT -AAACCTGCACGAGGTA -AAACCTGCACGCATCG -AAACCTGCACGCCAGT -AAACCTGCACGCGAAA -AAACCTGCACGCTTTC -AAACCTGCACGGACAA -AAACCTGCACGGATAG -AAACCTGCACGGCCAT -AAACCTGCACGGCGTT -AAACCTGCACGGCTAC -AAACCTGCACGGTAAG -AAACCTGCACGGTAGA -AAACCTGCACGGTGTC -AAACCTGCACGGTTTA -AAACCTGCACGTAAGG -AAACCTGCACGTCAGC -AAACCTGCACGTCTCT -AAACCTGCACGTGAGA -AAACCTGCACGTTGGC -AAACCTGCACTAAGTC -AAACCTGCACTACAGT -AAACCTGCACTAGTAC -AAACCTGCACTATCTT -AAACCTGCACTCAGGC -AAACCTGCACTCGACG -AAACCTGCACTCTGTC -AAACCTGCACTGAAGG -AAACCTGCACTGCCAG -AAACCTGCACTGTCGG -AAACCTGCACTGTGTA -AAACCTGCACTGTTAG -AAACCTGCACTTAACG -AAACCTGCACTTAAGC -AAACCTGCACTTACGA -AAACCTGCACTTCGAA -AAACCTGCACTTCTGC -AAACCTGCACTTGGAT -AAACCTGCAGAAGCAC -AAACCTGCAGACAAAT -AAACCTGCAGACAAGC -AAACCTGCAGACACTT -AAACCTGCAGACAGGT -AAACCTGCAGACGCAA -AAACCTGCAGACGCCT -AAACCTGCAGACGCTC -AAACCTGCAGACGTAG -AAACCTGCAGACTCGC -AAACCTGCAGAGCCAA -AAACCTGCAGAGTGTG -AAACCTGCAGATAATG -AAACCTGCAGATCCAT -AAACCTGCAGATCGGA -AAACCTGCAGATCTGT -AAACCTGCAGATGAGC -AAACCTGCAGATGGCA -AAACCTGCAGATGGGT -AAACCTGCAGATTGCT -AAACCTGCAGCAGTTT -AAACCTGCAGCATACT -AAACCTGCAGCATGAG -AAACCTGCAGCCAATT -AAACCTGCAGCCACCA -AAACCTGCAGCCAGAA -AAACCTGCAGCCTATA -AAACCTGCAGCCTGTG -AAACCTGCAGCCTTGG -AAACCTGCAGCCTTTC -AAACCTGCAGCGAACA -AAACCTGCAGCGATCC -AAACCTGCAGCGTAAG 
-AAACCTGCAGCGTCCA -AAACCTGCAGCGTTCG -AAACCTGCAGCTATTG -AAACCTGCAGCTCCGA -AAACCTGCAGCTCGAC -AAACCTGCAGCTCGCA -AAACCTGCAGCTGCAC -AAACCTGCAGCTGCTG -AAACCTGCAGCTGGCT -AAACCTGCAGCTGTAT -AAACCTGCAGCTGTGC -AAACCTGCAGCTGTTA -AAACCTGCAGCTTAAC -AAACCTGCAGCTTCGG -AAACCTGCAGGAACGT -AAACCTGCAGGAATCG -AAACCTGCAGGAATGC -AAACCTGCAGGACCCT -AAACCTGCAGGACGTA -AAACCTGCAGGATCGA -AAACCTGCAGGATTGG -AAACCTGCAGGCAGTA -AAACCTGCAGGCGATA -AAACCTGCAGGCTCAC -AAACCTGCAGGCTGAA -AAACCTGCAGGGAGAG -AAACCTGCAGGGATTG -AAACCTGCAGGGCATA -AAACCTGCAGGGTACA -AAACCTGCAGGGTATG -AAACCTGCAGGGTTAG -AAACCTGCAGGTCCAC -AAACCTGCAGGTCGTC -AAACCTGCAGGTCTCG -AAACCTGCAGGTGCCT -AAACCTGCAGGTGGAT -AAACCTGCAGGTTTCA -AAACCTGCAGTAACGG -AAACCTGCAGTAAGAT -AAACCTGCAGTAAGCG -AAACCTGCAGTACACT -AAACCTGCAGTAGAGC -AAACCTGCAGTATAAG -AAACCTGCAGTATCTG -AAACCTGCAGTATGCT -AAACCTGCAGTCACTA -AAACCTGCAGTCAGAG -AAACCTGCAGTCAGCC -AAACCTGCAGTCCTTC -AAACCTGCAGTCGATT -AAACCTGCAGTCGTGC -AAACCTGCAGTCTTCC -AAACCTGCAGTGACAG -AAACCTGCAGTGAGTG -AAACCTGCAGTGGAGT -AAACCTGCAGTGGGAT -AAACCTGCAGTTAACC -AAACCTGCAGTTCATG -AAACCTGCAGTTCCCT -AAACCTGCAGTTTACG -AAACCTGCATAAAGGT -AAACCTGCATAACCTG -AAACCTGCATAAGACA -AAACCTGCATACAGCT -AAACCTGCATACCATG -AAACCTGCATACGCCG -AAACCTGCATACGCTA -AAACCTGCATACTACG -AAACCTGCATACTCTT -AAACCTGCATAGAAAC -AAACCTGCATAGACTC -AAACCTGCATAGGATA -AAACCTGCATAGTAAG -AAACCTGCATATACCG -AAACCTGCATATACGC -AAACCTGCATATGAGA -AAACCTGCATATGCTG -AAACCTGCATATGGTC -AAACCTGCATCACAAC -AAACCTGCATCACCCT -AAACCTGCATCACGAT -AAACCTGCATCACGTA -AAACCTGCATCAGTAC -AAACCTGCATCAGTCA -AAACCTGCATCATCCC -AAACCTGCATCCAACA -AAACCTGCATCCCACT -AAACCTGCATCCCATC -AAACCTGCATCCGCGA -AAACCTGCATCCGGGT -AAACCTGCATCCGTGG -AAACCTGCATCCTAGA -AAACCTGCATCCTTGC -AAACCTGCATCGACGC -AAACCTGCATCGATGT -AAACCTGCATCGATTG -AAACCTGCATCGGAAG -AAACCTGCATCGGACC -AAACCTGCATCGGGTC -AAACCTGCATCGGTTA -AAACCTGCATCGTCGG -AAACCTGCATCTACGA -AAACCTGCATCTATGG -AAACCTGCATCTCCCA -AAACCTGCATCTCGCT -AAACCTGCATCTGGTA -AAACCTGCATGAACCT -AAACCTGCATGAAGTA -AAACCTGCATGACATC -AAACCTGCATGACGGA -AAACCTGCATGAGCGA 
-AAACCTGCATGATCCA -AAACCTGCATGCAACT -AAACCTGCATGCAATC -AAACCTGCATGCATGT -AAACCTGCATGCCACG -AAACCTGCATGCCCGA -AAACCTGCATGCCTAA -AAACCTGCATGCCTTC -AAACCTGCATGCGCAC -AAACCTGCATGCTAGT -AAACCTGCATGCTGGC -AAACCTGCATGGAATA -AAACCTGCATGGATGG -AAACCTGCATGGGAAC -AAACCTGCATGGGACA -AAACCTGCATGGTAGG -AAACCTGCATGGTCAT -AAACCTGCATGGTCTA -AAACCTGCATGGTTGT -AAACCTGCATGTAAGA -AAACCTGCATGTAGTC -AAACCTGCATGTCCTC -AAACCTGCATGTCGAT -AAACCTGCATGTCTCC -AAACCTGCATGTTCCC -AAACCTGCATGTTGAC -AAACCTGCATTAACCG -AAACCTGCATTACCTT -AAACCTGCATTACGAC -AAACCTGCATTAGCCA -AAACCTGCATTAGGCT -AAACCTGCATTATCTC -AAACCTGCATTCACTT -AAACCTGCATTCCTCG -AAACCTGCATTCCTGC -AAACCTGCATTCGACA -AAACCTGCATTCTCAT -AAACCTGCATTCTTAC -AAACCTGCATTGAGCT -AAACCTGCATTGCGGC -AAACCTGCATTGGCGC -AAACCTGCATTGGGCC -AAACCTGCATTGGTAC -AAACCTGCATTGTGCA -AAACCTGCATTTCACT -AAACCTGCATTTCAGG -AAACCTGCATTTGCCC -AAACCTGCATTTGCTT -AAACCTGGTAAACACA -AAACCTGGTAAACCTC -AAACCTGGTAAACGCG -AAACCTGGTAAAGGAG -AAACCTGGTAAAGTCA -AAACCTGGTAAATACG -AAACCTGGTAAATGAC -AAACCTGGTAAATGTG -AAACCTGGTAACGACG -AAACCTGGTAACGCGA -AAACCTGGTAACGTTC -AAACCTGGTAAGAGAG -AAACCTGGTAAGAGGA -AAACCTGGTAAGCACG -AAACCTGGTAAGGATT -AAACCTGGTAAGGGAA -AAACCTGGTAAGGGCT -AAACCTGGTAAGTAGT -AAACCTGGTAAGTGGC -AAACCTGGTAAGTGTA -AAACCTGGTAAGTTCC -AAACCTGGTAATAGCA -AAACCTGGTAATCACC -AAACCTGGTAATCGTC -AAACCTGGTAATTGGA -AAACCTGGTACAAGTA -AAACCTGGTACACCGC -AAACCTGGTACAGACG -AAACCTGGTACAGCAG -AAACCTGGTACAGTGG -AAACCTGGTACAGTTC -AAACCTGGTACATCCA -AAACCTGGTACATGTC -AAACCTGGTACCAGTT -AAACCTGGTACCATCA -AAACCTGGTACCCAAT -AAACCTGGTACCGAGA -AAACCTGGTACCGCTG -AAACCTGGTACCGGCT -AAACCTGGTACCGTAT -AAACCTGGTACCGTTA -AAACCTGGTACCTACA -AAACCTGGTACGAAAT -AAACCTGGTACGACCC -AAACCTGGTACGCACC -AAACCTGGTACGCTGC -AAACCTGGTACTCAAC -AAACCTGGTACTCGCG -AAACCTGGTACTCTCC -AAACCTGGTACTTAGC -AAACCTGGTACTTCTT -AAACCTGGTACTTGAC -AAACCTGGTAGAAAGG -AAACCTGGTAGAAGGA -AAACCTGGTAGAGCTG -AAACCTGGTAGAGGAA -AAACCTGGTAGAGTGC -AAACCTGGTAGATTAG -AAACCTGGTAGCAAAT -AAACCTGGTAGCACGA -AAACCTGGTAGCCTAT -AAACCTGGTAGCCTCG -AAACCTGGTAGCGATG 
-AAACCTGGTAGCGCAA -AAACCTGGTAGCGCTC -AAACCTGGTAGCGTAG -AAACCTGGTAGCGTCC -AAACCTGGTAGCGTGA -AAACCTGGTAGCTAAA -AAACCTGGTAGCTCCG -AAACCTGGTAGCTGCC -AAACCTGGTAGCTTGT -AAACCTGGTAGGACAC -AAACCTGGTAGGAGTC -AAACCTGGTAGGCATG -AAACCTGGTAGGCTGA -AAACCTGGTAGGGACT -AAACCTGGTAGGGTAC -AAACCTGGTAGTACCT -AAACCTGGTAGTAGTA -AAACCTGGTAGTGAAT -AAACCTGGTATAAACG -AAACCTGGTATAATGG -AAACCTGGTATAGGGC -AAACCTGGTATAGGTA -AAACCTGGTATAGTAG -AAACCTGGTATATCCG -AAACCTGGTATATGAG -AAACCTGGTATATGGA -AAACCTGGTATCACCA -AAACCTGGTATCAGTC -AAACCTGGTATCGCAT -AAACCTGGTATCTGCA -AAACCTGGTATGAAAC -AAACCTGGTATGAATG -AAACCTGGTATGCTTG -AAACCTGGTATGGTTC -AAACCTGGTATTACCG -AAACCTGGTATTAGCC -AAACCTGGTATTCGTG -AAACCTGGTATTCTCT -AAACCTGGTCAAACTC -AAACCTGGTCAAAGAT -AAACCTGGTCAAAGCG -AAACCTGGTCAACATC -AAACCTGGTCAACTGT -AAACCTGGTCAAGCGA -AAACCTGGTCAATACC -AAACCTGGTCAATGTC -AAACCTGGTCACAAGG -AAACCTGGTCACACGC -AAACCTGGTCACCCAG -AAACCTGGTCACCTAA -AAACCTGGTCACTGGC -AAACCTGGTCACTTCC -AAACCTGGTCAGAAGC -AAACCTGGTCAGAATA -AAACCTGGTCAGAGGT -AAACCTGGTCAGATAA -AAACCTGGTCAGCTAT -AAACCTGGTCAGGACA -AAACCTGGTCAGTGGA -AAACCTGGTCATACTG -AAACCTGGTCATATCG -AAACCTGGTCATATGC -AAACCTGGTCATCCCT -AAACCTGGTCATCGGC -AAACCTGGTCATGCAT -AAACCTGGTCATGCCG -AAACCTGGTCATTAGC -AAACCTGGTCCAACTA -AAACCTGGTCCAAGTT -AAACCTGGTCCAGTAT -AAACCTGGTCCAGTGC -AAACCTGGTCCAGTTA -AAACCTGGTCCATCCT -AAACCTGGTCCATGAT -AAACCTGGTCCCGACA -AAACCTGGTCCCTACT -AAACCTGGTCCCTTGT -AAACCTGGTCCGAACC -AAACCTGGTCCGAAGA -AAACCTGGTCCGAATT -AAACCTGGTCCGACGT -AAACCTGGTCCGAGTC -AAACCTGGTCCGCTGA -AAACCTGGTCCGTCAG -AAACCTGGTCCGTGAC -AAACCTGGTCCGTTAA -AAACCTGGTCCTAGCG -AAACCTGGTCCTCCAT -AAACCTGGTCCTCTTG -AAACCTGGTCCTGCTT -AAACCTGGTCGAAAGC -AAACCTGGTCGAACAG -AAACCTGGTCGAATCT -AAACCTGGTCGACTAT -AAACCTGGTCGACTGC -AAACCTGGTCGAGATG -AAACCTGGTCGAGTTT -AAACCTGGTCGATTGT -AAACCTGGTCGCATAT -AAACCTGGTCGCATCG -AAACCTGGTCGCCATG -AAACCTGGTCGCGAAA -AAACCTGGTCGCGGTT -AAACCTGGTCGCGTGT -AAACCTGGTCGCTTCT -AAACCTGGTCGCTTTC -AAACCTGGTCGGATCC -AAACCTGGTCGGCACT -AAACCTGGTCGGCATC -AAACCTGGTCGGCTCA -AAACCTGGTCGGGTCT 
-AAACCTGGTCGTCTTC -AAACCTGGTCGTGGCT -AAACCTGGTCGTTGTA -AAACCTGGTCTAAACC -AAACCTGGTCTAAAGA -AAACCTGGTCTAACGT -AAACCTGGTCTACCTC -AAACCTGGTCTAGAGG -AAACCTGGTCTAGCCG -AAACCTGGTCTAGCGC -AAACCTGGTCTAGGTT -AAACCTGGTCTAGTCA -AAACCTGGTCTAGTGT -AAACCTGGTCTCAACA -AAACCTGGTCTCACCT -AAACCTGGTCTCATCC -AAACCTGGTCTCCACT -AAACCTGGTCTCCATC -AAACCTGGTCTCCCTA -AAACCTGGTCTCGTTC -AAACCTGGTCTCTCGT -AAACCTGGTCTCTCTG -AAACCTGGTCTCTTAT -AAACCTGGTCTCTTTA -AAACCTGGTCTGATCA -AAACCTGGTCTGATTG -AAACCTGGTCTGCAAT -AAACCTGGTCTGCCAG -AAACCTGGTCTGCGGT -AAACCTGGTCTGGAGA -AAACCTGGTCTGGTCG -AAACCTGGTCTTCAAG -AAACCTGGTCTTCGTC -AAACCTGGTCTTCTCG -AAACCTGGTCTTGATG -AAACCTGGTCTTGCGG -AAACCTGGTCTTGTCC -AAACCTGGTCTTTCAT -AAACCTGGTGAAAGAG -AAACCTGGTGAAATCA -AAACCTGGTGAACCTT -AAACCTGGTGAAGGCT -AAACCTGGTGACAAAT -AAACCTGGTGACCAAG -AAACCTGGTGACGCCT -AAACCTGGTGACGGTA -AAACCTGGTGACTACT -AAACCTGGTGACTCAT -AAACCTGGTGAGCGAT -AAACCTGGTGAGGCTA -AAACCTGGTGAGGGAG -AAACCTGGTGAGGGTT -AAACCTGGTGAGTATA -AAACCTGGTGAGTGAC -AAACCTGGTGATAAAC -AAACCTGGTGATAAGT -AAACCTGGTGATGATA -AAACCTGGTGATGCCC -NAGGTGCCAGACACTT diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/cell-gene-umi-queryname-sorted.bam b/tools/scripts/sctools/build/lib/sctools/test/data/cell-gene-umi-queryname-sorted.bam deleted file mode 100644 index f14155a7..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/cell-gene-umi-queryname-sorted.bam and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/cell-sorted-missing-cb.bam b/tools/scripts/sctools/build/lib/sctools/test/data/cell-sorted-missing-cb.bam deleted file mode 100644 index 88d2b057..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/cell-sorted-missing-cb.bam and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/cell-sorted.bam b/tools/scripts/sctools/build/lib/sctools/test/data/cell-sorted.bam deleted file mode 100644 index b76e76c3..00000000 Binary files 
a/tools/scripts/sctools/build/lib/sctools/test/data/cell-sorted.bam and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/cell_metrics_missing_cb.csv.gz b/tools/scripts/sctools/build/lib/sctools/test/data/cell_metrics_missing_cb.csv.gz deleted file mode 100644 index 20a433db..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/cell_metrics_missing_cb.csv.gz and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/chr1.30k_records.gtf.gz b/tools/scripts/sctools/build/lib/sctools/test/data/chr1.30k_records.gtf.gz deleted file mode 100644 index 36e6f0fa..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/chr1.30k_records.gtf.gz and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/expected_picard_group.csv b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/expected_picard_group.csv deleted file mode 100644 index 6b97c599..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/expected_picard_group.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_ALIGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.S
ECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_BASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE 
-Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMe
trics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics -test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2.csv b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2.csv deleted file mode 100644 index 17418654..00000000 --- 
a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2.csv +++ /dev/null @@ -1,3 +0,0 @@ -,Aligned 0 time,Aligned 1 time,Aligned >1 times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total unpaired reads -Class,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G -test,478,240,106,4414,652,412,1,95.64%,5479,824 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log deleted file mode 100644 index 982a1b65..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log +++ /dev/null @@ -1,11 +0,0 @@ -HISAT2 summary stats: - Total pairs: 5479 - Aligned concordantly or discordantly 0 time: 412 (7.52%) - Aligned concordantly 1 time: 4414 (80.56%) - Aligned concordantly >1 times: 652 (11.90%) - Aligned discordantly 1 time: 1 (0.02%) - Total unpaired reads: 824 - Aligned 0 time: 478 (58.01%) - Aligned 1 time: 240 (29.13%) - Aligned >1 times: 106 (12.86%) - Overall alignment rate: 95.64% diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_trans.csv b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_trans.csv deleted file mode 100644 index e484efef..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_trans.csv +++ /dev/null @@ -1,3 +0,0 @@ -,Aligned 0 time,Aligned 1 time,Aligned >1 times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total unpaired reads -Class,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T -test,7270,0,0,360,1484,3635,0,33.66%,5479,7270 diff --git 
a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log deleted file mode 100644 index 099ace2d..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log +++ /dev/null @@ -1,11 +0,0 @@ -HISAT2 summary stats: - Total pairs: 5479 - Aligned concordantly or discordantly 0 time: 3635 (66.34%) - Aligned concordantly 1 time: 360 (6.57%) - Aligned concordantly >1 times: 1484 (27.09%) - Aligned discordantly 1 time: 0 (0.00%) - Total unpaired reads: 7270 - Aligned 0 time: 7270 (100.00%) - Aligned 1 time: 0 (0.00%) - Aligned >1 times: 0 (0.00%) - Overall alignment rate: 33.66% diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_picard_group.csv b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_picard_group.csv deleted file mode 100644 index 6b97c599..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_picard_group.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_ALIGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.S
ECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_BASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE 
-Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMe
trics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics -test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt deleted file mode 100644 index a1828311..00000000 --- 
a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt +++ /dev/null @@ -1,12 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.AlignmentSummaryMetrics -CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP -FIRST_OF_PAIR 5479 5479 1 0 5256 0.959299 131124 4650 116063 115095 0 0.0009 0.000922 0.000069 25 5080 0.966514 190 0.036149 0 0.494292 0.006141 0 -SECOND_OF_PAIR 5479 5479 1 0 
5224 0.953459 130281 4629 115487 114015 0 0.000852 0.000849 0.000038 25 5080 0.972435 158 0.030245 0 0.508806 0.006165 0 -PAIR 10958 10958 1 0 10480 0.956379 261405 9279 231550 229110 0 0.000876 0.000885 0.000054 25 10160 0.969466 348 0.033206 0 0.501527 0.006153 0 - - diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt deleted file mode 100644 index c4f38f09..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt +++ /dev/null @@ -1,8 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam] OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.MarkDuplicated.bam METRICS_FILE=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:51:46 UTC 2018 - -## METRICS CLASS picard.sam.DuplicationMetrics -LIBRARY 
UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE -d20fb2dd-3d98-4516-a648-dee5e1917bd7 320 5080 4393 478 33 21 0 0.007156 612743 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt deleted file mode 100644 index c0050359..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt +++ /dev/null @@ -1,15 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics -REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE -A 
C A>C 231512 16 0.000069 -A G A>G 231512 156 0.000673 -A T A>T 231512 16 0.000069 -C A C>A 173880 16 0.000092 -C G C>G 173880 14 0.000081 -C T C>T 173880 82 0.000471 - - diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt deleted file mode 100644 index 934a84ba..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt +++ /dev/null @@ -1,10 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.GcBiasSummaryMetrics -ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP -All Reads 
ALL 100 7701 14873 10.733266 1.82225 0.112713 0.817807 1.086361 2.181453 0.143318 - - diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt deleted file mode 100644 index 160eb300..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt +++ /dev/null @@ -1,8 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.InsertSizeMetrics -MEDIAN_INSERT_SIZE MEDIAN_ABSOLUTE_DEVIATION MIN_INSERT_SIZE MAX_INSERT_SIZE MEAN_INSERT_SIZE STANDARD_DEVIATION READ_PAIRS PAIR_ORIENTATION WIDTH_OF_10_PERCENT WIDTH_OF_20_PERCENT WIDTH_OF_30_PERCENT WIDTH_OF_40_PERCENT WIDTH_OF_50_PERCENT WIDTH_OF_60_PERCENT WIDTH_OF_70_PERCENT 
WIDTH_OF_80_PERCENT WIDTH_OF_90_PERCENT WIDTH_OF_99_PERCENT SAMPLE LIBRARY READ_GROUP -191 63 33 2725787 207.219528 106.256303 5067 FR 25 49 73 99 127 157 195 267 641 87835 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.rna_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.rna_metrics.txt deleted file mode 100644 index f7a52c62..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_qc.rna_metrics.txt +++ /dev/null @@ -1,8 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectRnaSeqMetrics REF_FLAT=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:51:55 UTC 2018 - -## METRICS CLASS picard.analysis.RnaSeqMetrics -PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS 
PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP -273950 261405 0 56934 37664 101238 65569 0 0 0 719 795 60 0.474901 0.525099 0 0.2178 0.144083 0.387284 0.250833 0.361883 0.345311 0 0.939679 0.680576 0.705663 0.496023 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_rsem.cnt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_rsem.cnt deleted file mode 100644 index 3ee8b723..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_rsem.cnt +++ /dev/null @@ -1,15 +0,0 @@ -3635 1844 0 5479 -1652 192 1484 -6599 3 -0 3635 -1 360 -2 327 -3 416 -4 243 -5 185 -6 85 -7 76 -8 53 -9 16 -10 83 -Inf 0 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_rsem.csv b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_rsem.csv deleted file mode 100644 index fc0afb09..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics/test_rsem.csv +++ /dev/null @@ -1,3 +0,0 @@ -,alignable reads,filtered reads,multiple mapped,strand,total alignments,total reads,unalignable reads,uncertain reads,unique aligned -Class,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM -test,1844,0,192,3,6599,5479,3635,1484,1652 diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt deleted file mode 100644 index 1559f3e7..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt +++ /dev/null @@ -1,10 +0,0 @@ -## 
htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:18 UTC 2019 - -## METRICS CLASS picard.analysis.AlignmentSummaryMetrics -CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP -UNPAIRED 1086652 1086652 1 0 770963 0.709485 38213614 697232 34613985 34073804 0 0.002624 0.002357 0.000149 50 0 0 0 0 0 0.501303 0 0.000027 - - diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt deleted file mode 100644 index 
661fa797..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt +++ /dev/null @@ -1,10 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectDuplicationMetrics/inputs/-1585165421/SRR6258488_qc.bam] OUTPUT=SRR6258488_qc.MarkDuplicated.bam METRICS_FILE=SRR6258488_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:17 UTC 2019 - -## METRICS CLASS picard.sam.DuplicationMetrics -LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE -SRR6258488 770963 0 473100 315689 345396 0 0 0.448006 - - diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt deleted file mode 100644 index 26669a77..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt 
+++ /dev/null @@ -1,10 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:18 UTC 2019 - -## METRICS CLASS picard.analysis.GcBiasSummaryMetrics -ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP -All Reads ALL 100 1559752 1244063 13.760859 1.1878 0.219754 0.753171 1.281724 0.883386 0.021428 - - diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt b/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt deleted file mode 100644 index 43831064..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt +++ /dev/null @@ -1,113 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectRnaSeqMetrics 
REF_FLAT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=SRR6258488_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-1585165421/SRR6258488_qc.bam OUTPUT=SRR6258488_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:18 UTC 2019 - -## METRICS CLASS picard.analysis.RnaSeqMetrics -PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP -54332600 38213614 0 371628 1152265 18630585 18059136 0 0 0 12352 12891 538 0.489324 0.510676 0 0.009725 0.030153 0.487538 0.472584 0.039878 0.028047 0 2.183917 0 0 0 - -## HISTOGRAM java.lang.Integer -normalized_position All_Reads.normalized_coverage -0 1.252653 -1 1.146108 -2 1.065068 -3 1.122433 -4 1.234516 -5 1.247113 -6 1.2191 -7 1.08917 -8 1.101883 -9 1.130302 -10 
1.082888 -11 1.146879 -12 1.173149 -13 1.084206 -14 1.035169 -15 1.169359 -16 1.278125 -17 1.298059 -18 1.418038 -19 1.468055 -20 1.306559 -21 1.210198 -22 0.953958 -23 0.806139 -24 0.815513 -25 0.887045 -26 0.763414 -27 0.737914 -28 0.702678 -29 0.689913 -30 0.633512 -31 0.665368 -32 0.682949 -33 0.848599 -34 0.941722 -35 1.082228 -36 1.113449 -37 1.049003 -38 0.97788 -39 0.989931 -40 0.92986 -41 0.874432 -42 0.87788 -43 0.868871 -44 0.92942 -45 1.015775 -46 1.070114 -47 1.023889 -48 1.023103 -49 0.988576 -50 0.931694 -51 0.794716 -52 0.765784 -53 0.721218 -54 0.723223 -55 0.711507 -56 0.704034 -57 0.694139 -58 0.741844 -59 0.831505 -60 0.806244 -61 0.869419 -62 0.987354 -63 0.954176 -64 0.925553 -65 0.951851 -66 0.906269 -67 0.85666 -68 0.985052 -69 0.947861 -70 0.98528 -71 0.873541 -72 0.87925 -73 0.956294 -74 1.137028 -75 1.206313 -76 1.148145 -77 1.159051 -78 1.207689 -79 1.170334 -80 1.199969 -81 1.391121 -82 1.243649 -83 1.235795 -84 1.227105 -85 1.278662 -86 1.298065 -87 1.201038 -88 1.2361 -89 1.098932 -90 1.042881 -91 1.037875 -92 0.95545 -93 0.969215 -94 1.059149 -95 0.857316 -96 0.792585 -97 0.817511 -98 0.880909 -99 0.786114 -100 0.548663 - diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/small-cell-sorted.bam b/tools/scripts/sctools/build/lib/sctools/test/data/small-cell-sorted.bam deleted file mode 100644 index a44c1ff9..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/small-cell-sorted.bam and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/small-gene-sorted.bam b/tools/scripts/sctools/build/lib/sctools/test/data/small-gene-sorted.bam deleted file mode 100644 index 9773d658..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/small-gene-sorted.bam and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/test.bam b/tools/scripts/sctools/build/lib/sctools/test/data/test.bam deleted file mode 100644 index 75db9802..00000000 Binary 
files a/tools/scripts/sctools/build/lib/sctools/test/data/test.bam and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf b/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf deleted file mode 100644 index 79561f37..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf +++ /dev/null @@ -1,109 +0,0 @@ -# truncated chromosome 19 genome used for testing util package ONLY -# created Aug 22, 2017 by Ambrose J Carr -chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8"; -chr19 HAVANA transcript 60951 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA exon 70928 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 1; exon_id "ENSE00003781173.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA exon 66346 66499 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA exon 60951 61894 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 3; exon_id "ENSE00003783010.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA transcript 62113 66524 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633719.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-009"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475086.2"; -chr19 HAVANA exon 62113 66524 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633719.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-009"; exon_number 1; exon_id "ENSE00003783013.1"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475086.2"; -chr19 HAVANA transcript 63821 70951 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA exon 70928 70951 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; exon_number 1; exon_id "ENSE00003782721.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA exon 66346 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA exon 63821 64213 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; exon_number 3; exon_id "ENSE00003781018.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA transcript 65051 66382 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000634023.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-011"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471182.4"; -chr19 HAVANA exon 66346 66382 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000634023.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-011"; exon_number 1; exon_id "ENSE00003778074.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471182.4"; -chr19 HAVANA exon 65051 65226 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000634023.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-011"; exon_number 2; exon_id "ENSE00003782150.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471182.4"; -chr19 HAVANA transcript 65822 66420 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632496.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-005"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475088.2"; -chr19 HAVANA exon 66346 66420 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632496.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-005"; exon_number 1; exon_id "ENSE00003780450.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475088.2"; -chr19 HAVANA exon 65822 66133 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632496.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-005"; exon_number 2; exon_id "ENSE00003782888.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475088.2"; -chr19 HAVANA transcript 65822 70945 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA exon 70928 70945 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; exon_number 1; exon_id "ENSE00003776564.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA exon 66346 66499 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA exon 65822 66047 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; exon_number 3; exon_id "ENSE00003779454.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA transcript 65822 70963 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631796.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-004"; level 2; transcript_support_level "2"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475089.2"; -chr19 HAVANA exon 70928 70963 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631796.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-004"; exon_number 1; exon_id "ENSE00003775509.1"; level 2; transcript_support_level "2"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475089.2"; -chr19 HAVANA exon 65822 66499 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000631796.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-004"; exon_number 2; exon_id "ENSE00003783427.1"; level 2; transcript_support_level "2"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475089.2"; -chr19 HAVANA transcript 66320 66492 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633742.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WASH5P-001"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000451423.2"; -chr19 HAVANA exon 66320 66492 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633742.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WASH5P-001"; exon_number 1; exon_id "ENSE00003779144.1"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000451423.2"; -chr19 HAVANA transcript 66378 71566 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631994.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-006"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475090.1"; -chr19 HAVANA exon 71141 71566 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000631994.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-006"; exon_number 1; exon_id "ENSE00003776913.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475090.1"; -chr19 HAVANA exon 66378 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631994.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-006"; exon_number 2; exon_id "ENSE00003775972.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475090.1"; -chr19 HAVANA transcript 70652 71626 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632292.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-012"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471218.1"; -chr19 HAVANA exon 70652 71626 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632292.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-012"; exon_number 1; exon_id "ENSE00003783546.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471218.1"; -chr19 HAVANA gene 68403 69178 . + . gene_id "ENSG00000282542.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "AC008993.2"; level 2; havana_gene "OTTHUMG00000180450.4"; -chr19 HAVANA transcript 68403 69178 . + . 
gene_id "ENSG00000282542.1"; transcript_id "ENST00000632280.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "AC008993.2"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "AC008993.2-001"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180450.4"; havana_transcript "OTTHUMT00000451405.4"; -chr19 HAVANA exon 68403 69178 . + . gene_id "ENSG00000282542.1"; transcript_id "ENST00000632280.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "AC008993.2"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "AC008993.2-001"; exon_number 1; exon_id "ENSE00003776314.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180450.4"; havana_transcript "OTTHUMT00000451405.4"; -chr19 HAVANA gene 69167 69972 . + . gene_id "ENSG00000282798.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "LLNLR-222A1.1"; level 2; havana_gene "OTTHUMG00000190399.1"; -chr19 HAVANA transcript 69167 69972 . + . gene_id "ENSG00000282798.1"; transcript_id "ENST00000631744.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "LLNLR-222A1.1"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "LLNLR-222A1.1-001"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190399.1"; havana_transcript "OTTHUMT00000484821.1"; -chr19 HAVANA exon 69167 69972 . + . gene_id "ENSG00000282798.1"; transcript_id "ENST00000631744.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "LLNLR-222A1.1"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "LLNLR-222A1.1-001"; exon_number 1; exon_id "ENSE00003780024.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190399.1"; havana_transcript "OTTHUMT00000484821.1"; -chr19 HAVANA gene 71778 72718 . + . gene_id "ENSG00000282807.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; level 2; tag "ncRNA_host"; havana_gene "OTTHUMG00000180451.3"; -chr19 HAVANA transcript 71778 72718 . + . 
gene_id "ENSG00000282807.1"; transcript_id "ENST00000633603.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC008993.3-001"; level 2; tag "basic"; transcript_support_level "5"; havana_gene "OTTHUMG00000180451.3"; havana_transcript "OTTHUMT00000451407.2"; -chr19 HAVANA exon 71778 72274 . + . gene_id "ENSG00000282807.1"; transcript_id "ENST00000633603.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC008993.3-001"; exon_number 1; exon_id "ENSE00003776113.1"; level 2; tag "basic"; transcript_support_level "5"; havana_gene "OTTHUMG00000180451.3"; havana_transcript "OTTHUMT00000451407.2"; -chr19 HAVANA exon 72585 72718 . + . gene_id "ENSG00000282807.1"; transcript_id "ENST00000633603.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC008993.3-001"; exon_number 2; exon_id "ENSE00003783209.1"; level 2; tag "basic"; transcript_support_level "5"; havana_gene "OTTHUMG00000180451.3"; havana_transcript "OTTHUMT00000451407.2"; -chr19 ENSEMBL gene 71973 72110 . + . gene_id "ENSG00000275604.1"; gene_type "miRNA"; gene_status "KNOWN"; gene_name "MIR1302-11"; level 3; -chr19 ENSEMBL transcript 71973 72110 . + . gene_id "ENSG00000275604.1"; transcript_id "ENST00000408051.1"; gene_type "miRNA"; gene_status "KNOWN"; gene_name "MIR1302-11"; transcript_type "miRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-201"; level 3; tag "basic"; transcript_support_level "NA"; -chr19 ENSEMBL exon 71973 72110 . + . 
gene_id "ENSG00000275604.1"; transcript_id "ENST00000408051.1"; gene_type "miRNA"; gene_status "KNOWN"; gene_name "MIR1302-11"; transcript_type "miRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-201"; exon_number 1; exon_id "ENSE00001808054.1"; level 3; tag "basic"; transcript_support_level "NA"; -chr19 HAVANA gene 76163 77686 . - . gene_id "ENSG00000282591.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; level 2; havana_gene "OTTHUMG00000180467.4"; -chr19 HAVANA transcript 76163 77686 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA exon 77330 77686 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; exon_number 1; exon_id "ENSE00003778121.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA exon 76886 77090 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; exon_number 2; exon_id "ENSE00003783139.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA exon 76163 76783 . - . 
gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; exon_number 3; exon_id "ENSE00003778696.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA transcript 76220 77659 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000632948.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-002"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451425.3"; -chr19 HAVANA exon 77330 77659 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000632948.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-002"; exon_number 1; exon_id "ENSE00003779597.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451425.3"; -chr19 HAVANA exon 76220 77090 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000632948.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-002"; exon_number 2; exon_id "ENSE00003782175.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451425.3"; -chr19 HAVANA gene 94062 94974 . + . gene_id "ENSG00000282137.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G3P"; level 2; havana_gene "OTTHUMG00000180452.2"; -chr19 HAVANA transcript 94062 94974 . + . 
gene_id "ENSG00000282137.1"; transcript_id "ENST00000633500.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G3P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G3P-001"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180452.2"; havana_transcript "OTTHUMT00000451408.2"; -chr19 HAVANA exon 94062 94974 . + . gene_id "ENSG00000282137.1"; transcript_id "ENST00000633500.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G3P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G3P-001"; exon_number 1; exon_id "ENSE00003781724.1"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180452.2"; havana_transcript "OTTHUMT00000451408.2"; -chr19 HAVANA gene 104535 105471 . + . gene_id "ENSG00000267310.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G1P"; level 2; havana_gene "OTTHUMG00000180453.1"; -chr19 HAVANA transcript 104535 105471 . + . gene_id "ENSG00000267310.1"; transcript_id "ENST00000588632.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G1P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G1P-001"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180453.1"; havana_transcript "OTTHUMT00000451409.1"; -chr19 HAVANA exon 104535 105471 . + . gene_id "ENSG00000267310.1"; transcript_id "ENST00000588632.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G1P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G1P-001"; exon_number 1; exon_id "ENSE00002952394.1"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180453.1"; havana_transcript "OTTHUMT00000451409.1"; -chr19 HAVANA gene 107461 111696 . + . 
gene_id "ENSG00000176695.6"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; level 2; havana_gene "OTTHUMG00000180454.2"; -chr19 HAVANA transcript 107461 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA exon 107461 107555 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 1; exon_id "ENSE00002825729.2"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA exon 110625 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA CDS 110679 111593 . 
+ 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA start_codon 110679 110681 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA stop_codon 111594 111596 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA UTR 107461 107555 . + . 
gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 1; exon_id "ENSE00002825729.2"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA UTR 110625 110678 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA UTR 111594 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA transcript 110613 111417 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000618231.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "OR4F17-002"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000475091.1"; -chr19 HAVANA exon 110613 111417 . + . 
gene_id "ENSG00000176695.6"; transcript_id "ENST00000618231.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "OR4F17-002"; exon_number 1; exon_id "ENSE00003719758.1"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000475091.1"; -chr19 ENSEMBL transcript 110643 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL exon 110643 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL CDS 110679 111593 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL start_codon 110679 110681 . 
+ 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL stop_codon 111594 111596 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL UTR 110643 110678 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL UTR 111594 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 HAVANA gene 145485 145812 . + . 
gene_id "ENSG00000267792.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "WBP1LP11"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000180455.1"; -chr19 HAVANA transcript 145485 145812 . + . gene_id "ENSG00000267792.1"; transcript_id "ENST00000586141.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "WBP1LP11"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WBP1LP11-001"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180455.1"; havana_transcript "OTTHUMT00000451411.1"; -chr19 HAVANA exon 145485 145812 . + . gene_id "ENSG00000267792.1"; transcript_id "ENST00000586141.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "WBP1LP11"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WBP1LP11-001"; exon_number 1; exon_id "ENSE00002835239.1"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180455.1"; havana_transcript "OTTHUMT00000451411.1"; -chr19 HAVANA gene 156279 157215 . - . gene_id "ENSG00000266971.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4F8P"; level 2; havana_gene "OTTHUMG00000180456.1"; -chr19 HAVANA transcript 156279 157215 . - . gene_id "ENSG00000266971.1"; transcript_id "ENST00000589943.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4F8P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4F8P-001"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180456.1"; havana_transcript "OTTHUMT00000451412.1"; -chr19 HAVANA exon 156279 157215 . - . 
gene_id "ENSG00000266971.1"; transcript_id "ENST00000589943.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4F8P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4F8P-001"; exon_number 1; exon_id "ENSE00002966057.1"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180456.1"; havana_transcript "OTTHUMT00000451412.1"; -chr19 HAVANA gene 176896 177913 . + . gene_id "ENSG00000282535.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC092192.1"; level 2; havana_gene "OTTHUMG00000180458.2"; -chr19 HAVANA transcript 176896 177913 . + . gene_id "ENSG00000282535.1"; transcript_id "ENST00000633154.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC092192.1"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC092192.1-001"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180458.2"; havana_transcript "OTTHUMT00000451414.2"; -chr19 HAVANA exon 176896 177913 . + . gene_id "ENSG00000282535.1"; transcript_id "ENST00000633154.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC092192.1"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC092192.1-001"; exon_number 1; exon_id "ENSE00003777312.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180458.2"; havana_transcript "OTTHUMT00000451414.2"; -chr19 HAVANA gene 186373 195696 . - . gene_id "ENSG00000281379.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; level 2; havana_gene "OTTHUMG00000180460.8"; -chr19 HAVANA transcript 186373 191429 . - . 
gene_id "ENSG00000281379.2"; transcript_id "ENST00000632397.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-003"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475096.2"; -chr19 HAVANA exon 191186 191429 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000632397.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-003"; exon_number 1; exon_id "ENSE00003777503.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475096.2"; -chr19 HAVANA exon 186373 186498 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000632397.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-003"; exon_number 2; exon_id "ENSE00003778733.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475096.2"; -chr19 HAVANA transcript 191115 191325 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000633205.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "SEPT14P19-001"; level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000451416.7"; -chr19 HAVANA exon 191115 191325 . - . 
gene_id "ENSG00000281379.2"; transcript_id "ENST00000633205.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "SEPT14P19-001"; exon_number 1; exon_id "ENSE00003775583.1"; level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000451416.7"; -chr19 HAVANA transcript 191212 195696 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000587432.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-002"; level 2; tag "basic"; transcript_support_level "2"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475095.2"; -chr19 HAVANA exon 195504 195696 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000587432.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-002"; exon_number 1; exon_id "ENSE00002880392.5"; level 2; tag "basic"; transcript_support_level "2"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475095.2"; -chr19 HAVANA exon 191212 191354 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000587432.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-002"; exon_number 2; exon_id "ENSE00002672754.6"; level 2; tag "basic"; transcript_support_level "2"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475095.2"; -chr19 HAVANA gene 193239 195595 . + . 
gene_id "ENSG00000282059.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "CICP19"; level 1; tag "pseudo_consens"; tag "overlapping_locus"; havana_gene "OTTHUMG00000180463.7"; -chr19 HAVANA transcript 193239 195595 . + . gene_id "ENSG00000282059.1"; transcript_id "ENST00000632944.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "CICP19"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "CICP19-001"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180463.7"; havana_transcript "OTTHUMT00000451419.7"; -chr19 HAVANA exon 193239 195595 . + . gene_id "ENSG00000282059.1"; transcript_id "ENST00000632944.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "CICP19"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "CICP19-001"; exon_number 1; exon_id "ENSE00003779877.1"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180463.7"; havana_transcript "OTTHUMT00000451419.7"; -chr19 HAVANA gene 197310 198066 . - . gene_id "ENSG00000282416.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.2"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000190442.1"; -chr19 HAVANA transcript 197310 198066 . - . gene_id "ENSG00000282416.1"; transcript_id "ENST00000632679.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.2"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.2-001"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190442.1"; havana_transcript "OTTHUMT00000484997.1"; -chr19 HAVANA exon 197310 198066 . - . 
gene_id "ENSG00000282416.1"; transcript_id "ENST00000632679.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.2"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.2-001"; exon_number 1; exon_id "ENSE00003778188.1"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190442.1"; havana_transcript "OTTHUMT00000484997.1"; -chr19 HAVANA gene 197961 200775 . + . gene_id "ENSG00000282051.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; level 2; havana_gene "OTTHUMG00000182072.3"; -chr19 HAVANA transcript 197961 198396 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633895.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-001"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000484998.2"; -chr19 HAVANA exon 197961 198396 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633895.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-001"; exon_number 1; exon_id "ENSE00003783880.1"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000484998.2"; -chr19 HAVANA transcript 198052 200775 . + . 
gene_id "ENSG00000282051.1"; transcript_id "ENST00000633286.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-002"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000459134.1"; -chr19 HAVANA exon 198052 198234 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633286.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-002"; exon_number 1; exon_id "ENSE00003777852.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000459134.1"; -chr19 HAVANA exon 200578 200775 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633286.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-002"; exon_number 2; exon_id "ENSE00003780775.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000459134.1"; diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf.bz2 b/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf.bz2 deleted file mode 100644 index 5800fce8..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf.bz2 and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf.gz b/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf.gz deleted file mode 100644 index 2b954059..00000000 Binary files a/tools/scripts/sctools/build/lib/sctools/test/data/test.gtf.gz and /dev/null differ diff --git a/tools/scripts/sctools/build/lib/sctools/test/data/test.sam 
b/tools/scripts/sctools/build/lib/sctools/test/data/test.sam deleted file mode 100644 index 928bcab1..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/data/test.sam +++ /dev/null @@ -1,805 +0,0 @@ -@HD VN:1.4 SO:coordinate -@SQ SN:1 LN:248956422 -@SQ SN:10 LN:133797422 -@SQ SN:11 LN:135086622 -@SQ SN:12 LN:133275309 -@SQ SN:13 LN:114364328 -@SQ SN:14 LN:107043718 -@SQ SN:15 LN:101991189 -@SQ SN:16 LN:90338345 -@SQ SN:17 LN:83257441 -@SQ SN:18 LN:80373285 -@SQ SN:19 LN:58617616 -@SQ SN:2 LN:242193529 -@SQ SN:20 LN:64444167 -@SQ SN:21 LN:46709983 -@SQ SN:22 LN:50818468 -@SQ SN:3 LN:198295559 -@SQ SN:4 LN:190214555 -@SQ SN:5 LN:181538259 -@SQ SN:6 LN:170805979 -@SQ SN:7 LN:159345973 -@SQ SN:8 LN:145138636 -@SQ SN:9 LN:138394717 -@SQ SN:MT LN:16569 -@SQ SN:X LN:156040895 -@SQ SN:Y LN:57227415 -@SQ SN:KI270728.1 LN:1872759 -@SQ SN:KI270727.1 LN:448248 -@SQ SN:KI270442.1 LN:392061 -@SQ SN:KI270729.1 LN:280839 -@SQ SN:GL000225.1 LN:211173 -@SQ SN:KI270743.1 LN:210658 -@SQ SN:GL000008.2 LN:209709 -@SQ SN:GL000009.2 LN:201709 -@SQ SN:KI270747.1 LN:198735 -@SQ SN:KI270722.1 LN:194050 -@SQ SN:GL000194.1 LN:191469 -@SQ SN:KI270742.1 LN:186739 -@SQ SN:GL000205.2 LN:185591 -@SQ SN:GL000195.1 LN:182896 -@SQ SN:KI270736.1 LN:181920 -@SQ SN:KI270733.1 LN:179772 -@SQ SN:GL000224.1 LN:179693 -@SQ SN:GL000219.1 LN:179198 -@SQ SN:KI270719.1 LN:176845 -@SQ SN:GL000216.2 LN:176608 -@SQ SN:KI270712.1 LN:176043 -@SQ SN:KI270706.1 LN:175055 -@SQ SN:KI270725.1 LN:172810 -@SQ SN:KI270744.1 LN:168472 -@SQ SN:KI270734.1 LN:165050 -@SQ SN:GL000213.1 LN:164239 -@SQ SN:GL000220.1 LN:161802 -@SQ SN:KI270715.1 LN:161471 -@SQ SN:GL000218.1 LN:161147 -@SQ SN:KI270749.1 LN:158759 -@SQ SN:KI270741.1 LN:157432 -@SQ SN:GL000221.1 LN:155397 -@SQ SN:KI270716.1 LN:153799 -@SQ SN:KI270731.1 LN:150754 -@SQ SN:KI270751.1 LN:150742 -@SQ SN:KI270750.1 LN:148850 -@SQ SN:KI270519.1 LN:138126 -@SQ SN:GL000214.1 LN:137718 -@SQ SN:KI270708.1 LN:127682 -@SQ SN:KI270730.1 LN:112551 -@SQ SN:KI270438.1 
LN:112505 -@SQ SN:KI270737.1 LN:103838 -@SQ SN:KI270721.1 LN:100316 -@SQ SN:KI270738.1 LN:99375 -@SQ SN:KI270748.1 LN:93321 -@SQ SN:KI270435.1 LN:92983 -@SQ SN:GL000208.1 LN:92689 -@SQ SN:KI270538.1 LN:91309 -@SQ SN:KI270756.1 LN:79590 -@SQ SN:KI270739.1 LN:73985 -@SQ SN:KI270757.1 LN:71251 -@SQ SN:KI270709.1 LN:66860 -@SQ SN:KI270746.1 LN:66486 -@SQ SN:KI270753.1 LN:62944 -@SQ SN:KI270589.1 LN:44474 -@SQ SN:KI270726.1 LN:43739 -@SQ SN:KI270735.1 LN:42811 -@SQ SN:KI270711.1 LN:42210 -@SQ SN:KI270745.1 LN:41891 -@SQ SN:KI270714.1 LN:41717 -@SQ SN:KI270732.1 LN:41543 -@SQ SN:KI270713.1 LN:40745 -@SQ SN:KI270754.1 LN:40191 -@SQ SN:KI270710.1 LN:40176 -@SQ SN:KI270717.1 LN:40062 -@SQ SN:KI270724.1 LN:39555 -@SQ SN:KI270720.1 LN:39050 -@SQ SN:KI270723.1 LN:38115 -@SQ SN:KI270718.1 LN:38054 -@SQ SN:KI270317.1 LN:37690 -@SQ SN:KI270740.1 LN:37240 -@SQ SN:KI270755.1 LN:36723 -@SQ SN:KI270707.1 LN:32032 -@SQ SN:KI270579.1 LN:31033 -@SQ SN:KI270752.1 LN:27745 -@SQ SN:KI270512.1 LN:22689 -@SQ SN:KI270322.1 LN:21476 -@SQ SN:GL000226.1 LN:15008 -@SQ SN:KI270311.1 LN:12399 -@SQ SN:KI270366.1 LN:8320 -@SQ SN:KI270511.1 LN:8127 -@SQ SN:KI270448.1 LN:7992 -@SQ SN:KI270521.1 LN:7642 -@SQ SN:KI270581.1 LN:7046 -@SQ SN:KI270582.1 LN:6504 -@SQ SN:KI270515.1 LN:6361 -@SQ SN:KI270588.1 LN:6158 -@SQ SN:KI270591.1 LN:5796 -@SQ SN:KI270522.1 LN:5674 -@SQ SN:KI270507.1 LN:5353 -@SQ SN:KI270590.1 LN:4685 -@SQ SN:KI270584.1 LN:4513 -@SQ SN:KI270320.1 LN:4416 -@SQ SN:KI270382.1 LN:4215 -@SQ SN:KI270468.1 LN:4055 -@SQ SN:KI270467.1 LN:3920 -@SQ SN:KI270362.1 LN:3530 -@SQ SN:KI270517.1 LN:3253 -@SQ SN:KI270593.1 LN:3041 -@SQ SN:KI270528.1 LN:2983 -@SQ SN:KI270587.1 LN:2969 -@SQ SN:KI270364.1 LN:2855 -@SQ SN:KI270371.1 LN:2805 -@SQ SN:KI270333.1 LN:2699 -@SQ SN:KI270374.1 LN:2656 -@SQ SN:KI270411.1 LN:2646 -@SQ SN:KI270414.1 LN:2489 -@SQ SN:KI270510.1 LN:2415 -@SQ SN:KI270390.1 LN:2387 -@SQ SN:KI270375.1 LN:2378 -@SQ SN:KI270420.1 LN:2321 -@SQ SN:KI270509.1 LN:2318 -@SQ SN:KI270315.1 LN:2276 -@SQ 
SN:KI270302.1 LN:2274 -@SQ SN:KI270518.1 LN:2186 -@SQ SN:KI270530.1 LN:2168 -@SQ SN:KI270304.1 LN:2165 -@SQ SN:KI270418.1 LN:2145 -@SQ SN:KI270424.1 LN:2140 -@SQ SN:KI270417.1 LN:2043 -@SQ SN:KI270508.1 LN:1951 -@SQ SN:KI270303.1 LN:1942 -@SQ SN:KI270381.1 LN:1930 -@SQ SN:KI270529.1 LN:1899 -@SQ SN:KI270425.1 LN:1884 -@SQ SN:KI270396.1 LN:1880 -@SQ SN:KI270363.1 LN:1803 -@SQ SN:KI270386.1 LN:1788 -@SQ SN:KI270465.1 LN:1774 -@SQ SN:KI270383.1 LN:1750 -@SQ SN:KI270384.1 LN:1658 -@SQ SN:KI270330.1 LN:1652 -@SQ SN:KI270372.1 LN:1650 -@SQ SN:KI270548.1 LN:1599 -@SQ SN:KI270580.1 LN:1553 -@SQ SN:KI270387.1 LN:1537 -@SQ SN:KI270391.1 LN:1484 -@SQ SN:KI270305.1 LN:1472 -@SQ SN:KI270373.1 LN:1451 -@SQ SN:KI270422.1 LN:1445 -@SQ SN:KI270316.1 LN:1444 -@SQ SN:KI270340.1 LN:1428 -@SQ SN:KI270338.1 LN:1428 -@SQ SN:KI270583.1 LN:1400 -@SQ SN:KI270334.1 LN:1368 -@SQ SN:KI270429.1 LN:1361 -@SQ SN:KI270393.1 LN:1308 -@SQ SN:KI270516.1 LN:1300 -@SQ SN:KI270389.1 LN:1298 -@SQ SN:KI270466.1 LN:1233 -@SQ SN:KI270388.1 LN:1216 -@SQ SN:KI270544.1 LN:1202 -@SQ SN:KI270310.1 LN:1201 -@SQ SN:KI270412.1 LN:1179 -@SQ SN:KI270395.1 LN:1143 -@SQ SN:KI270376.1 LN:1136 -@SQ SN:KI270337.1 LN:1121 -@SQ SN:KI270335.1 LN:1048 -@SQ SN:KI270378.1 LN:1048 -@SQ SN:KI270379.1 LN:1045 -@SQ SN:KI270329.1 LN:1040 -@SQ SN:KI270419.1 LN:1029 -@SQ SN:KI270336.1 LN:1026 -@SQ SN:KI270312.1 LN:998 -@SQ SN:KI270539.1 LN:993 -@SQ SN:KI270385.1 LN:990 -@SQ SN:KI270423.1 LN:981 -@SQ SN:KI270392.1 LN:971 -@SQ SN:KI270394.1 LN:970 -@PG ID:STAR PN:STAR VN:STAR_2.5.0a CL:STAR --runMode alignReads --runThreadN 23 --genomeDir hg38_long_polya/ --readFilesIn test_long_polya/test_long_polya_merged.fastq --limitOutSJcollapsed 2000000 --outFileNamePrefix test_long_polya/alignments/ --outSAMprimaryFlag AllBestScore --outFilterType BySJout --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 --alignIntronMax 1000000 --alignSJDBoverhangMin 8 -@PG ID:STAR-3D48ABDC PN:STAR VN:STAR_2.5.0a CL:STAR --runMode 
alignReads --runThreadN 23 --genomeDir hg38_long_polya/ --readFilesIn test_long_polya/test_long_polya_merged.fastq --limitOutSJcollapsed 2000000 --outFileNamePrefix test_long_polya/alignments/ --outSAMprimaryFlag AllBestScore --outFilterType BySJout --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 --alignIntronMax 1000000 --alignSJDBoverhangMin 8 -@CO user command line: STAR --alignIntronMax 1000000 --alignIntronMin 20 --genomeDir hg38_long_polya/ --limitOutSJcollapsed 2000000 --outFilterMultimapNmax 1 --readFilesIn test_long_polya/test_long_polya_merged.fastq --outFilterMismatchNoverLmax 0.04 --runThreadN 23 --alignSJDBoverhangMin 8 --runMode alignReads --outFileNamePrefix test_long_polya/alignments/ --outFilterType BySJout --outSAMprimaryFlag AllBestScore -@CO user command line: STAR --alignIntronMax 1000000 --alignIntronMin 20 --genomeDir hg38_long_polya/ --limitOutSJcollapsed 2000000 --outFilterMultimapNmax 1 --readFilesIn test_long_polya/test_long_polya_merged.fastq --outFilterMismatchNoverLmax 0.04 --runThreadN 23 --alignSJDBoverhangMin 8 --runMode alignReads --outFileNamePrefix test_long_polya/alignments/ --outFilterType BySJout --outSAMprimaryFlag AllBestScore -:AGGTTCCATTCTACACGCT:ACGTACAT:TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGGTTTTTTT;HISEQ:222:C7HL8ANXX:8:1110:8959:43102 16 19 281075 255 28S71M2S * 0 0 NGCCCCCGGTCCCCTCTTTTCCTCCCCCCCCCATATACATTACATTTTACAAAACAGCAACTATCTGATCTCTCGGTCCCTTCCTTAACCCCATAAAAAAG ############################BB/ 0 - - -def test_chromosome_19_comes_before_21(indices): - """chromosome 19 comes before 21 in the test file, this should be replicated in the output""" - assert max(indices[0]) < min(indices[1]) - - -# TAGGER TESTED IN INTEGRATION TESTS ONLY (see test_entrypoints.py) - -# TEST SPLIT - - -@pytest.fixture(scope="module", params=[data_dir + "test.sam", data_dir + "test.bam"]) -def bamfile(request): - return request.param - - -def 
test_split_bam_raises_value_error_when_passed_bam_without_barcodes(bamfile,): - split_size = ( - 0.02 # our test data is very small, 0.01mb = ~10kb, which should yield 5 files. - ) - with pytest.raises(RuntimeError): - bam.split( - [bamfile], - "test_output", - [consts.CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - ) - - -@pytest.fixture -def tagged_bam(): - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test_r2.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - ] - platform.TenXV2.attach_barcodes(args) - return "test_tagged_bam.bam" - - -def test_split_on_tagged_bam(tagged_bam): - split_size = 0.005 # our test data is very small, this value should yield 3 files - outputs = bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - ) - assert len(outputs) == 3 - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_split_with_large_chunk_size_generates_one_file(tagged_bam): - split_size = 1024 # our test data is very small, this value should yield 1 file - outputs = bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - ) - assert len(outputs) == 1 - - # the file should be full size - with pysam.AlignmentFile(outputs[0], "rb", check_sq=False) as f: - assert len([x for x in f]) == 100 - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_split_with_raise_missing_true_raises_warning_without_cr_barcode_passed( - tagged_bam, -): - split_size = 1024 # our test data is very small, this value should yield 1 file - with pytest.raises(RuntimeError): - bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY], - 
approx_mb_per_split=split_size, - raise_missing=True, - ) - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_split_succeeds_with_raise_missing_false_and_no_cr_barcode_passed(tagged_bam,): - split_size = 1024 # our test data is very small, this value should yield 1 file - outputs = bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - raise_missing=False, - ) - - assert len(outputs) == 1 - - # the file should be full size - with pysam.AlignmentFile(outputs[0], "rb", check_sq=False) as f: - assert ( - len([x for x in f]) == 1 - ) # only one of our barcodes is whitelisted or within 1 base - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_get_barcodes_from_bam(tagged_bam): - outputs = bam.get_barcodes_from_bam( - tagged_bam, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - raise_missing=True, - ) - assert len(outputs) == 99 - - -def test_get_barcodes_from_bam_with_raise_missing_true_raises_warning_without_cr_barcode_passed( - tagged_bam, -): - with pytest.raises(RuntimeError): - bam.get_barcodes_from_bam( - tagged_bam, [consts.CELL_BARCODE_TAG_KEY], raise_missing=True - ) - - -def test_write_barcodes_to_bins(tagged_bam): - barcodes = bam.get_barcodes_from_bam( - tagged_bam, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - raise_missing=True, - ) - - test_barcodes_to_bins = {} - for barcode in barcodes: - test_barcodes_to_bins[barcode] = 0 - - filenames = bam.write_barcodes_to_bins( - tagged_bam, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - test_barcodes_to_bins, - raise_missing=False, - ) - - assert len(filenames) == 1 - - # cleanup - for f in filenames: - shutil.rmtree(os.path.dirname(f)) - - -def test_get_barcode_for_alignment(tagged_bam): - with pysam.AlignmentFile(tagged_bam, "rb", check_sq=False) as 
input_alignments: - for alignment in input_alignments: - barcode = bam.get_barcode_for_alignment( - alignment, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - raise_missing=False, - ) - assert barcode == "NTAAGAGTCTGCAAGT" - break - - -def test_get_barcode_for_alignment_raises_error_for_missing_tag(tagged_bam): - with pysam.AlignmentFile(tagged_bam, "rb", check_sq=False) as input_alignments: - for alignment in input_alignments: - with pytest.raises(RuntimeError): - bam.get_barcode_for_alignment(alignment, TAG_KEYS, raise_missing=True) - - -# TEST SORTING - - -def test_tag_sortable_records_compare_correctly(): - records = make_records_from_values(TAG_KEYS, SORTED_VALUES) - num_records = len(SORTED_VALUES) - for i in range(num_records): - for j in range(num_records): - if i < j: - assert records[i] < records[j] - elif i == j: - assert records[i] == records[j] - else: - assert records[i] > records[j] - - -def test_tag_sortable_records_raises_error_on_different_tag_lists(): - r1 = bam.TagSortableRecord(["FOO", "BAR"], ["A", "A"], "A") - r2 = bam.TagSortableRecord(["BAR", "BAZ"], ["A", "A"], "A") - with pytest.raises(ValueError): - r1 == r2 - - -def test_tag_sortable_records_str(): - record = bam.TagSortableRecord(TAG_KEYS, SORTED_VALUES[0][0], SORTED_VALUES[0][1]) - s = record.__str__() - assert "TagSortableRecord" in s - assert "['FOO', 'BAR', 'BAZ']" in s - - -def test_verify_sort_on_unsorted_records_raises_error(): - records = make_records_from_values(TAG_KEYS, UNSORTED_VALUES) - with pytest.raises(bam.SortError): - bam.verify_sort(records, TAG_KEYS) - - -def test_verify_sort_raises_no_error_on_sorted_records(): - records = make_records_from_values(TAG_KEYS, SORTED_VALUES) - bam.verify_sort(records, TAG_KEYS) - - -def test_sort_by_tags_and_queryname_sorts_correctly_from_file(): - tag_keys = ["UB", "CB", "GE"] - with pysam.AlignmentFile(data_dir + "unsorted.bam", "rb") as f: - records = f.fetch(until_eof=True) - sorted_records = 
bam.sort_by_tags_and_queryname(records, tag_keys) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(r, tag_keys) for r in sorted_records - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - -def test_sort_by_tags_and_queryname_sorts_correctly_from_file_no_tag_keys(): - tag_keys = [] - with pysam.AlignmentFile(data_dir + "unsorted.bam", "rb") as f: - records = f.fetch(until_eof=True) - sorted_records = bam.sort_by_tags_and_queryname(records, tag_keys) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(r, tag_keys) for r in sorted_records - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - -def test_tag_sortable_records_sort_correctly(): - tag_keys = TAG_KEYS - records = make_records_from_values(tag_keys, deepcopy(UNSORTED_VALUES)) - sorted_records = sorted(records) - bam.verify_sort(sorted_records, tag_keys) - - -def test_tag_sortable_records_sort_correctly_when_already_sorted(): - # This is to a bit paranoid, but just make sure sorted stays correct if already sorted - tag_keys = TAG_KEYS - records = make_records_from_values(tag_keys, deepcopy(SORTED_VALUES)) - sorted_records = sorted(records) - bam.verify_sort(sorted_records, tag_keys) - - -def test_sort_by_tags_and_queryname_sorts_correctly_no_tag_keys(): - tag_keys = [] - records = make_records_from_values(tag_keys, deepcopy(UNSORTED_VALUES)) - sorted_records = sorted(records) - bam.verify_sort(sorted_records, tag_keys) - - -def test_tag_sortable_record_missing_tag_value_is_empty_string(): - tags = ["_NOT_REAL_TAG_"] - with pysam.AlignmentFile(data_dir + "unsorted.bam", "rb") as f: - records = f.fetch(until_eof=True) - first_record = next(iter(records)) - sortable_record = bam.TagSortableRecord.from_aligned_segment(first_record, tags) - assert sortable_record.tag_values[0] == "" - - -def test_tag_sortable_record_lt_is_false_for_equal_records(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = 
bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert not r1 < r2 - - -def test_tag_sortable_record_lt_is_true_for_smaller_query_name(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="B" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_is_true_for_smaller_tag(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "B"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "B", "A"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["B", "A", "A"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_is_true_for_smaller_tag_regardless_of_query_name(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="B" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "B"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_empty_query_name_is_smaller(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_empty_tag_is_smaller(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", ""], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - 
tag_keys=TAG_KEYS, tag_values=["A", "", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_eq_is_true_for_identical_records(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 == r2 - - -def test_tag_sortable_record_eq_is_false_when_any_difference_exists(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="B" - ) - assert not r1 == r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "B"], query_name="A" - ) - assert not r1 == r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "B", "A"], query_name="A" - ) - assert not r1 == r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["B", "A", "A"], query_name="A" - ) - assert not r1 == r2 - - -def make_records_from_values(tag_keys, tags_and_query_name): - records = [] - for i in range(len(tags_and_query_name)): - r = bam.TagSortableRecord( - tag_keys=tag_keys, - tag_values=tags_and_query_name[i][0], - query_name=tags_and_query_name[i][1], - ) - records.append(r) - return records diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_barcode.py 
b/tools/scripts/sctools/build/lib/sctools/test/test_barcode.py deleted file mode 100644 index f9fa39a5..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_barcode.py +++ /dev/null @@ -1,161 +0,0 @@ -import os - -import numpy as np -import pysam -import pytest - -from .. import barcode, encodings, platform, consts - -data_dir = os.path.split(__file__)[0] + "/data/" - - -# TEST BARCODES - - -@pytest.fixture -def barcode_set(): - return barcode.Barcodes.from_whitelist( - data_dir + "1k-august-2016.txt", barcode_length=16 - ) - - -@pytest.fixture(scope="module", params=["r", "rb"]) -def short_barcode_set_from_iterable(request): - with open(data_dir + "1k-august-2016.txt", request.param) as f: - barcodes = [l.strip() for l in f.readlines()[:50]] - if isinstance(barcodes[0], bytes): - return barcode.Barcodes.from_iterable_bytes(barcodes, barcode_length=16) - else: - return barcode.Barcodes.from_iterable_strings(barcodes, barcode_length=16) - - -@pytest.fixture(scope="module") -def short_barcode_set_from_encoded(): - return barcode.Barcodes.from_iterable_encoded( - [0, 1, 2, 3, 4, 5, 6, 7], barcode_length=2 - ) - - -def test_iterable_produces_correct_barcodes(short_barcode_set_from_encoded): - tbe = encodings.TwoBit(2) - decoded = [tbe.decode(b) for b in short_barcode_set_from_encoded] - print(decoded) - assert decoded == [b"AA", b"AC", b"AT", b"AG", b"CA", b"CC", b"CT", b"CG"] - - -def test_reads_barcodes_from_file(barcode_set): - assert len(barcode_set) == 1001 # number of barcodes in file. 
- - -def test_base_frequency_sums_are_all_equal_to_barcode_set_length(barcode_set): - bf = barcode_set.base_frequency() - assert isinstance(bf, np.ndarray) - assert np.array_equal(bf.sum(axis=1), np.ones(16) * len(barcode_set)) - - -def test_barcode_diversity_is_in_range(barcode_set): - bd = barcode_set.effective_diversity() - assert np.all(bd >= 0) - assert np.all(bd <= 1) - - -def test_summarize_hamming_distances_gives_reasonable_results( - short_barcode_set_from_iterable, -): - - hamming_summary = short_barcode_set_from_iterable.summarize_hamming_distances() - - # we know 10x barcodes have at least this much distance - assert hamming_summary["minimum"] >= 2 - # no barcode can have more hamming distance than length - assert all(v <= 16 for v in hamming_summary.values()) - - -# TEST HashErrorsToCorrectBarcodes - - -@pytest.fixture(scope="module") -def trivial_whitelist(): - barcode_iterable = ["A" * 8] - error_mapping = barcode.ErrorsToCorrectBarcodesMap._prepare_single_base_error_hash_table( - barcode_iterable - ) - return barcode.ErrorsToCorrectBarcodesMap(error_mapping) - - -@pytest.fixture(scope="module") -def truncated_whitelist_from_10x(): - # note that this whitelist contains 1 non-10x barcode to ensure the presence of a matching - # target in the test data. 
- error_mapping = barcode.ErrorsToCorrectBarcodesMap.single_hamming_errors_from_whitelist( - data_dir + "1k-august-2016.txt" - ) - return error_mapping - - -def test_incorrect_input_raises_errors(trivial_whitelist): - with pytest.raises(TypeError): - barcode.ErrorsToCorrectBarcodesMap("not_a_mapping") - with pytest.raises(TypeError): - barcode.ErrorsToCorrectBarcodesMap({"not_a_mapping"}) - with pytest.raises(TypeError): - barcode.ErrorsToCorrectBarcodesMap(["not_a_mapping", "sldkf"]) - assert isinstance(trivial_whitelist, barcode.ErrorsToCorrectBarcodesMap) - - -def test_correct_barcode_finds_and_corrects_1_base_errors(trivial_whitelist): - assert trivial_whitelist.get_corrected_barcode("TAAAAAAA") == "AAAAAAAA" - assert trivial_whitelist.get_corrected_barcode("AAAACAAA") == "AAAAAAAA" - assert trivial_whitelist.get_corrected_barcode("AAAGAAAA") == "AAAAAAAA" - assert trivial_whitelist.get_corrected_barcode("AAAAAAAA") == "AAAAAAAA" - - -def test_correct_barcode_raises_keyerror_when_barcode_not_correct_length( - trivial_whitelist, -): - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAA") - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAAAAAAAA") - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAAAAAAAAA") - - -def test_correct_barcode_raises_keyerror_when_barcode_has_more_than_one_error( - trivial_whitelist, -): - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAAAAATT") - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("TTAAAAAA") - - -@pytest.fixture(scope="module") -def tagged_bamfile(): - outbam = data_dir + "bam_with_tags_test.bam" - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - outbam, - ] - platform.TenXV2.attach_barcodes(args) - return outbam - - -def test_correct_bam_produces_cb_tags(tagged_bamfile, 
truncated_whitelist_from_10x): - outbam = data_dir + "bam_with_cb_tags.bam" - truncated_whitelist_from_10x.correct_bam(tagged_bamfile, outbam) - success = False - with pysam.AlignmentFile(outbam, "rb") as f: - for record in f: - try: - success = record.get_tag(consts.CELL_BARCODE_TAG_KEY) - except KeyError: - continue - assert success - os.remove(outbam) diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_count.py b/tools/scripts/sctools/build/lib/sctools/test/test_count.py deleted file mode 100644 index 81ec1514..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_count.py +++ /dev/null @@ -1,1348 +0,0 @@ -""" -Testing for Count Matrix Construction -===================================== - -The test generates (1) a random count matrix, and (2) corresponding alignment records, and writes them to disk -(a BAM file, count matrix, row and column indices). The alignment records are expected to produce the same count -matrix according to the counting algorithm implemented in `sctools:bam.from_sorted_tagged_bam`. Gene names are -fetched from an annotations GTF file that is a subset of GENCODE annotations (see `_test_annotation_file` below). - -Notes ------ - -- The agreement between the synthetic count matrix and the synthetic BAM file is contingent on the - agreement between the counting algorithm implemented in `sctools:bam.from_sorted_tagged_bam` and - the test data generator (see SyntheticTaggedBAMGenerator below). Therefore, future changes in the - counting algorithm must be accompanied by a corresponding change in the test data generation class. - Otherwise, the tests will fail. - -- We have adopt a minimal test suite design strategy, in the sense that the synthetic test data is only complete - to the degree that is required by `sctools:bam.from_sorted_tagged_bam`. 
As such, the synthetic BAM file lacks - the following features: - - * flag, - * query_sequence, - * query_quality, - * CIGAR string, - * cell barcode quality tag, - * molecule barcode quality tag, - * raw cell and molecule barcodes, - - At the time of writing, the counting algorithm **only** relies on the BAM tags. - -- SyntheticTaggedBAMGenerator generates four types of alignment records: - - * necessary alignments -- these records contain one unique cell/molecule/gene tag for each cell/gene count - unit, according to the randomly generated count matrix. Necessary alignments are also sufficient - in the sense that they are expected to reproduce the count matrix in the absence of any other alignment - record. - - * redundant alignments -- these records are expected to be ignored by the counting algorithm and have three - subtypes: - - - duplicate alignments -- these are randomly picked from necessary alignments, though, they are given a new - query name (to mimic PCR and optical duplicates). - - - incomplete alignments -- these records miss at least one necessary tag, e.g. cell barcode, molecule - barcode, or gene name. - - - multi-gene alignments -- these records have the same tags and query_name, though, at least two such - records per query_name exist that point to different genes. 
-""" - -import operator -import os -import tempfile -from typing import Callable, Optional, List, Set, Tuple, Dict, Generator - -import numpy as np -import scipy.sparse as sp -import pysam -import pytest - -from sctools import gtf, bam, consts -from sctools.count import CountMatrix - -# set the input and output directories -_test_data_dir = os.path.join(os.path.split(__file__)[0], "data") -_test_annotation_file = os.path.join(_test_data_dir, "chr1.30k_records.gtf.gz") - -# constants -_test_num_cells = 50 -_test_max_genes = 20 -_test_gene_expression_rate = 5.0 -_test_num_duplicates = 20 -_test_num_missing_some_tags = 20 -_test_num_multiple_gene_alignments = 20 -_test_max_gene_hits_per_multiple_gene_alignments = 5 - -_test_num_only_exons = 10 -_test_num_only_introns = 10 -_test_both_exons_introns = 10 - - -@pytest.fixture(scope="module") -def gene_name_to_index() -> Dict[str, int]: - return gtf.extract_gene_names(_test_annotation_file) - - -class AlignmentRecordTags: - """Represents the bundle of cell barcode, molecule barcode, and gene name.""" - - def __init__( - self, - cell_barcode: Optional[str], - molecule_barcode: Optional[str], - gene_name: Optional[str], - alignment_location: Optional[str] = "EXONIC", - ) -> None: - self.cell_barcode = cell_barcode - self.molecule_barcode = molecule_barcode - self.gene_name = gene_name - self.alignment_location = alignment_location - - def __hash__(self): - return hash((self.cell_barcode, self.molecule_barcode, self.gene_name)) - - def __repr__(self): - return ( - f"{consts.CELL_BARCODE_TAG_KEY}: {self.cell_barcode}, " - f"{consts.MOLECULE_BARCODE_TAG_KEY}: {self.molecule_barcode}, " - f"{consts.GENE_NAME_TAG_KEY}: {self.gene_name}", - f"{consts.ALIGNMENT_LOCATION_TAG_KEY}: {self.alignment_location}", - ) - - -class CellMoleculeGeneQueryNameSortOrder(bam.AlignmentSortOrder): - """Hierarchical alignment record sort order (cell barcode >= molecule barcode >= gene name >= query name).""" - - def __init__( - self, - 
cell_barcode_tag_key: str = consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag_key: str = consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag_key: str = consts.GENE_NAME_TAG_KEY, - ) -> None: - assert cell_barcode_tag_key, "Cell barcode tag key can not be None" - assert molecule_barcode_tag_key, "Molecule barcode tag key can not be None" - assert gene_name_tag_key, "Gene name tag key can not be None" - self.cell_barcode_tag_key = cell_barcode_tag_key - self.molecule_barcode_tag_key = molecule_barcode_tag_key - self.gene_name_tag_key = gene_name_tag_key - - def _get_sort_key( - self, alignment: pysam.AlignedSegment - ) -> Tuple[str, str, str, str]: - return ( - bam.get_tag_or_default(alignment, self.cell_barcode_tag_key, default="N"), - bam.get_tag_or_default( - alignment, self.molecule_barcode_tag_key, default="N" - ), - bam.get_tag_or_default(alignment, self.gene_name_tag_key, default="N"), - alignment.query_name, - ) - - @property - def key_generator( - self, - ) -> Callable[[pysam.AlignedSegment], Tuple[str, str, str, str]]: - return self._get_sort_key - - def __repr__(self) -> str: - return "hierarchical__cell_molecule_gene_query_name" - - -class SyntheticTaggedBAMGenerator: - """This class generates a synthetic count matrix and an accompanying synthetic tagged BAM file as - described in the preamble documentation block. 
- - Parameters - ---------- - num_cells : int - number of real cells - max-genes : int - maximum number of genes to use to generate synthetic counts - gene_name_to_index : dict - a map from gene name to their count matrix index - gene_expression_rate : float - poisson rate at which each gene is expressed - rng_seed : int - random number generator seed - - Methods - ------- - generate_synthetic_bam_and_counts_matrix - generates synthetic test data and writes the output to disk - - See Also - -------- - count.from_sorted_tagged_bam - """ - - OUTPUT_PREFIX = "synthetic_" - SYNTHETIC_SEQUENCE_NAME = "SYNTHETIC_SEQUENCE" - SYNTHETIC_SEQUENCE_LENGTH = 100 - NECESSARY_QUERY_NAME_PREFIX = "NECESSARY_QUERY_" - DUPLICATE_QUERY_NAME_PREFIX = "DUPLICATE_QUERY_" - INCOMPLETE_QUERY_NAME_PREFIX = "INCOMPLETE_QUERY_" - MULTI_GENE_QUERY_NAME_PREFIX = "MULTI_GENE_QUERY_" - - bam_output_filename = OUTPUT_PREFIX + "records.bam" - count_matrix_output_filename = OUTPUT_PREFIX + "count_matrix.npy" - row_index_output_filename = OUTPUT_PREFIX + "_row_index.npy" - col_index_output_filename = OUTPUT_PREFIX + "_col_index.npy" - - def __init__( - self, - num_cells: int, - max_genes: int, - gene_name_to_index: Dict[str, int], - gene_expression_rate: float, - rng_seed: int = 777, - ) -> None: - self.num_cells = num_cells - self.gene_expression_rate = gene_expression_rate - - # initialize the random number generator - self.rng: np.random.RandomState = np.random.RandomState(seed=rng_seed) - - # generate gene names - self.all_gene_names = [ - k for k, v in sorted(gene_name_to_index.items(), key=operator.itemgetter(1)) - ] - self.num_genes = len(self.all_gene_names) - - self.max_genes = max_genes - assert ( - max_genes <= self.num_genes - ), f"Max genes ({self.max_genes}) must be <= to all annotated genes ({self.num_genes})" - self.to_be_used_gene_indices: List[int] = self.rng.choice( - np.arange(0, self.num_genes, dtype=np.int), - size=self.max_genes, - replace=False, - ).tolist() - 
self.to_be_used_gene_names = [ - self.all_gene_names[j] for j in self.to_be_used_gene_indices - ] - - def generate_synthetic_bam_and_counts_matrix( - self, - output_path: str, - num_duplicates: int, - num_missing_some_tags: int, - num_multiple_gene_alignments: int, - max_gene_hits_per_multiple_gene_alignments: int, - alignment_sort_order: bam.AlignmentSortOrder = CellMoleculeGeneQueryNameSortOrder(), - ): - """Generates synthetic count matrix and BAM file and writes them to disk. - - Parameters - ---------- - output_path : str - output path - num_duplicates : int - number of duplicate records - num_missing_some_tags : int - number of records that miss at least one crucial tag - num_multiple_gene_alignments : int - number of records that have at least two different gene tags - max_gene_hits_per_multiple_gene_alignments : int - maximum number of unique gene names to use for multiple-gene records - alignment_sort_order : bam.AlignmentSortOrder - sort order of BAM alignment records; if 'None', random sort order is implied - - Returns - ------- - None - """ - assert 2 <= max_gene_hits_per_multiple_gene_alignments <= self.max_genes, ( - f"The parameter `max_gene_hits_per_multiple_gene_alignments` must >= 2 and < maximum annotated " - f"genes ({self.max_genes})" - ) - assert num_duplicates >= 0, "Number of duplicate queries must be non-negative" - assert ( - num_missing_some_tags >= 0 - ), "Number of queries with missing tags must be non-negative" - assert ( - num_multiple_gene_alignments >= 0 - ), "Number of queries with multiple gene alignments must be non-negative" - - # generate synthetic count matrix and corresponding simulated records - synthetic_data_bundle = self._generate_synthetic_counts_and_alignment_tags( - num_duplicates, - num_missing_some_tags, - num_multiple_gene_alignments, - max_gene_hits_per_multiple_gene_alignments, - ) - records = list( - SyntheticTaggedBAMGenerator._get_bam_records_generator( - synthetic_data_bundle - ) - ) - - if not 
alignment_sort_order: # random - # shuffle records - self.rng.shuffle(records) - - else: - records = sorted(records, key=alignment_sort_order.key_generator) - - # write BAM file - with pysam.AlignmentFile( - os.path.join(output_path, self.bam_output_filename), - mode="wb", - reference_names=[self.SYNTHETIC_SEQUENCE_NAME], - reference_lengths=[self.SYNTHETIC_SEQUENCE_LENGTH], - ) as bo: - for record in records: - bo.write(record) - - # write count matrix, row index, and col index - np.save( - os.path.join(output_path, self.count_matrix_output_filename), - synthetic_data_bundle.count_matrix, - ) - np.save( - os.path.join(output_path, self.row_index_output_filename), - synthetic_data_bundle.row_index, - ) - np.save( - os.path.join(output_path, self.col_index_output_filename), - synthetic_data_bundle.col_index, - ) - - def _generate_synthetic_counts_and_alignment_tags( - self, - num_duplicates: int, - num_missing_some_tags: int, - num_multiple_gene_alignments: int, - max_gene_hits_per_multiple_gene_alignments: int, - ) -> "SyntheticDataBundle": - - # generate count matrix - count_matrix: np.ndarray = self._generate_random_count_matrix() - - # generate necessary alignment tags that produce count_matrix - ( - necessary_alignment_record_tags_set, - row_index, - col_index, - ) = self._generate_necessary_alignment_record_bundle(count_matrix) - necessary_alignment_record_tags_list = list(necessary_alignment_record_tags_set) - - # sanity check -- we require as many necessary alignment records as the total counts - assert len(necessary_alignment_record_tags_set) == np.sum(count_matrix), ( - "There is an inconsistency between synthetic counts and necessary tags: we require as " - "many necessary alignment tags as the total counts" - ) - - # add duplicate records - duplicate_alignment_tags_list = self._generate_duplicate_alignment_tags( - num_duplicates, necessary_alignment_record_tags_list - ) - - # add records with missing tags - incomplete_alignment_tags_list: List[ - 
AlignmentRecordTags - ] = self._generate_incomplete_alignment_tags(num_missing_some_tags) - - # add records with multiple gene alignments - multiple_alignment_tags_list: List[ - List[AlignmentRecordTags] - ] = self._generate_multiple_gene_alignment_tags( - num_multiple_gene_alignments, - max_gene_hits_per_multiple_gene_alignments, - necessary_alignment_record_tags_set, - ) - - return SyntheticDataBundle( - count_matrix, - row_index, - col_index, - necessary_alignment_record_tags_list, - duplicate_alignment_tags_list, - incomplete_alignment_tags_list, - multiple_alignment_tags_list, - ) - - def _generate_random_count_matrix(self) -> np.ndarray: - """Generates a random count matrix. - - This method selects `self.max_genes` out of all all genes (`self.num_genes`) and populates the selected genes - with Poisson counts with rate `self.gene_expression_rate`. The count matrix entries corresponding to the - rest of the genes are set to zero. - - Returns - ------- - np.ndarray - an ndarray of shape (`self.num_cells`, `self.num_genes`) - """ - non_zero_count_matrix = self.rng.poisson( - lam=self.gene_expression_rate, size=(self.num_cells, self.max_genes) - ) - count_matrix = np.zeros((self.num_cells, self.num_genes), dtype=np.int) - for i, i_gene in enumerate(self.to_be_used_gene_indices): - count_matrix[:, i_gene] = non_zero_count_matrix[:, i] - return count_matrix - - @staticmethod - def _get_bam_records_generator( - synthetic_data_bundle: "SyntheticDataBundle", rng_seed: int = 777 - ) -> Generator[pysam.AlignedSegment, None, None]: - """Returns a generator of pysam.AlignedSegment instances created from the alignment tags - provided to the initializer. - - Parameters - ---------- - synthetic_data_bundle : SyntheticDataBundle - a bundle of synthetic alignment tags - rng_seed : int - random number generator seed; it is used for generating random reference_start position. 
- - See Also - -------- - - The preamble documentation block for a description of the meaning of different alignment records - (necessary, duplicate, incomplete, etc.) - - SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags - """ - rng = np.random.RandomState(rng_seed) - - num_queries = synthetic_data_bundle.num_queries - i_query = 0 - - # necessary, duplicate, and incomplete alignments - for alignment_tags_list, query_name_prefix in zip( - [ - synthetic_data_bundle.necessary_alignment_record_tags_list, - synthetic_data_bundle.duplicate_alignment_tags_list, - synthetic_data_bundle.incomplete_alignment_tags_list, - ], - [ - SyntheticTaggedBAMGenerator.NECESSARY_QUERY_NAME_PREFIX, - SyntheticTaggedBAMGenerator.DUPLICATE_QUERY_NAME_PREFIX, - SyntheticTaggedBAMGenerator.INCOMPLETE_QUERY_NAME_PREFIX, - ], - ): - for alignment_tags in alignment_tags_list: - yield SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tags, query_name_prefix, i_query, num_queries, rng - ) - i_query += 1 - - # multi-gene alignments - for alignment_tags_list in synthetic_data_bundle.multiple_alignment_tags_list: - # multiple alignments have the same query name (by definition) - for alignment_tags in alignment_tags_list: - yield SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tags, - SyntheticTaggedBAMGenerator.MULTI_GENE_QUERY_NAME_PREFIX, - i_query, - num_queries, - rng, - ) - i_query += 1 - - @staticmethod - def _generate_aligned_segment_from_tags( - alignment_tags: AlignmentRecordTags, - query_prefix: str, - i_query: int, - num_queries: int, - rng: np.random.RandomState, - record_reference_id: Optional[int] = 0, - reference_start: Optional[int] = -1, - ) -> pysam.AlignedSegment: - """Generates pysam.AlignedSegment instances from alignment_tags. 
- - Parameters - ---------- - alignment_tags : AlignmentRecordTags - tags to attach to the instantiated pysam.AlignedSegment - query_prefix : str - prefix to use for query name - i_query : int - query index - num_queries: int - maximum number of queries (only used for pretty-printing the query index) - rng: np.random.RandomState - a random number generator - - Notes - ----- - The query_sequence and query_quality are both empty as these query features are not used for generating - the counts matrix. Likewise, the flag is currently unset. In the future, once we add a filtering - policy based on BAM record flags (such as duplicates), this method must be updated accordingly. - - Returns - ------- - pysam.AlignedSegment - an instance of pysam.AlignedSegment - - """ - tags = [] - if alignment_tags.cell_barcode: - tags.append((consts.CELL_BARCODE_TAG_KEY, alignment_tags.cell_barcode, "Z")) - if alignment_tags.molecule_barcode: - tags.append( - (consts.MOLECULE_BARCODE_TAG_KEY, alignment_tags.molecule_barcode, "Z") - ) - if alignment_tags.gene_name: - tags.append((consts.GENE_NAME_TAG_KEY, alignment_tags.gene_name, "Z")) - - if alignment_tags.alignment_location: - tags.append( - ( - consts.ALIGNMENT_LOCATION_TAG_KEY, - alignment_tags.alignment_location, - "Z", - ) - ) - - record = pysam.AlignedSegment() - record.query_name = SyntheticTaggedBAMGenerator._generate_query_name( - query_prefix, i_query, num_queries - ) - - if reference_start == -1: - record.reference_start = rng.randint( - low=0, high=SyntheticTaggedBAMGenerator.SYNTHETIC_SEQUENCE_LENGTH - ) - else: - record.reference_start = reference_start - - record.reference_id = ( - record_reference_id # note: we only use one synthetic sequence - ) - if len(tags) > 0: - record.set_tags(tags) - return record - - @staticmethod - def _generate_query_name(query_prefix: str, i_query: int, num_queries: int) -> str: - """Returns query name string from query index. 
We zero-pad the string representation of query - indices merely for pretty-printing, e.g. 0000, 0001, ..., 9999.""" - num_digits = len(str(num_queries - 1)) - return query_prefix + str(i_query).zfill(num_digits) - - def _generate_necessary_alignment_record_bundle( - self, count_matrix: np.ndarray - ) -> Tuple[Set[AlignmentRecordTags], List[str], List[str]]: - alignments: Set[AlignmentRecordTags] = set() - used_cell_barcodes: Set[str] = set() - - row_index: List[str] = [] - col_index = self.all_gene_names - - for i_cell in range(self.num_cells): - # generate a unique cell barcode - while True: - cell_barcode = self._generate_random_cell_barcode() - if cell_barcode not in used_cell_barcodes: - break - row_index.append(cell_barcode) - - for i_gene in self.to_be_used_gene_indices: - for i_molecule in range(count_matrix[i_cell, i_gene]): - # generate a unique alignment tag - unique_alignment_tag = self._generate_unique_random_alignment_tag( - alignments, - gene_name=self.all_gene_names[i_gene], - cell_barcode=cell_barcode, - ) - alignments.add(unique_alignment_tag) - - return alignments, row_index, col_index - - def _generate_unique_random_alignment_tag( - self, - existing_alignment_tags: Set[AlignmentRecordTags], - gene_name: str, - cell_barcode: Optional[str] = None, - molecule_barcode: Optional[str] = None, - ) -> AlignmentRecordTags: - assert ( - gene_name in self.to_be_used_gene_names - ), f"{gene_name} is not an allowed gene for generating synthetic data" - - while True: - alignment = AlignmentRecordTags( - cell_barcode=cell_barcode - if cell_barcode - else self._generate_random_cell_barcode(), - molecule_barcode=molecule_barcode - if molecule_barcode - else self._generate_random_molecule_barcode(), - gene_name=gene_name, - ) - if alignment not in existing_alignment_tags: - return alignment - - def _generate_duplicate_alignment_tags( - self, num_duplicates: int, necessary_alignments_list: List[AlignmentRecordTags] - ) -> List[AlignmentRecordTags]: - return 
self.rng.choice(necessary_alignments_list, size=num_duplicates).tolist() - - def _generate_incomplete_alignment_tags( - self, num_missing_some_tags: int - ) -> List[AlignmentRecordTags]: - """Generates alignments with missing crucial tags. - - Notes - ----- - This method requires each combination of missing tags to occur at least once and may therefore return lists - that are longer than `num_missing_some_tags`. - """ - incomplete_alignment_tags_list: List[AlignmentRecordTags] = list() - tag_mask_occurrences: Set[int] = set() - i_entries = 0 - while i_entries < num_missing_some_tags or len(tag_mask_occurrences) < 7: - tag_mask = self.rng.randint(low=0, high=7) - tag_mask_occurrences.add(tag_mask) - gene_name = self.rng.choice(self.to_be_used_gene_names) - alignment = self._generate_unique_random_alignment_tag(set(), gene_name) - if not tag_mask & 1: - alignment.cell_barcode = None - if not tag_mask & 2: - alignment.molecule_barcode = None - if not tag_mask & 4: - alignment.gene_name = None - incomplete_alignment_tags_list.append(alignment) - i_entries += 1 - return incomplete_alignment_tags_list - - def _generate_multiple_gene_alignment_tags( - self, - num_multiple_gene_alignments: int, - max_gene_hits_per_multiple_gene_alignments: int, - necessary_alignment_record_tags_set: Set[AlignmentRecordTags], - ) -> List[List[AlignmentRecordTags]]: - - necessary_alignment_record_tags_list = list(necessary_alignment_record_tags_set) - - multiple_gene_alignment_tags_list: List[List[AlignmentRecordTags]] = list() - for _ in range(num_multiple_gene_alignments): - random_necessary_alignment = self.rng.choice( - necessary_alignment_record_tags_list - ) - random_necessary_cell_barcode: str = random_necessary_alignment.cell_barcode - novel_molecule_barcode: str = self._generate_unique_random_alignment_tag( - necessary_alignment_record_tags_set, - gene_name=random_necessary_alignment.gene_name, - cell_barcode=random_necessary_cell_barcode, - ).molecule_barcode - num_gene_hits = 
self.rng.randint( - low=2, high=max_gene_hits_per_multiple_gene_alignments + 1 - ) - gene_name_hits = self.rng.choice( - self.to_be_used_gene_names, replace=False, size=num_gene_hits - ) - multiple_gene_alignment_tags_list.append( - [ - AlignmentRecordTags( - random_necessary_cell_barcode, novel_molecule_barcode, gene_name - ) - for gene_name in gene_name_hits - ] - ) - return multiple_gene_alignment_tags_list - - def _generate_random_cell_barcode(self, length: int = 16): - return self._generate_random_genomic_sequences(length) - - def _generate_random_molecule_barcode(self, length: int = 10): - return self._generate_random_genomic_sequences(length) - - def _generate_random_genomic_sequences(self, length: int): - return "".join(self.rng.choice(["A", "C", "T", "G"], size=length)) - - -class SyntheticDataBundle: - """A container for synthetic count matrix, row and column indices, and alignment tags. - - Parameters - ---------- - count_matrix : np.ndarray - the cell x gene synthetic count matrix - row_index : List[str] - list of cell barcodes - col_index : List[str] - list of gene names - necessary_alignment_record_tags_list : List[AlignmentRecordTags] - list of necessary alignment tags; alignment records made using these tags are expected to produce - `count_matrix` once processed by the counting algorithm. - duplicate_alignment_tags_list : List[AlignmentRecordTags] - list of duplicate alignment tags (a subset of `necessary_alignment_record_tags_list`) - incomplete_alignment_tags_list : List[AlignmentRecordTags] - list of incomplete alignment tags (miss at least one of the required tags: cell, molecule, gene) - multiple_alignment_tags_list : List[List[AlignmentRecordTags]] - list of lists of multiple alignment tags; each list element is a list of alignment tags with the - same molecular barcodes, though, with multiple gene names. 
- - See Also - -------- - SyntheticBarcodedBAMGenerator - """ - - def __init__( - self, - count_matrix: np.ndarray, - row_index: List[str], - col_index: List[str], - necessary_alignment_record_tags_list: List[AlignmentRecordTags], - duplicate_alignment_tags_list: List[AlignmentRecordTags], - incomplete_alignment_tags_list: List[AlignmentRecordTags], - multiple_alignment_tags_list: List[List[AlignmentRecordTags]], - ) -> None: - - assert count_matrix.shape == ( - len(row_index), - len(col_index), - ), "The shape of the count matrix is inconsistent with the provided row/column indices" - - self.count_matrix = count_matrix - self.row_index = row_index - self.col_index = col_index - - self.necessary_alignment_record_tags_list = necessary_alignment_record_tags_list - self.duplicate_alignment_tags_list = duplicate_alignment_tags_list - self.incomplete_alignment_tags_list = incomplete_alignment_tags_list - self.multiple_alignment_tags_list = multiple_alignment_tags_list - - self.num_queries = ( - len(necessary_alignment_record_tags_list) - + len(duplicate_alignment_tags_list) - + len(incomplete_alignment_tags_list) - + len(multiple_alignment_tags_list) - ) - - -def _get_sorted_count_matrix( - count_matrix: np.ndarray, row_index: np.ndarray, col_index: np.ndarray -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Sorted the rows and columns of `count_matrix` and the associated row/column indices. - - Parameters - ---------- - count_matrix : np.ndarray - a cell x gene count matrix - row_index : np.ndarray - row index of the count matrix (i.e. cell barcodes) - col_index : np.ndarray - column index of the count matrix (i.e. 
gene names) - - Returns - ------- - Tuple[np.ndarray, np.ndarray, np.ndarray] - row/column sorted count matrix, sorted row index, sorted column index - """ - sorted_row_indices = [ - idx for idx, _ in sorted(enumerate(row_index), key=operator.itemgetter(1)) - ] - sorted_col_indices = [ - idx for idx, _ in sorted(enumerate(col_index), key=operator.itemgetter(1)) - ] - return ( - count_matrix[sorted_row_indices, :][:, sorted_col_indices], - row_index[sorted_row_indices], - col_index[sorted_col_indices], - ) - - -@pytest.mark.parametrize( - "alignment_sort_order", - [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()], - ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"], -) -def test_count_matrix_from_bam( - alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index -): - # instantiate a test data generator - synthetic_data_generator = SyntheticTaggedBAMGenerator( - _test_num_cells, _test_max_genes, gene_name_to_index, _test_gene_expression_rate - ) - - _test_temp_dir = tempfile.TemporaryDirectory() - try: - # generate test data - synthetic_data_generator.generate_synthetic_bam_and_counts_matrix( - _test_temp_dir.name, - _test_num_duplicates, - _test_num_missing_some_tags, - _test_num_multiple_gene_alignments, - _test_max_gene_hits_per_multiple_gene_alignments, - alignment_sort_order=alignment_sort_order, - ) - - # test data paths - test_bam_path = os.path.join( - _test_temp_dir.name, SyntheticTaggedBAMGenerator.bam_output_filename - ) - test_count_matrix_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedBAMGenerator.count_matrix_output_filename, - ) - test_row_index_path = os.path.join( - _test_temp_dir.name, SyntheticTaggedBAMGenerator.row_index_output_filename - ) - test_col_index_path = os.path.join( - _test_temp_dir.name, SyntheticTaggedBAMGenerator.col_index_output_filename - ) - - # create CountMatrix from the synthetic bam - count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( - 
test_bam_path, gene_name_to_index - ) - - # load the test counts matrix - count_matrix_data_expected = np.load(test_count_matrix_path) - row_index_expected = np.load(test_row_index_path) - col_index_expected = np.load(test_col_index_path) - - finally: - _test_temp_dir.cleanup() - - count_matrix_data_from_bam = count_matrix_from_bam.matrix.todense() - row_index_from_bam = count_matrix_from_bam.row_index - col_index_from_bam = count_matrix_from_bam.col_index - - # sort expected and from_bam results by their respective row and column indices, since their sort order - # is not part of the design specs and is considered arbitrary - ( - sorted_count_matrix_data_from_bam, - sorted_row_index_from_bam, - sorted_col_index_from_bam, - ) = _get_sorted_count_matrix( - count_matrix_data_from_bam, row_index_from_bam, col_index_from_bam - ) - ( - sorted_count_matrix_data_expected, - sorted_row_index_expected, - sorted_col_index_expected, - ) = _get_sorted_count_matrix( - count_matrix_data_expected, row_index_expected, col_index_expected - ) - - # assert equality of sorted count matrices and sorted row/col indices - assert np.allclose( - sorted_count_matrix_data_from_bam, sorted_count_matrix_data_expected - ) - assert all( - [ - row_name_from_bam == row_name_expected - for row_name_from_bam, row_name_expected in zip( - sorted_row_index_from_bam, sorted_row_index_expected - ) - ] - ) - assert all( - [ - col_name_from_bam == col_name_expected - for col_name_from_bam, col_name_expected in zip( - sorted_col_index_from_bam, sorted_col_index_expected - ) - ] - ) - - -def extract_gene_non_exons( - chromosome_gene_exons: Dict[str, List[tuple]], - chromosome_gene_locations_extended: Dict[str, List[tuple]], -) -> Dict[str, Dict[str, List[tuple]]]: - - chromosome_gene_non_exons = {} - - for chromosome in chromosome_gene_exons: - chromosome_gene_non_exons[chromosome] = {} - gene_name_exon_list = {} - for gene_exons in chromosome_gene_exons[chromosome]: - gene_name_exon_list[gene_exons[1]] = 
gene_exons[0] - - gene_name_location_dict = {} - for gene_locations in chromosome_gene_locations_extended[chromosome]: - gene_name_location_dict[gene_locations[1]] = gene_locations[0] - - for gene_name in gene_name_location_dict: - non_exon_list = [] - if gene_name in gene_name_exon_list: - - start, end = gene_name_location_dict[gene_name] - coords = gene_name_exon_list[gene_name] - coords.sort(key=lambda a: a[0]) - - x = start - y = coords[0][0] - 1 - i = 0 - - n = len(coords) - while i < n: - if y <= coords[i][0]: - if x < y: - non_exon_list.append((x, y)) - x = coords[i][1] - else: - x = max(x, coords[i][1]) - - if i < n - 1: - y = min(end, coords[i + 1][0]) - i += 1 - chromosome_gene_non_exons[chromosome][gene_name] = non_exon_list.copy() - - return chromosome_gene_non_exons - - -@pytest.mark.parametrize( - "alignment_sort_order", - [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()], - ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"], -) -def _count_matrix_with_introns( - alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index -): - - chromosomes_gene_locations_extended = gtf.extract_extended_gene_names( - _test_annotation_file - ) - chromosomes_gene_exons = gtf.extract_gene_exons(_test_annotation_file) - - _test_chromosomes_gene_non_exons = extract_gene_non_exons( - chromosomes_gene_exons, chromosomes_gene_locations_extended - ) - - _test_chromosomes_gene_exons = {} - for chromosome in chromosomes_gene_exons: - _test_chromosomes_gene_exons[chromosome] = {} - for gene_exons in chromosomes_gene_exons[chromosome]: - _test_chromosomes_gene_exons[chromosome][gene_exons[1]] = gene_exons[0] - - # instantiate a test data generator - chromosome = list(_test_chromosomes_gene_exons.keys())[0] - - synthetic_data_generator = SyntheticTaggedAlignmentTypeBAMGenerator( - _test_num_cells, - _test_max_genes, - _test_chromosomes_gene_exons[chromosome], - _test_chromosomes_gene_non_exons[chromosome], - ) - - 
_test_temp_dir = tempfile.TemporaryDirectory() - try: - # generate test data - synthetic_data_generator.generate_synthetic_bam_and_counts_matrix( - _test_temp_dir.name, - gene_name_to_index, - test_index, - alignment_sort_order=alignment_sort_order, - ) - - # test data paths - test_bam_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.bam_output_filename, - ) - test_count_matrix_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename, - ) - test_row_index_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename, - ) - test_col_index_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename, - ) - # create CountMatrix from the synthetic bam - if test_index == consts.SINGLE_CELL_COUNT_MATRIX: - count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( - test_bam_path, gene_name_to_index - ) - if test_index == consts.SINGLE_NUCLEI_COUNT_MATRIX: - count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( - test_bam_path, - gene_name_to_index, - chromosomes_gene_locations_extended=chromosomes_gene_locations_extended, - ) - - # load the test counts matrix - _count_matrix_data_expected = sp.csr_matrix(np.load(test_count_matrix_path)) - row_index_expected = np.load(test_row_index_path) - col_index_expected = np.load(test_col_index_path) - - count_matrix_data_expected = CountMatrix( - _count_matrix_data_expected, row_index_expected, col_index_expected - ) - count_matrix_data_expected = count_matrix_data_expected.matrix.todense() - - finally: - _test_temp_dir.cleanup() - - count_matrix_data_from_bam = count_matrix_from_bam.matrix.todense() - row_index_from_bam = count_matrix_from_bam.row_index - col_index_from_bam = count_matrix_from_bam.col_index - - # sort expected and from_bam results by their respective row and column indices, since 
their sort order - # is not part of the design specs and is considered arbitrary - ( - sorted_count_matrix_data_from_bam, - sorted_row_index_from_bam, - sorted_col_index_from_bam, - ) = _get_sorted_count_matrix( - count_matrix_data_from_bam, row_index_from_bam, col_index_from_bam - ) - ( - sorted_count_matrix_data_expected, - sorted_row_index_expected, - sorted_col_index_expected, - ) = _get_sorted_count_matrix( - count_matrix_data_expected, row_index_expected, col_index_expected - ) - - assert all( - [ - row_name_from_bam == row_name_expected - for row_name_from_bam, row_name_expected in zip( - sorted_row_index_from_bam, sorted_row_index_expected - ) - ] - ) - assert all( - [ - col_name_from_bam == col_name_expected - for col_name_from_bam, col_name_expected in zip( - sorted_col_index_from_bam, sorted_col_index_expected - ) - ] - ) - - assert np.allclose( - sorted_count_matrix_data_from_bam, sorted_count_matrix_data_expected - ) - - -class SyntheticTaggedAlignmentTypeBAMGenerator: - """This class generates a synthetic count matrix and an accompanying synthetic tagged BAM file as - described in the preamble documentation block. 
- - Parameters - ---------- - num_cells : int - number of real cells - max-genes : int - maximum number of genes to use to generate synthetic counts - chromosomes_gene_exons : Dict[str, Dict[str, List[tuple]]] - keys at the first level refers to chromosome number, keys at the - second level refers to a gene and with the list of exonic regions as values - chromosomes_gene_non_exons : Dict[str, Dict[str, List[tuple]]] - keys at the first level refers to chromosome number, keys at the - second level refers to a gene and with the list of intronic regions as values - - rng_seed : int - random number generator seed - - Methods - ------- - generate_synthetic_bam_and_counts_matrix - generates synthetic test data and writes the output to disk - - See Also - -------- - count.from_sorted_tagged_bam - """ - - OUTPUT_PREFIX = "intronic_" - SYNTHETIC_SEQUENCE_LENGTH = 5 - REFERENCE_SEQUENCE_NAME = "1" - # EXONIC_SEQUENCE_NAME = "EXONIC_SEQUENCE" - SYNTHETIC_SEQUENCE_LENGTH = 100 - - bam_output_filename = OUTPUT_PREFIX + "records.bam" - count_matrix_output_filename = OUTPUT_PREFIX + "count_matrix.npy" - row_index_output_filename = OUTPUT_PREFIX + "_row_index.npy" - col_index_output_filename = OUTPUT_PREFIX + "_col_index.npy" - - def __init__( - self, - num_cells: int, - max_genes: int, - chromosomes_gene_exons: Dict[str, Dict[str, List[tuple]]], - chromosomes_gene_non_exons: Dict[str, List[tuple]], - rng_seed: int = 777, - ) -> None: - self.num_cells = num_cells - - self.chromosomes_gene_exons = chromosomes_gene_exons - self.chromosomes_gene_non_exons = chromosomes_gene_non_exons - - # initialize the random number generator - self.rng: np.random.RandomState = np.random.RandomState(seed=rng_seed) - - # generate gene names - self.all_gene_names = list(self.chromosomes_gene_exons.keys())[:max_genes] - self.num_genes = len(self.all_gene_names) - - self.max_genes = max_genes - assert ( - max_genes <= self.num_genes - ), f"Max genes ({self.max_genes}) must be <= to all annotated genes 
({self.num_genes})" - self.to_be_used_gene_indices: List[int] = self.rng.choice( - np.arange(0, self.num_genes, dtype=np.int), - size=self.max_genes, - replace=False, - ).tolist() - self.to_be_used_gene_names = [ - self.all_gene_names[j] for j in self.to_be_used_gene_indices - ] - - def _generate_random_cell_barcode(self, length: int = 16): - return self._generate_random_genomic_sequences(length) - - def _generate_random_molecule_barcode(self, length: int = 10): - return self._generate_random_genomic_sequences(length) - - def _generate_random_genomic_sequences(self, length: int): - return "".join(self.rng.choice(["A", "C", "T", "G"], size=length)) - - def _generate_location_based_tag_list( - self, num_alignments: int, gene_names: List[str], alignment_location: str - ): - alignment_record_tags = [] - for i in range(num_alignments): - alignment_record_tags.append( - AlignmentRecordTags( - self._generate_random_cell_barcode(), - self._generate_random_molecule_barcode(), - gene_names[i], - alignment_location, - ) - ) - - return alignment_record_tags - - def _add_alignment_start_coordinates(self, alignment_tags, alignment_location): - _alignment_tags = [] - - for alignment_tag in alignment_tags: - if alignment_location == "EXONIC": - if alignment_tag.gene_name in self.chromosomes_gene_exons: - coord = self.chromosomes_gene_exons[alignment_tag.gene_name] - setattr(alignment_tag, "coordinate", coord[0][0] + 1) - _alignment_tags.append(alignment_tag) - - if alignment_location == "INTRONIC": - if alignment_tag.gene_name in self.chromosomes_gene_non_exons: - coord = self.chromosomes_gene_non_exons[alignment_tag.gene_name] - if coord: - setattr(alignment_tag, "coordinate", coord[0][0] + 1) - alignment_tag.gene_name = "" - _alignment_tags.append(alignment_tag) - - return _alignment_tags - - def generate_synthetic_bam_and_counts_matrix( - self, - output_path: str, - gene_name_to_index: int, - test_index: int, - alignment_sort_order: bam.AlignmentSortOrder = 
CellMoleculeGeneQueryNameSortOrder(), - ): - """Generates synthetic count matrix and BAM file and writes them to disk. - - Parameters - ---------- - output_path : str - output path - gene_name_to_index : Dict[str, int] - gene name to an index - test_index : int - 0 for single cell matrix and 1 for single nuclei matrix - alignment_sort_order : bam.AlignmentSortOrder - sort order of BAM alignment records; if 'None', random sort order is implied - - Returns - ------- - None - """ - - gene_names_alignments = [] - - for gene_name in sorted(self.chromosomes_gene_non_exons.keys()): - if self.chromosomes_gene_non_exons[gene_name]: - gene_names_alignments.append(gene_name) - - gene_names: List[int] = [] - cell_ids: List[int] = [] - - records = [] - # Only exons, expected in both single-cell and single-nuclei modes - exonic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[0:], "EXONIC" - ) - exonic_alignment_tags = self._add_alignment_start_coordinates( - exonic_alignment_tags, "EXONIC" - ) - - for i, alignment_tag in enumerate(exonic_alignment_tags): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tag, - "EXONIC", - i, - 10, - self.rng, - reference_start=alignment_tag.coordinate, - ) - records.append(pysam_alignment) - gene_names.append(alignment_tag.gene_name) - cell_ids.append(alignment_tag.cell_barcode) - - "Only introns only in single-nuclei mode" - intronic_alignment_tags = self._generate_location_based_tag_list( - 3, gene_names_alignments[10:], "INTRONIC" - ) - intronic_alignment_tags = self._add_alignment_start_coordinates( - intronic_alignment_tags, "INTRONIC" - ) - for i, alignment_tag in enumerate(intronic_alignment_tags): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tag, - "INTRONIC", - i + 10, - 10, - self.rng, - reference_start=alignment_tag.coordinate, - ) - records.append(pysam_alignment) - if test_index == 
consts.SINGLE_NUCLEI_COUNT_MATRIX: - gene_names.append(gene_names_alignments[i + 10]) - cell_ids.append(alignment_tag.cell_barcode) - - "both intron and exons from the same gene in bost single-cell and single-nuclei modes" - exonic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[20:], "EXONIC" - ) - exonic_alignment_tags = self._add_alignment_start_coordinates( - exonic_alignment_tags, "EXONIC" - ) - - _intronic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[20:], "INTRONIC" - ) - intronic_alignment_tags = [] - for intronic_tag, exonic_tag in zip( - _intronic_alignment_tags, exonic_alignment_tags - ): - intronic_tag.cell_barcode = exonic_tag.cell_barcode - intronic_alignment_tags.append(intronic_tag) - intronic_alignment_tags = self._add_alignment_start_coordinates( - intronic_alignment_tags, "INTRONIC" - ) - - for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate( - zip(exonic_alignment_tags, intronic_alignment_tags) - ): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - exonic_alignment_tag, - "EXONINTRONSAME", - i + 20, - 10, - self.rng, - reference_start=exonic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - intronic_alignment_tag, - "EXONINTRONSAME", - i + 20, - 10, - self.rng, - reference_start=intronic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - cell_ids.append(exonic_alignment_tag.cell_barcode) - gene_names.append(exonic_alignment_tag.gene_name) - - # both intron and exons from separate genes should not appear in single-cell mode - exonic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[30:], "EXONIC" - ) - exonic_alignment_tags = self._add_alignment_start_coordinates( - exonic_alignment_tags, "EXONIC" - ) - - _intronic_alignment_tags = 
self._generate_location_based_tag_list( - 10, gene_names_alignments[31:], "INTRONIC" - ) - intronic_alignment_tags = [] - for intronic_tag, exonic_tag in zip( - _intronic_alignment_tags, exonic_alignment_tags - ): - intronic_tag.cell_barcode = exonic_tag.cell_barcode - intronic_alignment_tags.append(intronic_tag) - intronic_alignment_tags = self._add_alignment_start_coordinates( - intronic_alignment_tags, "INTRONIC" - ) - - for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate( - zip(exonic_alignment_tags, intronic_alignment_tags) - ): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - exonic_alignment_tag, - "EXONINTRONSEP", - i + 30, - 10, - self.rng, - reference_start=exonic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - intronic_alignment_tag, - "EXONINTRONSEP", - i + 30, - 10, - self.rng, - reference_start=intronic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - - if test_index == consts.SINGLE_CELL_COUNT_MATRIX: - cell_ids.append(exonic_alignment_tag.cell_barcode) - gene_names.append(exonic_alignment_tag.gene_name) - - # write BAM file - with pysam.AlignmentFile( - os.path.join(output_path, self.bam_output_filename), - mode="wb", - reference_names=[self.REFERENCE_SEQUENCE_NAME], - reference_lengths=[self.SYNTHETIC_SEQUENCE_LENGTH], - ) as bo: - for record in records: - bo.write(record) - - n_genes = len(gene_name_to_index) - n_data = len(cell_ids) - # write count matrix, row index, and col index - count_matrix = np.zeros((n_data, n_genes), dtype=np.int32) - for i, (cell_id, gene_name) in enumerate(zip(cell_ids, gene_names)): - count_matrix[i][gene_name_to_index[gene_name]] = 1 - - test_count_matrix_path = os.path.join( - output_path, - SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename, - ) - test_row_index_path = os.path.join( - output_path, - 
SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename, - ) - test_col_index_path = os.path.join( - output_path, - SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename, - ) - - np.save(test_count_matrix_path, count_matrix) - np.save(test_row_index_path, cell_ids) - gene_rank = [(gene, rank) for gene, rank in gene_name_to_index.items()] - gene_rank.sort(key=lambda x: x[1]) - gene_names = [x[0] for x in gene_rank] - np.save(test_col_index_path, gene_names) - - return os.path.join(output_path, self.bam_output_filename) diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_encodings.py b/tools/scripts/sctools/build/lib/sctools/test/test_encodings.py deleted file mode 100644 index 1eeb4584..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_encodings.py +++ /dev/null @@ -1,97 +0,0 @@ -import pytest -from .. import encodings -from itertools import combinations - - -@pytest.fixture(scope="module") -def sequence(): - return b"ACGTTTGAGATGAGATATAGANNNN" - - -@pytest.fixture(scope="module") -def encoder_2bit(sequence): - length = len(sequence) - return encodings.TwoBit(length) - - -@pytest.fixture(scope="module") -def encoder_3bit(): - return encodings.ThreeBit() - - -@pytest.fixture(scope="module", params=[encodings.TwoBit, encodings.ThreeBit]) -def encoder(request): - return request.param - - -def test_two_bit_encode_decode_produces_same_string_except_for_N( - sequence, encoder_2bit -): - encoded = encoder_2bit.encode(sequence) - decoded = encoder_2bit.decode(encoded) - assert sequence[:4] == decoded[:4] # last 4 are N, which get randomized - - -def test_three_bit_encode_decode_produces_same_string(sequence, encoder_3bit): - encoded = encoder_3bit.encode(sequence) - decoded = encoder_3bit.decode(encoded) - assert sequence == decoded - - -def test_two_bit_encoder_gets_correct_gc_content(encoder_2bit): - sequence_no_n = b"AGCGCGAT" - gc_content = sequence_no_n.count(b"C") + sequence_no_n.count(b"G") - encoded = 
encoder_2bit.encode(sequence_no_n) - assert encoder_2bit.gc_content(encoded) == gc_content - - -def test_three_bit_encoder_gets_correct_gc_content(sequence, encoder_3bit): - encoded = encoder_3bit.encode(sequence) - assert encoder_3bit.gc_content(encoded) == sequence.count(b"C") + sequence.count( - b"G" - ) - - -def test_two_bit_throws_errors_when_asked_to_encode_unknown_nucleotide(encoder_2bit): - with pytest.raises(KeyError): - encoder_2bit.encode(b"ACGTP") # P is not a valid code - - -def test_three_bit_encodes_unknown_nucleotides_as_N(encoder_3bit): - encoded = encoder_3bit.encode(b"ACGTP") # P is not a valid code - decoded = encoder_3bit.decode(encoded) - assert decoded == b"ACGTN" - - -@pytest.fixture -def simple_barcodes(): - """simple barcode set with min_hamming = 1, max_hamming = 2""" - return [b"ACGT", b"ACGG", b"ACGA", b"ACGC", b"TCGT", b"CCGT", b"GCGT"] - - -@pytest.fixture -def simple_hamming_distances(simple_barcodes): - simple_hamming_distances = [] - for a, b in combinations(simple_barcodes, 2): - d_hamming = 0 - for i, j in zip(a, b): - if i != j: - d_hamming += 1 - simple_hamming_distances.append(d_hamming) - return simple_hamming_distances - - -def test_encoded_hamming_distance_is_accurate( - simple_hamming_distances, simple_barcodes, encoder -): - # encode simple barcodes - tbe = encoder(4) - encoded = [tbe.encode(b) for b in simple_barcodes] - encoded_hamming_distances = [] - - # use hamming distance function - for a, b in combinations(encoded, 2): - encoded_hamming_distances.append(tbe.hamming_distance(a, b)) - - # verify they are the same as the simple function used in this file - assert simple_hamming_distances == encoded_hamming_distances diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_entrypoints.py b/tools/scripts/sctools/build/lib/sctools/test/test_entrypoints.py deleted file mode 100644 index 419b3257..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_entrypoints.py +++ /dev/null @@ -1,307 +0,0 @@ -import 
glob -import os -import tempfile - -import numpy as np -import pysam -import pytest -import scipy.sparse as sp - -from sctools import bam, platform, count, consts - -data_dir = os.path.split(__file__)[0] + "/data/" - - -def test_Attach10XBarcodes_entrypoint(): - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - ] - - rc = platform.TenXV2.attach_barcodes(args) - assert rc == 0 - with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: - for alignment in f: - # each alignment should now have a tag, and that tag should be a string - assert isinstance( - alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str - ) - assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str - ) - os.remove("test_tagged_bam.bam") # clean up - - -def test_Attach10XBarcodes_entrypoint_with_whitelist(): - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - ] - - return_call = platform.TenXV2.attach_barcodes(args) - assert return_call == 0 - success = False - with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: - for alignment in f: - if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): - success = True - # each alignment should now have a tag, and that tag should be a string - assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) - assert isinstance( - 
alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str - ) - assert success - os.remove("test_tagged_bam.bam") # clean up - - -def test_AttachBarcodes_entrypoint_with_whitelist(): - # test of the BarcodePlatform.attach_barcodes entry point with - # sample, cell, and molecule barcodes all specified - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - "--sample-barcode-start-position", - "0", - "--sample-barcode-length", - "8", - "--cell-barcode-start-position", - "0", - "--cell-barcode-length", - "16", - "--molecule-barcode-start-position", - "16", - "--molecule-barcode-length", - "7", # changed 10>7 intentionally for test - ] - - return_call = platform.BarcodePlatform.attach_barcodes(args) - assert return_call == 0 - success = False - with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: - for alignment in f: - if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): - success = True - # each alignment should now have a tag, and that tag should be a string - assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str - ) - assert len(alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY)) == 7 - assert isinstance( - alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str - ) - assert 
isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str - ) - assert success - os.remove("test_tagged_bam.bam") # clean up - - -def test_split_bam(): - tag_args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - ] - - platform.TenXV2.attach_barcodes(tag_args) - - split_args = [ - "--bamfile", - "test_tagged_bam.bam", - "--output-prefix", - "test_tagged", - "--subfile-size", - "0.005", - "--tags", - consts.CELL_BARCODE_TAG_KEY, - consts.RAW_CELL_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.split_bam(split_args) - assert return_call == 0 - - for f in glob.glob("test_tagged*"): - os.remove(f) - - -def test_tag_sort_bam(): - args = [ - "-i", - data_dir + "unsorted.bam", - "-o", - "test_sorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.tag_sort_bam(args) - assert return_call == 0 - - tag_keys = [ - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - with pysam.AlignmentFile("test_sorted.bam", "rb") as f: - segments = f.fetch(until_eof=True) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - for f in glob.glob("test_sorted*"): - os.remove(f) - - -def test_tag_sort_bam_dash_t_specified_multiple_times(): - args = [ - "-i", - data_dir + "unsorted.bam", - "-o", - "test_sorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - "-t", - consts.GENE_NAME_TAG_KEY, - "-t", - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.tag_sort_bam(args) - assert return_call == 0 - - tag_keys = [ - 
consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - with pysam.AlignmentFile("test_sorted.bam", "rb") as f: - segments = f.fetch(until_eof=True) - tag_sortable_record_generator = ( - bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments - ) - bam.verify_sort(tag_sortable_record_generator, tag_keys) - - for f in glob.glob("test_sorted*"): - os.remove(f) - - -def test_tag_sort_bam_no_tags(): - args = ["-i", data_dir + "unsorted.bam", "-o", "test_sorted.bam"] - - return_call = platform.GenericPlatform.tag_sort_bam(args) - assert return_call == 0 - - tag_keys = [] - with pysam.AlignmentFile("test_sorted.bam", "rb") as f: - segments = f.fetch(until_eof=True) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - for f in glob.glob("test_sorted*"): - os.remove(f) - - -def test_verify_bam_sort(): - args = [ - "-i", - data_dir + "cell-gene-umi-queryname-sorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.verify_bam_sort(args) - assert return_call == 0 - - -def test_verify_bam_sort_raises_error_on_unsorted(): - args = [ - "-i", - data_dir + "unsorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - with pytest.raises(bam.SortError): - platform.GenericPlatform.verify_bam_sort(args) - - -def test_count_merge(): - tmp = tempfile.mkdtemp() - - data, ind, col = [np.arange(10)] * 3 - matrix = sp.coo_matrix((data, (ind, col)), shape=(10, 10), dtype=np.float32).tocsr() - # be lazy and reuse the inds as the col and row index - counts = count.CountMatrix(matrix, ind, col) - counts.save(tmp + "/test_input_1") - counts.save(tmp + "/test_input_2") - - merge_args = [ - "-o", - tmp + "/test_merged_counts", - "-i", - tmp + "/test_input_2", - tmp + 
"/test_input_1", - ] - return_call = platform.GenericPlatform.merge_count_matrices(merge_args) - assert return_call == 0 diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_fastq.py b/tools/scripts/sctools/build/lib/sctools/test/test_fastq.py deleted file mode 100644 index fdf8f58c..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_fastq.py +++ /dev/null @@ -1,275 +0,0 @@ -import os -import string -from functools import partial -from itertools import product - -import pytest - -from .. import fastq, consts -from ..reader import zip_readers - -# set some useful globals for testing -data_dir = os.path.split(__file__)[0] + "/data/" -_i7_files = [ - data_dir + f for f in ("test_i7.fastq", "test_i7.fastq.gz", "test_i7.fastq.bz2") -] -_files = [data_dir + f for f in ("test_i7.fastq", "test_r1.fastq", "test_r2.fastq")] -_gz_files = [ - data_dir + f for f in ("test_i7.fastq.gz", "test_r1.fastq.gz", "test_r2.fastq.gz") -] -_bz2_files = [ - data_dir + f - for f in ("test_i7.fastq.bz2", "test_r1.fastq.bz2", "test_r2.fastq.bz2") -] - -_modes = ("r", "rb") -_files_and_modes = list(product(_i7_files, _modes)) -_multifiles_and_modes = list(product((_files, _gz_files, _bz2_files), _modes)) -_map_encoder = {"r": str, "rb": partial(bytes, encoding="utf-8")} - - -# TEST READER - - -@pytest.fixture(scope="module", params=_files_and_modes) -def i7_files_compressions_and_modes(request): - """generates different compression types and modes for testing""" - return request.param[0], request.param[1] - - -@pytest.fixture(scope="module", params=_multifiles_and_modes) -def reader_all_compressions(request): - """generates open fastq reader files for each compression and read mode""" - return fastq.Reader(request.param[0], request.param[1]) - - -@pytest.fixture(scope="module") -def bytes_fastq_record(): - return [b"@name\n", b"ACTACAAT\n", b"+\n", b"%%%%AAAA\n"] - - -@pytest.fixture(scope="module") -def string_fastq_record(): - return ["@name\n", "ACTACAAT\n", "+\n", 
"%%%%AAAA\n"] - - -def test_reader_stores_filenames(): - names = ["notreal", "fake"] - rd = fastq.Reader(files=names) - assert rd.filenames == names - - -def test_reader_reads_first_record(reader_all_compressions): - for record in reader_all_compressions: - assert isinstance(record, fastq.Record) - expected_result = ( - "NCACAATG\n" if isinstance(record.sequence, str) else b"NCACAATG\n" - ) - assert record.sequence == expected_result - break # just first record - - -def test_reader_skips_header_character_raises_value_error( - i7_files_compressions_and_modes, -): - """ - test should skip the first name line, shifting each record up 1. As a result, the - first sequence should be found in the name field - """ - filename, mode = i7_files_compressions_and_modes - rd = fastq.Reader(filename, mode=mode, header_comment_char="@") - with pytest.raises(ValueError): - next(iter(rd)) - - -def test_reader_reads_correct_number_of_records_across_multiple_files( - reader_all_compressions, -): - assert len(reader_all_compressions) == 300 # 3 files - - -def test_mixed_filetype_read_gets_correct_record_number(): - rd = fastq.Reader([_gz_files[0], _bz2_files[0]], mode="r", header_comment_char="#") - - assert len(rd) == 200 - - -def test_non_string_filename_raises_typeerror(): - with pytest.raises(TypeError): - _ = fastq.Reader(10, "r") - - -def test_non_string_filename_in_iterable_raises_typeerror(): - with pytest.raises(TypeError): - _ = fastq.Reader(("works", 10), "r") - - -def test_invalid_open_mode_raises_valueerror(): - with pytest.raises(ValueError): - _ = fastq.Reader("works", "not_acceptable_open_mode") - - -def test_fastq_returns_correct_filesize_for_single_and_multiple_files(): - rd = fastq.Reader( - _i7_files[0], mode="r", header_comment_char="#" # mode irrelevant - ) - assert rd.size == 7774 - - rd = fastq.Reader(_i7_files, mode="r", header_comment_char="#") # mode irrelevant - assert rd.size == 7774 + 853 + 802 # three file sizes - - -def 
test_reader_properly_subsets_based_on_indices(): - rd = fastq.Reader(_i7_files[0], mode="r") - indices = {0, 5, 10, 12} - n_records = sum(1 for _ in rd.select_record_indices(indices)) - assert n_records == len(indices) - - -def test_zipping_readers_generates_expected_output(): - rd1 = fastq.Reader(_files[0], "r") - rd2 = fastq.Reader(_files[0], "r") - for r1, r2 in zip_readers(rd1, rd2): - assert isinstance(r1, fastq.Record) - assert isinstance(r2, fastq.Record) - expected_result = "NCACAATG\n" - assert r1.sequence == r2.sequence == expected_result - break # just first record - - -def test_zipping_readers_with_indices_generates_expected_output(): - rd1 = fastq.Reader(_files[0], "r") - rd2 = fastq.Reader(_files[0], "r") - indices = {0, 1, 2, 3} - for r1, r2 in zip_readers(rd1, rd2, indices=indices): - assert isinstance(r1, fastq.Record) - assert isinstance(r2, fastq.Record) - expected_result = "NCACAATG\n" - assert r1.sequence == r2.sequence == expected_result - break # just first record - - -def test_printing_bytes_record_generates_valid_fastq_record(bytes_fastq_record): - record = fastq.Record(bytes_fastq_record) - assert str(record) == b"".join(bytes_fastq_record).decode() - assert bytes(record) == b"".join(bytes_fastq_record) - - -def test_bytes_fastq_record_quality_score_parsing(bytes_fastq_record): - record = fastq.Record(bytes_fastq_record) - assert record.average_quality() == 18 - - -def test_printing_string_record_generates_valid_fastq_record(string_fastq_record): - record = fastq.StrRecord(string_fastq_record) - assert str(record) == "".join(string_fastq_record) - assert bytes(record) == "".join(string_fastq_record).encode() - - -def test_string_fastq_record_quality_score_parsing(string_fastq_record): - record = fastq.StrRecord(string_fastq_record) - assert record.average_quality() == 18 - - -# TEST RECORD - - -def test_fields_populate_properly(reader_all_compressions): - encoder = _map_encoder[reader_all_compressions._mode] - name_prefix = encoder("@") - 
alphabet = set(encoder("ACGTN")) - name2_string = encoder("+\n") - ascii_chars = set(i for i in encoder(string.printable)) - for record in reader_all_compressions: - assert record.name.startswith(name_prefix) - assert all(i in alphabet for i in record.sequence.strip()) - assert record.name2 == name2_string - assert all(i in ascii_chars for i in record.quality.strip()) - - -# TEST BarcodeGeneratorWithCorrectedCellbarcodes - - -@pytest.fixture(scope="function") -def embedded_barcode_generator(): - cell_barcode = fastq.EmbeddedBarcode( - start=0, - end=16, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - molecule_barcode = fastq.EmbeddedBarcode( - start=16, - end=26, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - return fastq.EmbeddedBarcodeGenerator( - data_dir + "test_r1.fastq.gz", [cell_barcode, molecule_barcode] - ) - - -@pytest.fixture(scope="function") -def barcode_generator_with_corrected_cell_barcodes(): - cell_barcode = fastq.EmbeddedBarcode( - start=0, - end=16, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - molecule_barcode = fastq.EmbeddedBarcode( - start=16, - end=26, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - return fastq.BarcodeGeneratorWithCorrectedCellBarcodes( - data_dir + "test_r1.fastq.gz", - cell_barcode, - data_dir + "1k-august-2016.txt", - [molecule_barcode], - ) - - -def test_embedded_barcode_generator_produces_outputs_of_expected_size( - embedded_barcode_generator, -): - for cell_seq, cell_qual, umi_seq, umi_qual in embedded_barcode_generator: - - # correct values - correct_cell_barcode_length = 16 - correct_umi_length = 10 - - # note that all barcodes are strings and therefore should get 'Z' values - - # test cell tags - assert cell_seq[0] == consts.RAW_CELL_BARCODE_TAG_KEY - assert 
len(cell_seq[1]) == correct_cell_barcode_length - assert all(v in "ACGTN" for v in cell_seq[1]) - assert cell_seq[2] == "Z" - assert cell_qual[0] == consts.QUALITY_CELL_BARCODE_TAG_KEY - assert len(cell_qual[1]) == correct_cell_barcode_length - assert all(v in string.printable for v in cell_qual[1]) - assert cell_seq[2] == "Z" - - # test umi tags - assert umi_seq[0] == consts.RAW_MOLECULE_BARCODE_TAG_KEY - assert len(umi_seq[1]) == correct_umi_length - assert all(v in "ACGTN" for v in umi_seq[1]) - assert umi_seq[2] == "Z" - assert umi_qual[0] == consts.QUALITY_MOLECULE_BARCODE_TAG_KEY - assert len(umi_qual[1]) == correct_umi_length - assert all(v in string.printable for v in umi_qual[1]) - assert umi_seq[2] == "Z" - - break # just the first tag is fine - - -def test_corrects_barcodes(barcode_generator_with_corrected_cell_barcodes): - success = False - for barcode_sets in barcode_generator_with_corrected_cell_barcodes: - for barcode_set in barcode_sets: - if barcode_set[0] == consts.CELL_BARCODE_TAG_KEY: - success = True - break - assert success diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_groups.py b/tools/scripts/sctools/build/lib/sctools/test/test_groups.py deleted file mode 100644 index 71d24539..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_groups.py +++ /dev/null @@ -1,345 +0,0 @@ -import os -import csv -import itertools -from sctools import platform - - -data_dir = os.path.split(__file__)[0] + "/data/group_metrics/" -unpaired_data_dir = os.path.split(__file__)[0] + "/data/group_metrics_unpaired_ss2/" - - -def check_parsed_metrics_csv(file_name, cell_id, class_name, expected_metrics): - with open(file_name) as f: - column_headers = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - metrics = f.readline().strip().split(",") - assert classes[0] == "Class" - assert set(classes[1:]) == {class_name} - for idx, each in enumerate(column_headers): - if idx == 0: - assert metrics[0] == cell_id - if idx > 
0: - metric_name = column_headers[idx] - assert metrics[idx] == expected_metrics[metric_name] - - -def test_write_aggregated_picard_metrics_by_row(): - args = [ - "-f", - data_dir + "test_qc.alignment_summary_metrics.txt", - data_dir + "test_qc.insert_size_metrics.txt", - data_dir + "test_qc.duplicate_metrics.txt", - data_dir + "test_qc.rna_metrics.txt", - data_dir + "test_qc.gc_bias.summary_metrics.txt", - "-t", - "Picard", - "-o", - "output_picard_group", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = {} - with open(data_dir + "expected_picard_group.csv") as f: - column_headers = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - metrics = f.readline().strip().split(",") - for idx, each in enumerate(column_headers): - expected_metrics[each] = {"class": classes[idx], "metric": metrics[idx]} - with open("output_picard_group.csv") as f: - column_headers = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - metrics = f.readline().strip().split(",") - assert len(column_headers) == len(expected_metrics.keys()) - for idx, each in enumerate(column_headers): - header = expected_metrics[each] - assert classes[idx] == header["class"] - assert metrics[idx] == header["metric"] - os.remove("output_picard_group.csv") - - -def test_write_aggregated_picard_metrics_by_table(): - args = [ - "-t", - "PicardTable", - "-o", - "output_picard_group", - "-f", - data_dir + "test_qc.error_summary_metrics.txt", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = [ - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "C"), - ("ALT_COUNT", "16"), - ("REF_BASE", "A"), - ("REF_COUNT", "231512"), - ("SUBSTITUTION", "A>C"), - ("SUBSTITUTION_RATE", "6.9e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "G"), - ("ALT_COUNT", "156"), - ("REF_BASE", "A"), - ("REF_COUNT", "231512"), - ("SUBSTITUTION", 
"A>G"), - ("SUBSTITUTION_RATE", "0.000673"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "T"), - ("ALT_COUNT", "16"), - ("REF_BASE", "A"), - ("REF_COUNT", "231512"), - ("SUBSTITUTION", "A>T"), - ("SUBSTITUTION_RATE", "6.9e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "A"), - ("ALT_COUNT", "16"), - ("REF_BASE", "C"), - ("REF_COUNT", "173880"), - ("SUBSTITUTION", "C>A"), - ("SUBSTITUTION_RATE", "9.2e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "G"), - ("ALT_COUNT", "14"), - ("REF_BASE", "C"), - ("REF_COUNT", "173880"), - ("SUBSTITUTION", "C>G"), - ("SUBSTITUTION_RATE", "8.1e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "T"), - ("ALT_COUNT", "82"), - ("REF_BASE", "C"), - ("REF_COUNT", "173880"), - ("SUBSTITUTION", "C>T"), - ("SUBSTITUTION_RATE", "0.000471"), - ] - ), - ] - - with open("output_picard_group_error_summary_metrics.csv") as f: - reader = csv.DictReader(f) - - i = 0 - match_list = [] - for line in reader: - assert line in expected_metrics - i = i + 1 - - # expect the same set, list to be precise, of indices - assert i == len(expected_metrics) - - os.remove("output_picard_group_error_summary_metrics.csv") - - -def test_parse_hisat2_paired_end_log(): - args = [ - "-f", - data_dir + "test_hisat2_paired_end_qc.log", - "-t", - "HISAT2", - "-o", - "output_hisat2", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - cell_id = "test_hisat2_paired_end" - tag = "HISAT2G" - expected_metrics = { - "Total pairs": "5479", - "Aligned concordantly or discordantly 0 time": "412", - "Aligned concordantly 1 time": "4414", - "Aligned concordantly >1 times": "652", - "Aligned discordantly 1 time": "1", - "Total unpaired reads": "824", - "Aligned 0 time": "478", - "Aligned 1 time": "240", - "Aligned >1 times": "106", - "Overall alignment rate": "95.64%", - } - check_parsed_metrics_csv("output_hisat2.csv", cell_id, tag, expected_metrics) - 
os.remove("output_hisat2.csv") - - -def test_parse_hisat2_transcriptome_log(): - args = [ - "-f", - data_dir + "test_hisat2_transcriptome_rsem.log", - "-t", - "HISAT2", - "-o", - "output_hisat2_trans", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - cell_id = "test_hisat2_transcriptome" - tag = "HISAT2T" - expected_metrics = { - "Total pairs": "5479", - "Aligned concordantly or discordantly 0 time": "3635", - "Aligned concordantly 1 time": "360", - "Aligned concordantly >1 times": "1484", - "Aligned discordantly 1 time": "0", - "Total unpaired reads": "7270", - "Aligned 0 time": "7270", - "Aligned 1 time": "0", - "Aligned >1 times": "0", - "Overall alignment rate": "33.66%", - } - check_parsed_metrics_csv("output_hisat2_trans.csv", cell_id, tag, expected_metrics) - os.remove("output_hisat2_trans.csv") - - -def test_parse_rsem_cnt(): - file_name = data_dir + "test_rsem.cnt" - args = ["-f", file_name, "-t", "RSEM", "-o", "output_rsem"] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - cell_id = "test" - class_name = "RSEM" - expected_metrics = None - with open(file_name) as f: - N0, N1, N2, N_tot = f.readline().strip().split(" ") - n_unique, n_multi, n_uncertain = f.readline().strip().split(" ") - n_hits, read_type = f.readline().strip().split(" ") - expected_metrics = { - "unalignable reads": N0, - "alignable reads": N1, - "filtered reads": N2, - "total reads": N_tot, - "unique aligned": n_unique, - "multiple mapped": n_multi, - "total alignments": n_hits, - "strand": read_type, - "uncertain reads": n_uncertain, - } - check_parsed_metrics_csv("output_rsem.csv", cell_id, class_name, expected_metrics) - os.remove("output_rsem.csv") - - -def test_write_aggregated_qc_metrics(): - input_files = [ - data_dir + "test_picard_group.csv", - data_dir + "test_hisat2.csv", - data_dir + "test_hisat2_trans.csv", - data_dir + "test_rsem.csv", - ] - args = [ - "-f", - data_dir + 
"test_picard_group.csv", - data_dir + "test_hisat2.csv", - data_dir + "test_hisat2_trans.csv", - data_dir + "test_rsem.csv", - "-t", - "Core", - "-o", - "output_QCs", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = [] - expected_headers = [] - for input_file in input_files: - with open(input_file) as f: - reader = csv.DictReader(f) - expected_headers.extend(reader.fieldnames[1:]) - for idx, line in enumerate(reader): - if len(expected_metrics) < idx + 1: - expected_metrics.append(line) - else: - expected_metrics[idx].update(line) - output_headers = [] - with open("output_QCs.csv") as output_file: - reader = csv.DictReader(output_file) - output_headers.extend(reader.fieldnames) - for line in reader: - assert line in expected_metrics - # The output file should contain all of the column headers from the input files plus the "joined column" containing row headers - assert len(output_headers) == len(expected_headers) + 1 - os.remove("output_QCs.csv") - - -def test_unpaired_ss2_write_aggregated_picard_metrics_by_row(): - - sources = [ - unpaired_data_dir + "SRR6258488_qc.alignment_summary_metrics.txt", - unpaired_data_dir + "SRR6258488_qc.duplicate_metrics.txt", - unpaired_data_dir + "SRR6258488_qc.gc_bias.summary_metrics.txt", - unpaired_data_dir + "SRR6258488_qc.rna_metrics.txt", - ] - - args = ["-f", *sources, "-t", "Picard", "-o", "output_picard_group_unpaired"] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = {} - - for source in sources: - with open(source) as f: - for line in f: - if line.startswith("## METRICS CLASS"): - class_ = line.strip().split("\t")[1].split(".")[-1] - break - labels = f.readline().strip().split("\t") - values = f.readline().strip().split("\t") - - for label, value in itertools.zip_longest(labels, values, fillvalue=""): - if label in ("LIBRARY", "SAMPLE", "READ_GROUP", "CATEGORY"): - continue - if class_ == 
"AlignmentSummaryMetrics": - label += ".UNPAIRED" - try: - value = str(float(value)) - except ValueError: - pass - expected_metrics[(class_, label)] = value - expected_metrics[("Class", "")] = "SRR6258488" - - with open("output_picard_group_unpaired.csv") as f: - labels = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - values = f.readline().strip().split(",") - assert len(labels) == len(expected_metrics) - - for class_, label in expected_metrics: - if class_ not in classes or label not in labels: - print("!", class_, label) - - for class_, label, value in zip(classes, labels, values): - assert (class_, label) in expected_metrics - try: - value = str(float(value)) - except ValueError: - value = value - try: - expected_value = str(float(expected_metrics[(class_, label)])) - except ValueError: - expected_value = expected_metrics[(class_, label)] - assert value == expected_value - os.remove("output_picard_group_unpaired.csv") diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_gtf.py b/tools/scripts/sctools/build/lib/sctools/test/test_gtf.py deleted file mode 100644 index fd74ea91..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_gtf.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -from .. 
import gtf -from itertools import chain -import pytest - -_data_dir = os.path.split(__file__)[0] + "/data" -_files = ["%s/%s" % (_data_dir, f) for f in ("test.gtf", "test.gtf.gz", "test.gtf.bz2")] - - -@pytest.fixture(scope="module", params=_files) -def files(request): - """returns a filename""" - return request.param - - -def test_opens_file_reads_first_line(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - assert isinstance(record, gtf.GTFRecord) - - -def test_opens_file_populates_fields_properly(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - assert record.seqname == "chr19" - assert record.chromosome == "chr19" - assert record.source == "HAVANA" - assert record.feature == "gene" - assert record.start == 60951 - assert record.end == 71626 - assert record.score == "." - assert record.strand == "-" - assert record.frame == "." - - expected_features = { - "gene_id": "ENSG00000282458.1", - "gene_type": "transcribed_processed_pseudogene", - "gene_status": "KNOWN", - "gene_name": "WASH5P", - "level": "2", - "havana_gene": "OTTHUMG00000180466.8", - } - assert record._attributes == expected_features - - assert all( - i in str(record) - for i in chain(expected_features.keys(), expected_features.values()) - ) - - -def test_set_attribute_verify_included_in_output_string(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - record.set_attribute("test_attr", "foo") - assert record.get_attribute("test_attr") == "foo" - - # verify in output string - assert "foo" in str(record) - - -def test_opens_file_parses_size(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - assert 71626 - 60951 == record.size - - # mangle record, make sure error is raised - record._fields[3:5] = [record.end, record.start] - with pytest.raises(ValueError): - getattr(record, "size") diff --git 
a/tools/scripts/sctools/build/lib/sctools/test/test_metrics.py b/tools/scripts/sctools/build/lib/sctools/test/test_metrics.py deleted file mode 100644 index 303c573d..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_metrics.py +++ /dev/null @@ -1,930 +0,0 @@ -import fileinput -import math -import os -import tempfile -from typing import Callable - -import numpy as np -import pandas as pd -import pytest -from sctools.metrics.gatherer import ( - GatherGeneMetrics, - GatherCellMetrics, - MetricGatherer, -) -from sctools.metrics.merge import MergeCellMetrics, MergeGeneMetrics -from sctools.platform import TenXV2 - -""" -Testing Data Definition & Acquisition - -The data hardcoded into this file come from the two notebooks associated with these metrics: -characterize-cell-testing-data.ipynb and characterize-gene-testing-data.ipynb. In these -notebooks, the testing .bam files are loaded into memory and interrogated for each of the -metrics in question using pandas and numpy commands. These independent calculation provide the -hard-coded data found in these tests. When testing data is changed, the notebook can be updated -to re-calculate the values found in this file. 
-""" - -# set the input and output directories, using a tempdir to automatically clean up generated files -_data_dir = os.path.split(__file__)[0] + "/data" -_test_dir = tempfile.mkdtemp() -os.makedirs(_test_dir, exist_ok=True) - -# note, to inspect these testing files, please install samtools and use the following command: -# samtools view | less - -# set the input files -_gene_sorted_bam = os.path.join(_data_dir, "small-gene-sorted.bam") -_cell_sorted_bam = os.path.join(_data_dir, "small-cell-sorted.bam") -_cell_sorted_bam_missing_cell_barcodes = os.path.join( - _data_dir, "cell-sorted-missing-cb.bam" -) - -# specify filenames for temporary metrics outputs that are used in the following tests -_gene_metric_output_file = os.path.join(_test_dir, "gene_metrics.csv.gz") -_cell_metric_output_file = os.path.join(_test_dir, "cell_metrics.csv.gz") -_cell_metric_output_file_missing_cell_barcodes = os.path.join( - _test_dir, "cell_metrics_missing_cb.csv.gz" -) - -# run the gene metrics suite -gene_gatherer = GatherGeneMetrics(_gene_sorted_bam, _gene_metric_output_file) -gene_gatherer.extract_metrics() -_gene_metrics = pd.read_csv(_gene_metric_output_file, index_col=0) - -# run the cell metrics suite -cell_gatherer = GatherCellMetrics(_cell_sorted_bam, _cell_metric_output_file) -cell_gatherer.extract_metrics() -_cell_metrics = pd.read_csv(_cell_metric_output_file, index_col=0) - -# run the cell metrics suite -cell_gatherer_missing_cbs = GatherCellMetrics( - _cell_sorted_bam_missing_cell_barcodes, - _cell_metric_output_file_missing_cell_barcodes, -) -cell_gatherer_missing_cbs.extract_metrics() -_cell_metrics_missing_cbs = pd.read_csv( - _cell_metric_output_file_missing_cell_barcodes, index_col=0 -) - - -def test_calculate_cell_metrics_cli(): - """test the sctools cell metrics CLI invocation""" - cell_metrics_csv = os.path.join(_test_dir, "cell_metrics.csv") - return_call = TenXV2.calculate_cell_metrics( - args=["-i", _cell_sorted_bam, "-o", cell_metrics_csv] - ) - assert 
return_call == 0 - - -def test_calculate_gene_metrics_cli(): - """test the sctools gene metrics CLI invocation""" - gene_metrics_csv = os.path.join(_test_dir, "gene_metrics.csv") - return_call = TenXV2.calculate_gene_metrics( - args=["-i", _gene_sorted_bam, "-o", gene_metrics_csv] - ) - assert return_call == 0 - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 300), (_cell_metrics, 656)] -) -def test_metrics_n_reads(metrics, expected_value): - """test that the metrics identify the correct read number""" - assert metrics["n_reads"].sum() == expected_value - - -def test_cell_metrics_mean_n_genes_observed(): - """ - test that the GatherCellMetrics method identifies the correct number of genes per cell, on - average. - """ - genes_observed = _cell_metrics["n_genes"].mean() - assert math.isclose(genes_observed, 1.9827, abs_tol=1e-4), "%f != %f" % ( - genes_observed, - 1.9827, - ) - - -def test_gene_metrics_n_genes(): - """Test that GatherGeneMetrics identifies the total number of genes in the test file""" - genes_observed = _gene_metrics.shape[0] - assert genes_observed == 8 - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 88), (_cell_metrics, 249)] -) -def test_metrics_n_molecules(metrics, expected_value): - """Test that each metric identifies the total number of molecules in the test file - - Molecules are defined as a unique combination of {cell barcode, molecule barcode, gene} - """ - molecules_observed = metrics["n_molecules"].sum() - assert molecules_observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 217), (_cell_metrics, 499)] -) -def test_metrics_n_fragments(metrics, expected_value): - """Test that each metric identifies the total number of fragments in the test file. 
- - Fragments are defined as a unique combination of {cell barcode, molecule barcode, strand, - position, chromosome} - """ - fragments_observed = metrics["n_fragments"].sum() - assert fragments_observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_gene_metrics, "AL627309.7"), (_cell_metrics, "AAACCTGGTAGAAGGA")], -) -def test_metrics_highest_expression_class(metrics, expected_value): - """ - for gene metrics, this is the highest expression gene. For cell metrics, this is the highest - expression cell. - """ - observed_max_gene = metrics["n_reads"].idxmax() - assert observed_max_gene == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 245), (_cell_metrics, 94)] -) -def test_metrics_highest_read_count(metrics, expected_value): - """ - Test that each metric identifies the what the highest read count associated with any single - entity - """ - observed_max_gene_reads = metrics["n_reads"].max() - assert observed_max_gene_reads == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [ - ( - _gene_metrics, - 300, - ), # todo this is 100%, we should mangle a few in the testing data - (_cell_metrics, 655), - ], -) -def test_metrics_number_perfect_molecule_barcodes(metrics, expected_value): - """Test that each metric correctly identifies the number of perfect molecule barcodes where UB == UR""" - observed_perfect_barcodes = metrics["perfect_molecule_barcodes"].sum() - assert observed_perfect_barcodes == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_cell_metrics, 650), (_cell_metrics_missing_cbs, 12861)], -) -def test_metrics_number_perfect_cell_barcodes(metrics, expected_value): - """Test that each metric correctly identifies the number of perfect cell barcodes where CB == CR""" - observed_perfect_cell_barcodes = metrics["perfect_cell_barcodes"].sum() - assert observed_perfect_cell_barcodes == expected_value - - 
-@pytest.mark.parametrize( - "metrics, expected_value", - [ - ( - _gene_metrics, - 300, - ), # todo this is 100%, should get some intronic or other reads - (_cell_metrics, 609), - ], -) -def test_reads_mapped_exonic(metrics, expected_value): - """Test that each metric identifies the number of reads mapped to an exon (XF=='CODING')""" - observed = metrics["reads_mapped_exonic"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_gene_metrics, 0), (_cell_metrics, 28)], # todo null case -) -def test_reads_mapped_intronic(metrics, expected_value): - """Test that each metric identifies the number of reads mapped to an intron (XF=='INTRONIC')""" - observed = metrics["reads_mapped_intronic"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_gene_metrics, 0), (_cell_metrics, 19)], # todo null case -) -def test_reads_mapped_utr(metrics, expected_value): - """Test that each metric identifies the number of reads mapped to a UTR (XF=='UTR')""" - observed = metrics["reads_mapped_utr"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [ - (_gene_metrics, 300), # todo need to include at least 1 multi-mapper - (_cell_metrics, 656), - ], -) -def test_reads_mapped_uniquely(metrics, expected_value): - """Uniquely mapping reads will be tagged with NH==1""" - observed = metrics["reads_mapped_uniquely"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 90), (_cell_metrics, 107)] -) -def test_duplicate_records(metrics, expected_value): - """Duplicate records are identified by the 1024 bit being set in the sam flag""" - observed = metrics["duplicate_reads"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 29), (_cell_metrics, 2)] -) -def test_spliced_reads(metrics, expected_value): - """ - 
This pipeline defines spliced reads as containing an N segment of any length in the cigar string - """ - observed = metrics["spliced_reads"].sum() - assert observed == expected_value - - -# todo failing -# @pytest.mark.parametrize('metrics', [_gene_metrics, _cell_metrics]) -# def test_relationship_of_duplicates_and_fragments(metrics): -# """ -# We expect the number of duplicates and fragments to add up to the total number of reads. The -# rationale is that any read that is not a duplicate should be a distinct fragment, under our -# definitions. -# -# This fails because of (1) N-base and 2-base cell barcode correction errors and (2) -# fragment calculationes currently do not account for soft clipping. Fixing these will cause -# this test to pass -# """ -# dup_and_fragments = metrics['duplicate_reads'].sum() + metrics['n_fragments'].sum() -# reads = metrics['n_reads'].sum() -# assert reads == dup_and_fragments - - -@pytest.mark.parametrize("metrics", [_gene_metrics, _cell_metrics]) -def test_fragments_number_is_greater_than_molecule_number(metrics): - """ - There should always be more fragments than molecules, as the minimum definition of a molecule is - a fragment covered by a single read - """ - assert np.all(metrics["n_molecules"] >= 1) - assert np.all(metrics["n_fragments"] >= 1) - assert np.all(metrics["n_fragments"] >= metrics["n_molecules"]) - - -@pytest.mark.parametrize( - "metrics, key, expected_value", - [ - ( - _cell_metrics, - "molecule_barcode_fraction_bases_above_30_mean", - np.array( - [ - 1.0000, - 0.9500, - 1.0000, - 1.0000, - 0.9778, - 1.0000, - 1.0000, - 1.0000, - 0.9833, - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 0.9759, - 1.0000, - 1.0000, - 0.9830, - 1.0000, - 1.0000, - 1.0000, - 0.9778, - 0.9783, - 1.0000, - 0.9800, - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 0.9500, - 1.0000, - 0.9895, - 1.0000, - 0.9760, - 1.0000, - 1.0000, - 1.0000, - 0.9889, - 1.0000, - 0.9600, - 1.0000, - 0.9909, - 1.0000, - 1.0000, - 0.9556, - 0.9800, - 1.0000, - 
0.9000, - 1.0000, - 0.9588, - 1.0000, - 1.0000, - 0.9889, - 0.8000, - 0.9538, - 0.9909, - 0.9929, - 0.9571, - ] - ), - ), - # todo failing. Odd because mean is passing; catastrophic cancellation in the online method? - # other methods that use the variance estimator work just fine. Something about the gene issue - # that is identified by other methods below? - # (_cell_metrics, 'molecule_barcode_fraction_bases_above_30_variance', - # np.array( - # [np.nan, 0.0050, np.nan, np.nan, 0.0019, 0.0000, 0.0000, np.nan, 0.0015, np.nan, 0.0000, - # 0.0000, np.nan, 0.0000, 0.0048, 0.0000, 0.0000, 0.0029, 0.0000, np.nan, 0.0000, 0.0044, - # 0.0109, 0.0000, 0.0020, 0.0000, 0.0000, np.nan, 0.0000, 0.0100, np.nan, 0.0010, 0.0000, - # 0.0052, 0.0000, 0.0000, 0.0000, 0.0011, 0.0000, 0.0162, 0.0000, 0.0016, 0.0000, np.nan, - # 0.0178, 0.0020, np.nan, np.nan, 0.0000, 0.0163, np.nan, np.nan, 0.0011, np.nan, 0.0147, - # 0.0018, 0.0007, 0.0306])), - ( - _cell_metrics, - "genomic_reads_fraction_bases_quality_above_30_mean", - np.array( - [ - 0.3980, - 0.6786, - 0.5000, - 0.9796, - 0.7800, - 0.7811, - 0.9337, - 0.8469, - 0.6743, - 0.4565, - 0.8622, - 0.9762, - 0.4925, - 0.7857, - 0.7478, - 0.8561, - 0.6327, - 0.7948, - 0.8405, - 0.4286, - 0.7735, - 0.6445, - 0.7291, - 0.8520, - 0.6711, - 0.6123, - 0.8238, - 0.5000, - 0.8376, - 0.5137, - 0.7526, - 0.7584, - 0.7574, - 0.8379, - 0.8490, - 0.5000, - 0.5983, - 0.7489, - 0.7755, - 0.8107, - 0.6963, - 0.8363, - 0.8896, - 0.6186, - 0.7549, - 0.7151, - 1.0000, - 0.5306, - 0.8347, - 0.7340, - 0.8367, - 0.8878, - 0.7347, - 0.4592, - 0.7718, - 0.7583, - 0.8439, - 0.7576, - ] - ), - ), - ( - _cell_metrics, - "genomic_reads_fraction_bases_quality_above_30_variance", - np.array( - [ - np.nan, - 0.1812, - np.nan, - np.nan, - 0.0266, - 0.0461, - 0.0042, - np.nan, - 0.0387, - np.nan, - 0.0178, - 0.0000, - np.nan, - 0.0002, - 0.0455, - 0.0342, - 0.0588, - 0.0359, - 0.0247, - np.nan, - 0.0400, - 0.0436, - 0.0754, - 0.0005, - 0.1140, - 0.0617, - 0.0400, - 
np.nan, - 0.0230, - 0.0491, - np.nan, - 0.0608, - 0.0556, - 0.0367, - 0.0215, - 0.0860, - 0.2182, - 0.0564, - 0.0008, - 0.0395, - 0.0330, - 0.0433, - 0.0063, - np.nan, - 0.0366, - 0.0778, - np.nan, - np.nan, - 0.0114, - 0.0391, - np.nan, - np.nan, - 0.0193, - np.nan, - 0.0288, - 0.0444, - 0.0311, - 0.0558, - ] - ), - ), - ( - _cell_metrics, - "genomic_read_quality_mean", - np.array( - [ - 25.3776, - 32.5051, - 27.7755, - 39.9184, - 34.3639, - 34.5969, - 37.4592, - 35.9490, - 31.6345, - 26.5870, - 36.7500, - 39.5374, - 28.0896, - 33.7041, - 33.6079, - 36.2787, - 30.8472, - 34.8402, - 35.9327, - 24.7755, - 34.3603, - 31.0934, - 33.2880, - 36.7092, - 31.9647, - 30.2158, - 35.3956, - 27.6837, - 35.8674, - 27.4527, - 34.3918, - 33.7323, - 33.6425, - 35.9552, - 35.5694, - 27.4184, - 30.0479, - 33.4621, - 34.6633, - 35.2128, - 32.4619, - 35.7690, - 36.9963, - 30.0722, - 33.6353, - 32.6708, - 39.8721, - 28.0510, - 35.9388, - 33.1278, - 35.8265, - 36.6633, - 32.7188, - 26.6429, - 34.1053, - 34.0012, - 36.0956, - 33.7704, - ] - ), - ), - ( - _cell_metrics, - "genomic_read_quality_variance", - np.array( - [ - np.nan, - 92.5078, - np.nan, - np.nan, - 18.9818, - 29.9521, - 6.6724, - np.nan, - 25.4164, - np.nan, - 12.8541, - 0.3790, - np.nan, - 0.0019, - 28.7815, - 24.6669, - 37.7402, - 22.8765, - 16.5399, - np.nan, - 22.9679, - 26.2414, - 44.8249, - 0.5740, - 70.4607, - 42.5318, - 24.9536, - np.nan, - 14.0772, - 32.6389, - np.nan, - 38.1213, - 34.4094, - 23.2517, - 13.9110, - 48.9622, - 117.2337, - 32.9814, - 0.3850, - 24.3135, - 17.8765, - 26.5847, - 5.2099, - np.nan, - 22.5846, - 48.2133, - np.nan, - np.nan, - 5.6775, - 23.9395, - np.nan, - np.nan, - 12.9322, - np.nan, - 18.1475, - 29.6960, - 20.7504, - 34.9055, - ] - ), - ), - # todo right now the metrics count reads that have no 'gene' towards molecules, whereas - # the calculations in the notebook exclude them. We should decide which method we prefer. - # there may be further problems. 
- # (_cell_metrics, 'reads_per_molecule', - # np.array( - # [1.0000, 2.0000, np.nan, 1.0000, 9.0000, 2.4000, 2.0000, 1.0000, 3.0000, 1.0000, 3.0000, - # 3.0000, 1.0000, np.nan, 2.4167, 4.3333, 1.2222, 5.8750, 1.3333, 1.0000, 1.2000, 1.5000, - # 4.6000, 2.0000, 2.5000, 1.2000, 2.1429, 1.0000, 2.6364, 4.0000, 1.0000, 2.1111, 1.7273, - # 6.2500, 5.0000, 1.3333, 2.0000, 2.2500, np.nan, 2.0000, 4.3333, 3.9286, 2.2000, 1.0000, - # 1.5000, 1.6667, np.nan, 1.0000, 1.6667, 1.8889, 1.0000, 1.0000, 2.2500, 1.0000, 9.7500, - # 11.0000, 4.0000, 1.5000])), - ( - _cell_metrics, - "reads_per_fragment", - np.array( - [ - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 1.1250, - 1.3333, - 2.0000, - 1.0000, - 1.2000, - 1.0000, - 1.2000, - 3.0000, - 1.0000, - 2.0000, - 1.3182, - 1.4444, - 1.1000, - 1.4688, - 1.1429, - 1.0000, - 1.2000, - 1.2857, - 1.5333, - 2.0000, - 1.2500, - 1.0000, - 1.1538, - 1.0000, - 1.3182, - 1.0000, - 1.0000, - 1.4615, - 1.3571, - 1.3158, - 1.2500, - 1.3333, - 1.0000, - 1.1250, - 1.0000, - 1.1765, - 1.0833, - 1.4103, - 1.1000, - 1.0000, - 1.2857, - 1.2500, - 1.0000, - 1.0000, - 1.2500, - 1.3077, - 1.0000, - 1.0000, - 1.2857, - 1.0000, - 1.3929, - 1.5714, - 1.4737, - 1.1053, - ] - ), - ), - # (_cell_metrics, 'fragments_per_molecule', # todo failure depends on above reads_per_molecule - # np.array( - # [1.0000, 2.0000, np.nan, 1.0000, 8.0000, 1.8000, 1.0000, 1.0000, 2.5000, 1.0000, 2.5000, - # 1.0000, 1.0000, np.nan, 1.8333, 3.0000, 1.1111, 4.0000, 1.1667, 1.0000, 1.0000, 1.1667, - # 3.0000, 1.0000, 2.0000, 1.2000, 1.8571, 1.0000, 2.0000, 4.0000, 1.0000, 1.4444, 1.2727, - # 4.7500, 4.0000, 1.0000, 2.0000, 2.0000, np.nan, 1.7000, 4.0000, 2.7857, 2.0000, 1.0000, - # 1.1667, 1.3333, np.nan, 1.0000, 1.3333, 1.4444, 1.0000, 1.0000, 1.7500, 1.0000, 7.0000, - # 7.0000, 2.7143, 1.3571])), - ( - _gene_metrics, - "molecule_barcode_fraction_bases_above_30_mean", - np.array([1.0000, 1.0000, 0.8000, 0.9885, 0.9833, 0.9857, 0.7000, 0.9444]), - ), - ( - _gene_metrics, - 
"molecule_barcode_fraction_bases_above_30_variance", - np.array([np.nan, np.nan, np.nan, 0.0011, 0.0051, 0.0014, np.nan, 0.0120]), - ), - ( - _gene_metrics, - "genomic_reads_fraction_bases_quality_above_30_mean", - np.array([0.8878, 0.3980, 0.4271, 0.8148, 0.7681, 0.7216, 0.1546, 0.5089]), - ), - ( - _gene_metrics, - "genomic_reads_fraction_bases_quality_above_30_variance", - np.array([np.nan, np.nan, np.nan, 0.0282, 0.0346, 0.0537, np.nan, 0.0849]), - ), - ( - _gene_metrics, - "genomic_read_quality_mean", - np.array( - [36.2143, 24.8469, 25.4792, 35.3664, 34.0956, 33.0364, 20.7423, 27.3078] - ), - ), - ( - _gene_metrics, - "genomic_read_quality_variance", - np.array( - [np.nan, np.nan, np.nan, 18.4553, 21.6745, 33.6572, np.nan, 53.5457] - ), - ), - ( - _gene_metrics, - "reads_per_molecule", - np.array([1.0000, 1.0000, 1.0000, 3.2500, 4.1525, 1.7500, 1.0000, 1.3846]), - ), - ( - _gene_metrics, - "reads_per_fragment", - np.array([1.0000, 1.0000, 1.0000, 1.7333, 1.3920, 1.4000, 1.0000, 1.0588]), - ), - ( - _gene_metrics, - "fragments_per_molecule", - np.array([1.0000, 1.0000, 1.0000, 1.8750, 2.9831, 1.2500, 1.0000, 1.3077]), - ), - ], -) -def test_higher_order_metrics_by_gene(metrics, key, expected_value): - """Test metrics that depend on other metrics - - This class tests a very large number of higher-order metrics that examine the functionality of - the test suite across all measured instances of the metric class. E.g. for cell metrics (class), - each test will verify the value for each cell (instance). - - Parameters - ---------- - metrics : pd.DataFrame - Output from subclass of sctools.metrics.MetricAggregator - key : str - The column of metrics to interrogate in the parametrized test - expected_value : np.ndarray - An array of expected values - - """ - # need to sort, metrics are not always in same order as results. 
- observed = sorted(np.nan_to_num(metrics[key].values).round(4)) - expected_value = sorted(np.nan_to_num(expected_value)) - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, key, expected_value", - [ - # todo failing; suspect related to problem with how fragments are defined - # (_cell_metrics, 'fragments_with_single_read_evidence', 345), - # todo failing. Does not make sense that this would also be a fragment issue. - # (_cell_metrics, 'molecules_with_single_read_evidence', 130), - (_gene_metrics, "fragments_with_single_read_evidence", 155), - (_gene_metrics, "molecules_with_single_read_evidence", 42), - ], -) -def test_single_read_evidence(metrics, key, expected_value): - """ - We want to determine how many molecules and fragments are covered by only one read, as reads - covered by multiple reads have much lower probabilities of being the result of error processes. - """ - observed = metrics[key].sum() - assert observed == expected_value - - -def split_metrics_file(metrics_file): - """ - produces two mergeable on-disk metric files from a single file that contain the first 3/4 - of the file in the first output and the last 3/4 of the file in the second output, such that - 1/2 of the metrics in the two files overlap - """ - with fileinput.FileInput( - [metrics_file], mode="r", openhook=fileinput.hook_compressed - ) as f: - data = [line for line in f] - - header, data = data[0], data[1:] - - low_split, high_split = round(len(data) * 0.25), round(len(data) * 0.75) - file_1, file_2 = [_test_dir + "metrics_for_merging_%d.csv" % i for i in (1, 2)] - - with open(file_1, "wb") as f: - f.write(header + b"\n") - for line in data[:high_split]: - f.write(line + b"\n") - - with open(file_2, "wb") as f: - f.write(header + b"\n") - for line in data[low_split:]: - f.write(line + b"\n") - - return file_1, file_2 - - -@pytest.fixture -def mergeable_cell_metrics(): - return split_metrics_file(_cell_metric_output_file) - - -@pytest.fixture -def 
mergeable_gene_metrics(): - return split_metrics_file(_gene_metric_output_file) - - -def test_merge_cell_metrics_cli(mergeable_cell_metrics): - """test the sctools merge cell metrics CLI invocation""" - return_call = TenXV2.merge_cell_metrics( - args=["-o", _test_dir + "/merged-cell-metrics.csv.gz"] - + list(mergeable_cell_metrics) - ) - assert return_call == 0 - - -def test_merge_gene_metrics_cli(mergeable_gene_metrics): - """test the sctools merge gene metrics CLI invocation""" - return_call = TenXV2.merge_gene_metrics( - args=["-o", _test_dir + "/merged-gene-metrics.csv.gz"] - + list(mergeable_gene_metrics) - ) - assert return_call == 0 - - -def test_merge_cell_metrics_does_not_correct_duplicates(mergeable_cell_metrics): - """ - test takes offset cell metrics outputs and merges them. Cell metrics does not check for - duplication, so should return a 2x length file. - """ - output_file = os.path.join(_test_dir, "merged_metrics.csv.gz") - m = MergeCellMetrics(mergeable_cell_metrics, output_file) - m.execute() - - merged_data = pd.read_csv(output_file, index_col=0) - - input_sizes = [] - for f in mergeable_cell_metrics: - input_sizes.append(pd.read_csv(f, index_col=0).shape) - target_rows = sum(row for row, col in input_sizes) - - target_cols = input_sizes[0][1] # cols will always be the same - - assert merged_data.shape == (target_rows, target_cols) - - -def test_merge_gene_metrics_averages_over_multiply_detected_genes( - mergeable_gene_metrics, -): - output_file = os.path.join(_test_dir, "merged_metrics.csv.gz") - m = MergeGeneMetrics(mergeable_gene_metrics, output_file) - m.execute() - - merged_data = pd.read_csv(output_file, index_col=0) - - input_data = pd.read_csv(mergeable_gene_metrics[0], index_col=0) - target_cols = input_data.shape[1] - - input_genes = input_data.index - for f in mergeable_gene_metrics[1:]: - input_genes = input_genes.union(pd.read_csv(f, index_col=0).index) - target_rows = len(input_genes) - - assert merged_data.shape == (target_rows, 
target_cols), "%s" % repr(merged_data) - - -@pytest.mark.parametrize( - "bam, gatherer", - [(_gene_sorted_bam, GatherGeneMetrics), (_cell_sorted_bam, GatherCellMetrics)], -) -def test_gzip_compression(bam: str, gatherer: Callable): - """ - gzip compression should produce a .gz file which is identical when uncompressed to the - uncompressed version - """ - - gz_fout = _test_dir + "test_bam.csv.gz" - g: MetricGatherer = gatherer(bam, gz_fout, compress=True) - g.extract_metrics() - gz_metrics = pd.read_csv(gz_fout, index_col=0) - - fout = _test_dir + "test_bam.csv" - g: MetricGatherer = gatherer(bam, fout, compress=False) - g.extract_metrics() - metrics = pd.read_csv(fout, index_col=0) - - assert np.allclose(gz_metrics.fillna(0).values, metrics.fillna(0).values) diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_platform.py b/tools/scripts/sctools/build/lib/sctools/test/test_platform.py deleted file mode 100644 index e18e0cd8..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_platform.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import tempfile -import pysam - -from .. 
import platform - -data_dir = os.path.split(__file__)[0] + "/data/" - - -def test_attach_barcodes(): - """High-level test of the AttachBarcodes command""" - - temp_dir_name = tempfile.mkdtemp() - - # Construct cli arguments to pass to the command - temp_output_bam = temp_dir_name + "output.bam" - - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--u2", - data_dir + "test_r2.bam", - "--i1", - data_dir + "test_i1.fastq", - "--o", - temp_output_bam, - "--sample-barcode-start-pos", - "0", - "--sample-barcode-length", - "8", - "--cell-barcode-start-pos", - "0", - "--cell-barcode-length", - "16", - "--molecule-barcode-start-pos", - "16", - "--molecule-barcode-length", - "4", - ] - - platform.BarcodePlatform.attach_barcodes(args) - - with pysam.AlignmentFile(temp_output_bam, "rb", check_sq=False) as samfile: - for read in samfile: - tag_cr = read.get_tag("CR") - tag_cy = read.get_tag("CY") - tag_ur = read.get_tag("UR") - tag_uy = read.get_tag("UY") - tag_sr = read.get_tag("SR") - tag_sy = read.get_tag("SY") - assert len(tag_cr) == 16 - assert len(tag_cy) == 16 - assert len(tag_ur) == 4 - assert len(tag_uy) == 4 - assert len(tag_sr) == 8 - assert len(tag_sy) == 8 diff --git a/tools/scripts/sctools/build/lib/sctools/test/test_stats.py b/tools/scripts/sctools/build/lib/sctools/test/test_stats.py deleted file mode 100644 index c59d8f98..00000000 --- a/tools/scripts/sctools/build/lib/sctools/test/test_stats.py +++ /dev/null @@ -1,21 +0,0 @@ -from .. 
import stats - - -def test_concentrated_data_produces_entropy_0(): - entropy = stats.base4_entropy([1, 0, 0, 0], axis=0) - assert entropy == 0 - - -def test_concentrated_unnormalized_data_produces_entropy_0(): - entropy = stats.base4_entropy([1000, 0, 0, 0], axis=0) - assert entropy == 0 - - -def test_balanced_data_produces_entropy_1(): - entropy = stats.base4_entropy([0.25, 0.25, 0.25, 0.25], axis=0) - assert entropy == 1 - - -def test_balanced_unnormalized_data_produces_entropy_1(): - entropy = stats.base4_entropy([20, 20, 20, 20], axis=0) - assert entropy == 1 diff --git a/tools/scripts/sctools/docker_build.sh b/tools/scripts/sctools/docker_build.sh deleted file mode 100755 index 1cd5a93d..00000000 --- a/tools/scripts/sctools/docker_build.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -set -e - -# Update version when changes to Dockerfile are made -DOCKER_IMAGE_VERSION=1.0.0 -TIMESTAMP=$(date +"%s") -DIR=$(cd $(dirname $0) && pwd) - -# Registries and tags -GCR_URL="us.gcr.io/broad-gotc-prod/sctools" - -# sctools version -SCTOOLS_VERSION="v0.3.15" - -# Necessary tools and help text -TOOLS=(docker gcloud) -HELP="$(basename "$0") [-h|--help] [-v|--version] [-t|tools] -- script to build the sctools image and push to GCR & Quay - -where: - -h|--help Show help text - -v|--version Version of Samtools to use (default: $SCTOOLS_VERSION) - -t|--tools Show tools needed to run script - " - -function main(){ - for t in "${TOOLS[@]}"; do which "$t" >/dev/null || ok=no; done - if [[ $ok == no ]]; then - echo "Missing one of the following tools: " - for t in "${TOOLS[@]}"; do echo "$t"; done - exit 1 - fi - - while [[ $# -gt 0 ]] - do - key="$1" - case $key in - -v|--version) - SCTOOLS_VERSION="$2" - shift - shift - ;; - -h|--help) - echo "$HELP" - exit 0 - ;; - -t|--tools) - for t in "${TOOLS[@]}"; do echo "$t"; done - exit 0 - ;; - *) - shift - ;; - esac - done - - IMAGE_TAG="$DOCKER_IMAGE_VERSION-$SCTOOLS_VERSION-$TIMESTAMP" - - echo "building and pushing GCR Image - 
$GCR_URL:$IMAGE_TAG" - docker build --no-cache -t "$GCR_URL:$IMAGE_TAG" \ - --build-arg SCTOOLS_VERSION="$SCTOOLS_VERSION" "$DIR" - docker push "$GCR_URL:$IMAGE_TAG" - - echo -e "$GCR_URL:$IMAGE_TAG" >> "$DIR/docker_versions.tsv" - echo "done" -} - -main "$@" diff --git a/tools/scripts/sctools/docs/README.md b/tools/scripts/sctools/docs/README.md deleted file mode 100644 index bbfa3601..00000000 --- a/tools/scripts/sctools/docs/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Build Docs - -1. Make sure you have [Sphinx](http://www.sphinx-doc.org/en/stable/) installed. -2. Install the sctools package in advance following the instructions. -3. From the current directory (/docs/), type: - -```bash -make target -``` -where `target` is one of {html, epub, latex, ...}. For more details about the sphinx builders, check [here](http://www.sphinx-doc.org/en/master/man/sphinx-build.html) - -Note that there are still some bugs to be worked out. -- There are warnings about: -``` -WARNING: [autosummary] failed to import 'sctools.metrics.CellMetrics': no module named sctools.metrics.CellMetrics -WARNING: [autosummary] failed to import 'sctools.metrics.GeneMetrics': no module named sctools.metrics.GeneMetrics -WARNING: [autosummary] failed to import 'sctools.metrics.MetricAggregatorBase': no module named sctools.metrics.MetricAggregatorBase -``` - -- There are a bunch of warnings: `WARNING: Unexpected section title.` -- There are a bunch of warnings: `WARNING: toctree contains reference to nonexisting document` - -Most of the warnings can be solved by refactoring the docstrings and standardize the usages of `autosummary` later. diff --git a/tools/scripts/sctools/docs/source/Makefile b/tools/scripts/sctools/docs/source/Makefile deleted file mode 100644 index ade82564..00000000 --- a/tools/scripts/sctools/docs/source/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. 
-SPHINXOPTS = -SPHINXBUILD = sphinx-build -SPHINXPROJ = SCTools -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/tools/scripts/sctools/docs/source/conf.py b/tools/scripts/sctools/docs/source/conf.py deleted file mode 100644 index 58dd0a96..00000000 --- a/tools/scripts/sctools/docs/source/conf.py +++ /dev/null @@ -1,166 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/stable/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('..')) -from pkg_resources import get_distribution - - -# -- Project information ----------------------------------------------------- - -project = 'SC Tools' -copyright = '2018, Ambrose J. Carr' -author = 'Ambrose J. Carr' - -# The short X.Y version -version = '' -# The full version, including alpha/beta/rc tags -release = get_distribution('sctools').version - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. 
They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon', - 'sphinx.ext.autosummary', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ['.rst', '.md'] - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path . -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = 'SCToolsdoc' - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'SCTools.tex', 'SC Tools Documentation', 'Ambrose J. Carr', 'manual') -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [(master_doc, 'sctools', 'SC Tools Documentation', [author], 1)] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. 
List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ( - master_doc, - 'SCTools', - 'SC Tools Documentation', - author, - 'SCTools', - 'One line description of project.', - 'Miscellaneous', - ) -] - - -# -- Extension configuration ------------------------------------------------- -numpydoc_show_class_members = False -autosummary_generate = True diff --git a/tools/scripts/sctools/docs/source/index.rst b/tools/scripts/sctools/docs/source/index.rst deleted file mode 100644 index a27ddfc3..00000000 --- a/tools/scripts/sctools/docs/source/index.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. toctree:: - :maxdepth: 1 - :caption: Overview - - readme - -.. toctree:: - :maxdepth: 4 - :caption: API References - - sctools - sctools.metrics - sctools.test - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/tools/scripts/sctools/docs/source/readme.rst b/tools/scripts/sctools/docs/source/readme.rst deleted file mode 100644 index a6210d3d..00000000 --- a/tools/scripts/sctools/docs/source/readme.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../../README.rst diff --git a/tools/scripts/sctools/docs/source/sctools.metrics.rst b/tools/scripts/sctools/docs/source/sctools.metrics.rst deleted file mode 100644 index 126b83c0..00000000 --- a/tools/scripts/sctools/docs/source/sctools.metrics.rst +++ /dev/null @@ -1,41 +0,0 @@ -sctools.metrics package -======================= - -Submodules -~~~~~~~~~~ - -sctools.metrics.aggregator module ---------------------------------- - -.. automodule:: sctools.metrics.aggregator - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.metrics.gatherer module -------------------------------- - -.. automodule:: sctools.metrics.gatherer - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.metrics.merge module ----------------------------- - -.. 
automodule:: sctools.metrics.merge - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.metrics.writer module ------------------------------ - -.. automodule:: sctools.metrics.writer - :members: - :undoc-members: - :show-inheritance: - :inherited-members: diff --git a/tools/scripts/sctools/docs/source/sctools.rst b/tools/scripts/sctools/docs/source/sctools.rst deleted file mode 100644 index a20c8ba6..00000000 --- a/tools/scripts/sctools/docs/source/sctools.rst +++ /dev/null @@ -1,78 +0,0 @@ -sctools package -=============== - - -Submodules -~~~~~~~~~~ - -sctools.bam module ------------------- - -.. automodule:: sctools.bam - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.barcode module ----------------------- - -.. automodule:: sctools.barcode - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.encodings module ------------------------- - -.. automodule:: sctools.encodings - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.fastq module --------------------- - -.. automodule:: sctools.fastq - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.gtf module ------------------- - -.. automodule:: sctools.gtf - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.platform module ------------------------ - -.. automodule:: sctools.platform - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.reader module ---------------------- - -.. automodule:: sctools.reader - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -sctools.stats module --------------------- - -.. 
automodule:: sctools.stats - :members: - :undoc-members: - :show-inheritance: - :inherited-members: diff --git a/tools/scripts/sctools/docs/source/sctools.test.rst b/tools/scripts/sctools/docs/source/sctools.test.rst deleted file mode 100644 index 23c590d5..00000000 --- a/tools/scripts/sctools/docs/source/sctools.test.rst +++ /dev/null @@ -1,69 +0,0 @@ -sctools.test package -==================== - -Submodules -~~~~~~~~~~ - -sctools.test.test\_bam module ------------------------------ - -.. automodule:: sctools.test.test_bam - :members: - :undoc-members: - :show-inheritance: - -sctools.test.test\_barcode module ---------------------------------- - -.. automodule:: sctools.test.test_barcode - :members: - :undoc-members: - :show-inheritance: - -sctools.test.test\_encodings module ------------------------------------ - -.. automodule:: sctools.test.test_encodings - :members: - :undoc-members: - :show-inheritance: - -sctools.test.test\_entrypoints module -------------------------------------- - -.. automodule:: sctools.test.test_entrypoints - :members: - :undoc-members: - :show-inheritance: - -sctools.test.test\_fastq module -------------------------------- - -.. automodule:: sctools.test.test_fastq - :members: - :undoc-members: - :show-inheritance: - -sctools.test.test\_gtf module ------------------------------ - -.. automodule:: sctools.test.test_gtf - :members: - :undoc-members: - :show-inheritance: - -sctools.test.test\_metrics module ---------------------------------- - -.. automodule:: sctools.test.test_metrics - :members: - :undoc-members: - :show-inheritance: - -sctools.test.test\_stats module -------------------------------- - -.. 
automodule:: sctools.test.test_stats - :members: - :undoc-members: - :show-inheritance: diff --git a/tools/scripts/sctools/fastqpreprocessing/.gitignore b/tools/scripts/sctools/fastqpreprocessing/.gitignore deleted file mode 100644 index 868ec69c..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -*~ -*.o -*.a -*.bak -dox/ -dox_errors.txt -*# -*nohup.txt diff --git a/tools/scripts/sctools/fastqpreprocessing/Makefile b/tools/scripts/sctools/fastqpreprocessing/Makefile deleted file mode 100644 index d1caaad0..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/Makefile +++ /dev/null @@ -1,56 +0,0 @@ -IDIR1 = libStatGen/include -IDIR2 = htslib-1.13 -IDIR3 = gzstream - -CC = g++ -std=c++17 -fPIC -DHTSLIB -Wall -O4 -Wwrite-strings - -CFLAGS = -I$(IDIR1) -LlibStatGen -Lgzstream - -LIBS = -LlibStatGen -lStatGen -lz -lpthread -lstdc++fs -Lgzstream -lgzstream - -_DEPS = src/utilities.h src/input_options.h src/fastq_common.h - -TARGET1 = bin/fastqprocess -TARGET1_OBJ = obj/fastqprocess.o - -TARGET2 = bin/TagSort -TARGET2_OBJ = obj/tagsort.o obj/htslib_tagsort.o obj/metricgatherer.o - -TARGET3 = bin/fastq_slideseq -TARGET3_OBJ = obj/fastq_slideseq.o - -TARGET4 = bin/fastq_metrics -TARGET4_OBJ = obj/fastq_metrics.o - -TARGET5 = bin/samplefastq -TARGET5_OBJ = obj/samplefastq.o - -install: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) - cp htslib-1.13/*.so.? bin/ - -all: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) - -COMMON_OBJ = obj/utilities.o obj/input_options.o obj/fastq_common.o - -obj/%.o: src/%.cpp $(_DEPS) - $(CC) -c -o $@ $< -I$(IDIR1) -I$(IDIR2) -I$(IDIR3) - -$(TARGET1): $(COMMON_OBJ) $(TARGET1_OBJ) - $(CC) -o $@ $^ $(CFLAGS) $(LIBS) - -$(TARGET2): $(COMMON_OBJ) $(TARGET2_OBJ) - $(CC) -Wl,-rpath,/usr/local/bin:fastqpreprocessing/bin:bin:. 
-o $@ $(COMMON_OBJ) $(TARGET2_OBJ) $(LIBS) -Lhtslib-1.13 -lhts - -$(TARGET3): $(COMMON_OBJ) $(TARGET3_OBJ) - $(CC) -o $@ $^ $(CFLAGS) $(LIBS) - -$(TARGET4): $(COMMON_OBJ) $(TARGET4_OBJ) - $(CC) -o $@ $^ $(CFLAGS) $(LIBS) - -$(TARGET5): $(COMMON_OBJ) $(TARGET5_OBJ) - $(CC) -o $@ $^ $(CFLAGS) $(LIBS) - -.PHONY: clean -clean: - rm -f obj/*.o *~ core $(INCDIR)/*~ *.o *.so *.a - rm -rf $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) diff --git a/tools/scripts/sctools/fastqpreprocessing/patches/BgzfFileType.cpp.patch b/tools/scripts/sctools/fastqpreprocessing/patches/BgzfFileType.cpp.patch deleted file mode 100644 index 7af2fdd2..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/patches/BgzfFileType.cpp.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- libStatGen/general/BgzfFileType.cpp 2015-07-08 20:03:23.000000000 +0000 -+++ /tmp/BgzfFileType.cpp 2020-11-03 12:25:36.168474179 +0000 -@@ -23,7 +23,7 @@ - #include "BgzfFileType.h" - - // Default to require the EOF block at the end of the file. --bool BgzfFileType::ourRequireEofBlock = true; -+bool BgzfFileType::ourRequireEofBlock = false; - - BgzfFileType::BgzfFileType(const char * filename, const char * mode) - { diff --git a/tools/scripts/sctools/fastqpreprocessing/patches/FastQFile.cpp.patch b/tools/scripts/sctools/fastqpreprocessing/patches/FastQFile.cpp.patch deleted file mode 100644 index 884fc7d7..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/patches/FastQFile.cpp.patch +++ /dev/null @@ -1,18 +0,0 @@ ---- libStatGen-1.0.14/fastq/FastQFile.cpp 2015-07-08 20:03:23.000000000 +0000 -+++ ../libStatGen/FastQFile.cpp 2020-09-17 19:35:48.797593411 +0000 -@@ -489,6 +489,7 @@ - // Check to see if the sequenceIdentifier is a repeat by adding - // it to the set and seeing if it already existed. 
- std::pair::iterator,bool> insertResult; -+ /* - insertResult = - myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), - myLineNum)); -@@ -505,6 +506,7 @@ - reportErrorOnLine(); - return(false); - } -+ */ - } - - // Valid, return true. diff --git a/tools/scripts/sctools/fastqpreprocessing/patches/Makefile.patch b/tools/scripts/sctools/fastqpreprocessing/patches/Makefile.patch deleted file mode 100644 index 0e03b435..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/patches/Makefile.patch +++ /dev/null @@ -1,22 +0,0 @@ ---- libStatGen-1.0.14/Makefile 2015-07-08 20:03:23.000000000 +0000 -+++ ../libStatGen/Makefile 2020-09-03 14:15:41.904210140 +0000 -@@ -2,7 +2,8 @@ - - .PHONY: package - --SUBDIRS=general bam fastq glf samtools vcf -+#SUBDIRS=general bam fastq glf samtools vcf -+SUBDIRS=general fastq samtools bam - - include Makefiles/Makefile.base - -@@ -16,7 +17,8 @@ - general: samtools - - # other subdirectories depend on general --bam fastq glf vcf: general -+#bam fastq glf vcf: general -+bam fastq : general - - RELEASE_FILE?=libStatGen.$(VERSION).tgz - diff --git a/tools/scripts/sctools/fastqpreprocessing/patches/general.Makefile.patch b/tools/scripts/sctools/fastqpreprocessing/patches/general.Makefile.patch deleted file mode 100644 index 51d153cd..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/patches/general.Makefile.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- libStatGen-1.0.14/general/Makefile 2020-09-17 20:29:00.320563968 +0000 -+++ ../libStatGen/Makefile.general 2020-09-17 20:57:47.982915972 +0000 -@@ -8,7 +8,7 @@ - # an error, but allow unused results and variables for the - # time being. 
- # -- USER_WARNINGS ?= -Werror $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi) -+ USER_WARNINGS ?= $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi) - #-Wno-strict-overflow - # -Wno-unused-variable $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-unused-result" ; fi) - endif diff --git a/tools/scripts/sctools/fastqpreprocessing/src/example-run.sh b/tools/scripts/sctools/fastqpreprocessing/src/example-run.sh deleted file mode 100755 index a8fa1755..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/example-run.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -./fastqprocess --verbose \ - --bam-size 0.001 \ - --barcode-length 16 \ - --umi-length 10 \ - --sample-id L8TX \ - --white-list ../../../data/L8TX/737K-august-2016.txt \ - --I1 ../../../data/L8TX/A_I1.fastq.gz \ - --R1 ../../../data/L8TX/A_R1.fastq.gz \ - --R2 ../../../data/L8TX/A_R2.fastq.gz \ - --I1 ../../../data/L8TX/B_I1.fastq.gz \ - --R1 ../../../data/L8TX/B_R1.fastq.gz \ - --R2 ../../../data/L8TX/B_R2.fastq.gz \ diff --git a/tools/scripts/sctools/fastqpreprocessing/src/fastq_common.cpp b/tools/scripts/sctools/fastqpreprocessing/src/fastq_common.cpp deleted file mode 100644 index 478a0c5d..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/fastq_common.cpp +++ /dev/null @@ -1,414 +0,0 @@ -#include -#include -#include -#include - -#include "fastq_common.h" -// number of samrecords per buffer in each reader -constexpr size_t kSamRecordBufferSize = 10000; -#include "input_options.h" -#include "utilities.h" - -#include "FastQFile.h" -#include "FastQStatus.h" -#include "BaseAsciiMap.h" -#include "SamFile.h" -#include "SamValidation.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Overview of multithreading: -// * There are reader threads and writer threads. (Writers are either fastq or -// bam, depending on how the program was run). 
-// * Each {reader, writer} has its own {input, output} file. -// * Each reader has an entry in g_read_arenas, and each writer has an entry in -// g_write_queues. -// * Readers load each chunk of their processed results into SamRecord pointers -// loaned out by their arena. They put the pointer in the correct write queue. -// * When a write queue finishes writing a SamRecord to the file, it notifies -// the record pointer's arena that the record's memory is no longer in use. -// The arena can then give that pointer to its reader for a new read. - -PendingWrite WriteQueue::dequeueWrite() -{ - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return !queue_.empty(); }); - auto pair = queue_.front(); - queue_.pop(); - return pair; -} -void WriteQueue::enqueueWrite(PendingWrite write) -{ - mutex_.lock(); - queue_.push(write); - mutex_.unlock(); - cv_.notify_one(); -} -void WriteQueue::enqueueShutdownSignal() -{ - mutex_.lock(); - queue_.push(std::make_pair(nullptr, kShutdown)); - mutex_.unlock(); - cv_.notify_one(); -} -std::vector> g_write_queues; - -// I wrote this class to stay close to the performance characteristics of the -// original code, but I suspect the large buffers might not be necessary. -// If it doesn't slow things down noticeably, it would be cleaner to just delete -// this class, and have the WriteQueue accept unique_ptr (with the -// addition of some reasonable bound on how much WriteQueue can have -// outstanding; maybe kSamRecordBufferSize items), and let them be directly -// destroyed after writing rather than be reused with this arena approach. 
-class SamRecordArena -{ -public: - SamRecordArena() - { - for (int i = 0; i < kSamRecordBufferSize; i++) - samrecords_memory_.push_back(std::make_unique()); - - for (int i = samrecords_memory_.size() - 1; i >= 0; i--) - available_samrecords_.push(samrecords_memory_[i].get()); - } - - SamRecord* acquireSamRecordMemory() - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return !available_samrecords_.empty(); }); - SamRecord* sam = available_samrecords_.top(); - available_samrecords_.pop(); - return sam; - } - void releaseSamRecordMemory(SamRecord* sam) - { - mutex_.lock(); - available_samrecords_.push(sam); - mutex_.unlock(); - cv_.notify_one(); - } -private: - std::vector> samrecords_memory_; - std::mutex mutex_; - std::condition_variable cv_; - // Reusing most-recently-used memory first ought to be more cache friendly. - std::stack available_samrecords_; -}; - -std::vector> g_read_arenas; -void releaseReaderThreadMemory(int reader_thread_index, SamRecord* samRecord) -{ - g_read_arenas[reader_thread_index]->releaseSamRecordMemory(samRecord); -} - - -void writeFastqRecord(ogzstream& r1_out, ogzstream& r2_out, SamRecord* sam) -{ - r1_out << "@" << sam->getReadName() << "\n" << sam->getString("CR").c_str() - << sam->getString("UR") << "\n+\n" << sam->getString("CY") << sam->getString("UY") << "\n"; - r2_out << "@" << sam->getReadName() << "\n" << sam->getSequence() << "\n+\n" - << sam->getQuality() << "\n"; -} - -void fastqWriterThread(int write_thread_index) -{ - std::string r1_output_fname = "fastq_R1_" + std::to_string(write_thread_index) + ".fastq.gz"; - ogzstream r1_out(r1_output_fname.c_str()); - if (!r1_out) - crash("ERROR: Failed to open R1 fastq file " + r1_output_fname + " for writing"); - - std::string r2_output_fname = "fastq_R2_" + std::to_string(write_thread_index) + ".fastq.gz"; - ogzstream r2_out(r2_output_fname.c_str()); - if (!r2_out) - crash("ERROR: Failed to open R2 fastq file " + r2_output_fname + " for writing"); - - while (true) - { - 
auto [sam, source_reader_index] = g_write_queues[write_thread_index]->dequeueWrite(); - if (source_reader_index == WriteQueue::kShutdown) - break; - - writeFastqRecord(r1_out, r2_out, sam); - g_read_arenas[source_reader_index]->releaseSamRecordMemory(sam); - } - - // close the fastq files - r1_out.close(); - r2_out.close(); -} - -void bamWriterThread(int write_thread_index, std::string sample_id) -{ - std::string bam_out_fname = "subfile_" + std::to_string(write_thread_index) + ".bam"; - SamFile samOut; - samOut.OpenForWrite(bam_out_fname.c_str()); - - // Write the sam header. - SamFileHeader samHeader; - - // add the HD tags for the header - samHeader.setHDTag("VN", "1.6"); - samHeader.setHDTag("SO", "unsorted"); - - // add the RG group tags - SamHeaderRG* headerRG = new SamHeaderRG; - headerRG->setTag("ID", "A"); - headerRG->setTag("SM", sample_id.c_str()); - samHeader.addRG(headerRG); - - // add the header to the output bam - samOut.WriteHeader(samHeader); - - while (true) - { - auto [sam, source_reader_index] = g_write_queues[write_thread_index]->dequeueWrite(); - if (source_reader_index == WriteQueue::kShutdown) - break; - - samOut.WriteRecord(samHeader, *sam); - g_read_arenas[source_reader_index]->releaseSamRecordMemory(sam); - } - - // close the bamfile - samOut.Close(); -} - -void fillSamRecordCommon(SamRecord* samRecord, FastQFile* fastQFileI1, - FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list, - std::string const& barcode_seq, std::string const& barcode_quality, - std::string const& umi_seq, std::string const& umi_quality) -{ - // reset the samrecord - samRecord->resetRecord(); - // add read group and the sam flag - samRecord->addTag("RG", 'Z', "A"); - samRecord->setFlag(4); - // add identifier, sequence and quality score of the alignments - samRecord->setReadName(fastQFileR2->mySequenceIdentifier.c_str()); - samRecord->setSequence(fastQFileR2->myRawSequence.c_str()); - samRecord->setQuality(fastQFileR2->myQualityString.c_str()); - 
// add barcode and quality - samRecord->addTag("CR", 'Z', barcode_seq.c_str()); - samRecord->addTag("CY", 'Z', barcode_quality.c_str()); - // add UMI - samRecord->addTag("UR", 'Z', umi_seq.c_str()); - samRecord->addTag("UY", 'Z', umi_quality.c_str()); - // add raw sequence and quality sequence for the index - if (has_I1_file_list) - { - samRecord->addTag("SR", 'Z', fastQFileI1->myRawSequence.c_str()); - samRecord->addTag("SY", 'Z', fastQFileI1->myQualityString.c_str()); - } -} - -// Computes the whitelist-corrected barcode and adds it to sam_record. -// Returns the index of the bamfile bucket / writer thread where sam_record -// should be sent. -int32_t correctBarcodeToWhitelist( - const std::string& barcode, SamRecord* sam_record, const WhiteListData* white_list_data, - int* n_barcode_corrected, int* n_barcode_correct, int* n_barcode_errors, int num_writer_threads) -{ - std::string correct_barcode; - // bucket barcode is used to pick the target bam file - // This is done because in the case of incorrigible barcodes - // we need a mechanism to uniformly distribute the alignments - // so that no bam is oversized to putting all such barcode less - // sequences into one particular. 
Incorregible barcodes are simply - // added withouth the CB tag - std::string bucket_barcode; - if (auto it = white_list_data->mutations.find(barcode) ; it != white_list_data->mutations.end()) - { - int64_t mutation_index = it->second; - if (mutation_index == -1) // -1 means raw barcode is correct - { - correct_barcode = barcode; - *n_barcode_correct += 1; - } - else - { - // it is a 1-mutation of some whitelist barcode so get the - // barcode by indexing into the vector of whitelist barcodes - correct_barcode = white_list_data->barcodes[mutation_index]; - *n_barcode_corrected += 1; - } - // is used for computing the file index - bucket_barcode = correct_barcode; - - // corrected barcode should be added to the samrecord - sam_record->addTag("CB", 'Z', correct_barcode.c_str()); - } - else // not possible to correct the raw barcode - { - *n_barcode_errors += 1; - bucket_barcode = barcode; - } - // destination bam file index computed based on the bucket_barcode - return std::hash {}(bucket_barcode) % num_writer_threads; -} - -// Returns true if successfully read a sequence. 
-bool readOneItem(FastQFile& fastQFileI1, bool has_I1_file_list, - FastQFile& fastQFileR1, FastQFile& fastQFileR2) -{ - return (!has_I1_file_list || - ( - has_I1_file_list && - fastQFileI1.readFastQSequence() == FastQStatus::FASTQ_SUCCESS - ) - ) - && fastQFileR1.readFastQSequence() == FastQStatus::FASTQ_SUCCESS - && fastQFileR2.readFastQSequence() == FastQStatus::FASTQ_SUCCESS; -} - -void fastQFileReaderThread( - int reader_thread_index, std::string filenameI1, String filenameR1, - String filenameR2, const WhiteListData* white_list_data, - std::function sam_record_filler, - std::function barcode_getter, - std::function output_handler) -{ - /// setting the shortest sequence allowed to be read - FastQFile fastQFileI1(4, 4); - FastQFile fastQFileR1(4, 4); - FastQFile fastQFileR2(4, 4); - - bool has_I1_file_list = true; - if (!filenameI1.empty()) - { - if (fastQFileI1.openFile(String(filenameI1.c_str()), BaseAsciiMap::UNKNOWN) != - FastQStatus::FASTQ_SUCCESS) - { - crash(std::string("Failed to open file: ") + filenameI1); - } - } - else - has_I1_file_list = false; - - if (fastQFileR1.openFile(filenameR1, BaseAsciiMap::UNKNOWN) != - FastQStatus::FASTQ_SUCCESS) - { - crash(std::string("Failed to open file: ") + filenameR1.c_str()); - } - if (fastQFileR2.openFile(filenameR2, BaseAsciiMap::UNKNOWN) != - FastQStatus::FASTQ_SUCCESS) - { - crash(std::string("Failed to open file: ") + filenameR2.c_str()); - } - - // Keep reading the file until there are no more fastq sequences to process. 
- int total_reads = 0; - int n_barcode_errors = 0; - int n_barcode_corrected = 0; - int n_barcode_correct = 0; - printf("Opening the thread in %d\n", reader_thread_index); - - while (fastQFileR1.keepReadingFile()) - { - if (readOneItem(fastQFileI1, has_I1_file_list, fastQFileR1, fastQFileR2)) - { - total_reads++; - - SamRecord* samrec = g_read_arenas[reader_thread_index]->acquireSamRecordMemory(); - - // prepare the samrecord with the sequence, barcode, UMI, and their quality sequences - sam_record_filler(samrec, &fastQFileI1, &fastQFileR1, &fastQFileR2, has_I1_file_list); - std::string barcode = barcode_getter(samrec, &fastQFileI1, &fastQFileR1, &fastQFileR2, has_I1_file_list); - - // bucket barcode is used to pick the target bam file - // This is done because in the case of incorrigible barcodes - // we need a mechanism to uniformly distribute the alignments - // so that no bam is oversized to putting all such barcode less - // sequences into one particular. Incorregible barcodes are simply - // added withouth the CB tag - int32_t bam_bucket = correctBarcodeToWhitelist( - barcode, samrec, white_list_data, &n_barcode_corrected, &n_barcode_correct, - &n_barcode_errors, g_write_queues.size()); - - output_handler(g_write_queues[bam_bucket].get(), samrec, reader_thread_index); - - if (total_reads % 10000000 == 0) - { - printf("%d\n", total_reads); - std::string a = std::string(fastQFileR1.myRawSequence.c_str()); - printf("%s\n", fastQFileR1.mySequenceIdLine.c_str()); - printf("%s\n", fastQFileR2.mySequenceIdLine.c_str()); - } - } - } - - // Finished processing all of the sequences in the file. - // Close the input files. 
- if (has_I1_file_list) - fastQFileI1.closeFile(); - fastQFileR1.closeFile(); - fastQFileR2.closeFile(); - printf("Total barcodes:%d\n correct:%d\ncorrected:%d\nuncorrectible" - ":%d\nuncorrected:%lf\n", - total_reads, n_barcode_correct, n_barcode_corrected, n_barcode_errors, - n_barcode_errors/static_cast(total_reads) * 100); -} - -void mainCommon( - std::string white_list_file, int num_writer_threads, std::string output_format, - std::vector I1s, std::vector R1s, std::vector R2s, - std::string sample_id, - std::function sam_record_filler, - std::function barcode_getter, - std::function output_handler) -{ - std::cout << "reading whitelist file " << white_list_file << "..."; - // stores barcode correction map and vector of correct barcodes - WhiteListData white_list_data = readWhiteList(white_list_file); - std::cout << "done" << std::endl; - - - for (int i = 0; i < num_writer_threads; i++) - g_write_queues.push_back(std::make_unique()); - - // execute the bam file writers threads - std::vector writers; - if (output_format == "BAM") - for (int i = 0; i < num_writer_threads; i++) - writers.emplace_back(bamWriterThread, i, sample_id); - else if (output_format == "FASTQ") - for (int i = 0; i < num_writer_threads; i++) - writers.emplace_back(fastqWriterThread, i); - else - crash("ERROR: Output-format must be either FASTQ or BAM"); - - // execute the fastq readers threads - std::vector readers; - - for (unsigned int i = 0; i < R1s.size(); i++) - { - assert(I1s.empty() || I1s.size() == R1s.size()); - // if there is no I1 file then send an empty file name - std::string I1 = I1s.empty() ? 
"" : I1s[i]; - - g_read_arenas.push_back(std::make_unique()); - readers.emplace_back(fastQFileReaderThread, i, I1.c_str(), R1s[i].c_str(), - R2s[i].c_str(), &white_list_data, sam_record_filler, barcode_getter, output_handler); - } - - for (auto& reader : readers) - reader.join(); - - // Now that there's nothing left to read, we can safely append a shutdown - // signal to all the write queues. - for (auto& write_queue : g_write_queues) - write_queue->enqueueShutdownSignal(); - - for (auto& writer : writers) -writer.join(); -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/fastq_common.h b/tools/scripts/sctools/fastqpreprocessing/src/fastq_common.h deleted file mode 100644 index 50d61c64..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/fastq_common.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ -#define __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ - -#include -#include -#include -#include -#include -#include - -#include "FastQFile.h" -#include "FastQStatus.h" -#include "SamFile.h" -#include "SamValidation.h" - -// A pointer to a valid SamRecord waiting to be written to disk, and the index -// of the g_read_arenas that pointer should be released to after the write. -using PendingWrite = std::pair; - -class WriteQueue -{ -public: - static constexpr int kShutdown = -1; - PendingWrite dequeueWrite(); - void enqueueWrite(PendingWrite write); - void enqueueShutdownSignal(); -private: - std::mutex mutex_; - std::condition_variable cv_; - std::queue queue_; -}; - -// This is a hack for the sake of samplefastq program. 
-void releaseReaderThreadMemory(int reader_thread_index, SamRecord* samRecord); - -void fillSamRecordCommon(SamRecord* samRecord, FastQFile* fastQFileI1, - FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list, - std::string const& barcode_seq, std::string const& barcode_quality, - std::string const& umi_seq, std::string const& umi_quality); - -void mainCommon( - std::string white_list_file, int num_writer_threads, std::string output_format, - std::vector I1s, std::vector R1s, std::vector R2s, - std::string sample_id, - std::function sam_record_filler, - std::function barcode_getter, - std::function output_handler); - -#endif // __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_ diff --git a/tools/scripts/sctools/fastqpreprocessing/src/fastq_metrics.cpp b/tools/scripts/sctools/fastqpreprocessing/src/fastq_metrics.cpp deleted file mode 100644 index 3edd6700..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/fastq_metrics.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/** - * @file fastq_metrics.cpp - * @brief functions for computing metrics - * @author Farzaneh Khajouei and Fred Douglas - * @date 2022-05-25 - ***********************************************/ -#include "FastQFile.h" -#include "FastQStatus.h" -#include "fastq_metrics.h" -#include -#include -#include -#include - -using std::string; - -std::vector> parseReadStructure(std::string read_structure) -{ - std::vector> ret; - int next_ind = 0; - while (next_ind < read_structure.size()) - { - int type_ind = read_structure.find_first_not_of("0123456789", next_ind); - assert(type_ind != std::string::npos); - char type = read_structure[type_ind]; - int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); - ret.emplace_back(type, len); - next_ind = type_ind + 1; - } - return ret; -} - -int getLengthOfType(string read_structure,char type) -{ - int total_length = 0; - for (auto [curr_type, length] : parseReadStructure(read_structure)) - if (curr_type == type) - total_length += length; - 
return total_length; -} - -void PositionWeightMatrix::recordChunk(string s) -{ - for (int index = 0; index < s.size(); index++) - { - switch (s[index]) - { - case 'A': - case 'a': - A[index]++; - break; - case 'C': - case 'c': - C[index]++; - break; - case 'G': - case 'g': - G[index]++; - break; - case 'T': - case 't': - T[index]++; - break; - case 'N': - case 'n': - N[index]++; - break; - default: - std::cerr<<"Unknown character:"<processShard(filenameR1, read_structure, white_list_data); -} -void FastQMetricsShard::processShard(String filenameR1, std::string read_structure, - const WhiteListData* white_list_data) -{ - /// setting the shortest sequence allowed to be read - FastQFile fastQFileR1(4, 4); - // open the R1 file - if (fastQFileR1.openFile(filenameR1, BaseAsciiMap::UNKNOWN) != FastQStatus::FASTQ_SUCCESS) - crash("Failed to open R1 file"); - - // Keep reading the file until there are no more fastq sequences to process. - int n_lines_read = 0; - while (fastQFileR1.keepReadingFile()) - { - if (fastQFileR1.readFastQSequence() != FastQStatus::FASTQ_SUCCESS) - break; - - ingestBarcodeAndUMI(std::string_view(fastQFileR1.myRawSequence.c_str(),fastQFileR1.myRawSequence.Length())); - - n_lines_read++; - if (n_lines_read % 10000000 == 0) - { - printf("%d\n", n_lines_read); - std::string a = std::string(fastQFileR1.myRawSequence.c_str()); - printf("%s\n", fastQFileR1.mySequenceIdLine.c_str()); - } - } - // Finished processing all of the sequences in the file. - // Close the input files. 
- fastQFileR1.closeFile(); -} - -PositionWeightMatrix& PositionWeightMatrix::operator+=(const PositionWeightMatrix& rhs) -{ - for (int i=0; i < A.size(); i++) - { - A[i] += rhs.A[i]; - C[i] += rhs.C[i]; - G[i] += rhs.G[i]; - T[i] += rhs.T[i]; - N[i] += rhs.N[i]; - } - return *this; -} - -FastQMetricsShard& FastQMetricsShard::operator+=(const FastQMetricsShard& rhs) -{ - for (auto [key,value] : rhs.barcode_counts_) - barcode_counts_[key] += value; - for (auto [key,value] : rhs.umi_counts_) - umi_counts_[key] += value; - - barcode_+=rhs.barcode_; - umi_+=rhs.umi_; - return *this; -} - -/** @copydoc process_inputs */ -void process_inputs(const INPUT_OPTIONS_FASTQ_READ_STRUCTURE& options, - const WhiteListData* white_list_data) -{ - // number of files based on the input size - int num_files = options.R1s.size(); - - // compute UMI and cell_barcode lengths - - int umi_length = getLengthOfType(options.read_structure,'M'); - int CB_length = getLengthOfType(options.read_structure,'C'); - - // create the data for the threads - vector fastqMetrics; - for (int i = 0; i < num_files; i++) - fastqMetrics.emplace_back(options.read_structure); - - // execute the fastq readers threads - vector readers; - for (unsigned int i = 0; i < options.R1s.size(); i++) - { - readers.emplace_back(processShard, - &fastqMetrics[i], - options.R1s[i].c_str(), - options.read_structure.c_str(), - white_list_data); - - } - - // every reader thread joins. - for (unsigned int i = 0; i < options.R1s.size(); i++) - readers[i].join(); - - std::cout << "Done reading all shards. Will now aggregate and write to file; " - << "this will take a few minutes." 
<< std::endl; - FastQMetricsShard::mergeMetricsShardsToFile(options.sample_id, fastqMetrics, umi_length, CB_length); -} - -void writeCountsFile(std::unordered_map counts, std::string filename) -{ - std::ofstream out(filename, std::ofstream::out); - std::vector> sorted_counts; - for (auto [str, count] : counts) - sorted_counts.emplace_back(str, count); - std::sort(sorted_counts.begin(), sorted_counts.end(), //sort counts from most to fewest! - [](std::pair const& a, std::pair const& b) - { - return a.second > b.second; - }); - for (auto [str, count] : sorted_counts) - out << count << "\t" << str << "\n"; -} -void PositionWeightMatrix::writeToFile(std::string filename) -{ - std::ofstream out(filename, std::ofstream::out); - out << "position\tA\tC\tG\tT\tN\n"; - for (int i = 0; i < A.size(); i++) - out << (i + 1) << "\t" << A[i] << "\t" << C[i] << "\t" << G[i] << "\t" << T[i] << "\t" << N[i] << "\n"; -} -void FastQMetricsShard::mergeMetricsShardsToFile(std::string filename_prefix, vector shards, int umi_length, int CB_length) -{ - FastQMetricsShard total(shards[0].read_structure_); - for (FastQMetricsShard const& shard : shards) - total += shard; - - writeCountsFile(total.umi_counts_, filename_prefix + ".numReads_perCell_XM.txt"); - writeCountsFile(total.barcode_counts_, filename_prefix + ".numReads_perCell_XC.txt"); - total.barcode_.writeToFile(filename_prefix + ".barcode_distribution_XC.txt"); - total.umi_.writeToFile(filename_prefix + ".barcode_distribution_XM.txt"); -} - -int main(int argc, char** argv) -{ - INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqMetrics(argc, argv); - std::cout << "reading whitelist file " << options.white_list_file << "..."; - WhiteListData white_list_data = readWhiteList(options.white_list_file); - std::cout << "done" << std::endl; - - process_inputs(options, &white_list_data); - return 0; -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/fastq_metrics.h 
b/tools/scripts/sctools/fastqpreprocessing/src/fastq_metrics.h deleted file mode 100644 index 5ac3fbd3..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/fastq_metrics.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef __FASTQ_METRICS_H__ -#define __FASTQ_METRICS_H__ -/** - * @file fastq_metrics.h - * @brief functions for computing metrics - * @author Farzaneh Khajouei and Fred Douglas - * @date 2022-05-25 - ***********************************************/ -#include -#include -#include -#include -#include "BaseAsciiMap.h" -#include "utilities.h" -#include "input_options.h" -#include "FastQFile.h" -#include "FastQStatus.h" - -class PositionWeightMatrix -{ -public: - PositionWeightMatrix(int length): A(length), C(length), G(length), T(length), N(length) {} - void recordChunk(std::string s); - PositionWeightMatrix& operator+=(const PositionWeightMatrix& rhs); - void writeToFile(std::string filename); - - std::vector A; - std::vector C; - std::vector G; - std::vector T; - std::vector N; -}; - -class FastQMetricsShard -{ -public: - FastQMetricsShard(std::string read_structure); - void ingestBarcodeAndUMI(std::string_view raw_seq); - void processShard(String filenameR1, std::string read_structure, - const WhiteListData* white_list_data); - static void mergeMetricsShardsToFile(std::string filename_prefix, - std::vector shards, - int umi_length, int CB_length); - FastQMetricsShard& operator+=(const FastQMetricsShard& rhs); - - -private: - std::string read_structure_; - int barcode_length_; - int umi_length_; - std::vector> tagged_lengths_; - std::unordered_map barcode_counts_; - std::unordered_map umi_counts_; - PositionWeightMatrix barcode_; - PositionWeightMatrix umi_; -}; - -#endif // __FASTQ_METRICS_H__ diff --git a/tools/scripts/sctools/fastqpreprocessing/src/fastq_slideseq.cpp b/tools/scripts/sctools/fastqpreprocessing/src/fastq_slideseq.cpp deleted file mode 100644 index 899b438d..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/fastq_slideseq.cpp +++ 
/dev/null @@ -1,78 +0,0 @@ -#include "fastq_common.h" -#include "input_options.h" - -std::vector> parseReadStructure(std::string const& read_structure) -{ - std::vector> ret; - int next_ind = 0; - while (next_ind < read_structure.size()) - { - int type_ind = read_structure.find_first_not_of("0123456789", next_ind); - assert(type_ind != std::string::npos); - char type = read_structure[type_ind]; - int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); - ret.emplace_back(type, len); - next_ind = type_ind + 1; - } - return ret; -} - -std::vector> g_parsed_read_structure; - -void fillSamRecordWithReadStructure(SamRecord* sam, FastQFile* fastQFileI1, - FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list) -{ - // check the sequence names matching - std::string a = std::string(fastQFileR1->myRawSequence.c_str()); - std::string b = std::string(fastQFileR1->myQualityString.c_str()); - // extract the raw barcode and UMI 8C18X6C9M1X and raw barcode and UMI quality string - - std::string barcode_seq, barcode_quality, umi_seq, umi_quality; - int cur_ind = 0; - for (auto [tag, length] : g_parsed_read_structure) - { - switch (tag) - { - case 'C': - barcode_seq += a.substr(cur_ind, length); - barcode_quality += b.substr(cur_ind, length); - break; - case 'M': - umi_seq += a.substr(cur_ind, length); - umi_quality += b.substr(cur_ind, length); - break; - default: - break; - } - cur_ind += length; - } - fillSamRecordCommon(sam, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, - barcode_seq, barcode_quality, umi_seq, umi_quality); -} - -std::string slideseqBarcodeGetter(SamRecord* sam, FastQFile* fastQFileI1, - FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list) -{ - return std::string(sam->getString("CR").c_str()); -} - -void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) -{ - cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); -} - -int main(int argc, 
char** argv) -{ - INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqSlideseq(argc, argv); - // number of output bam files, and one writer thread per bam file - int num_writer_threads = get_num_blocks(options); - - g_parsed_read_structure = parseReadStructure(options.read_structure); - - mainCommon(options.white_list_file, num_writer_threads, options.output_format, - options.I1s, options.R1s, options.R2s, options.sample_id, - fillSamRecordWithReadStructure, slideseqBarcodeGetter, outputHandler); - return 0; -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/fastqprocess.cpp b/tools/scripts/sctools/fastqpreprocessing/src/fastqprocess.cpp deleted file mode 100644 index 4fe21f25..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/fastqprocess.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/** - * @file fastqprocess.cpp - * @brief functions for file processing - * @author Kishori Konwar - * @date 2020-08-27 - ***********************************************/ - -#include "fastq_common.h" -#include "input_options.h" - -unsigned int g_barcode_length; -unsigned int g_umi_length; - -void fillSamRecord(SamRecord* samRecord, FastQFile* fastQFileI1, - FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list) -{ - // check the sequence names matching - std::string a = std::string(fastQFileR1->myRawSequence.c_str()); - std::string b = std::string(fastQFileR1->myQualityString.c_str()); - - // extract the raw barcode and UMI - std::string barcode_seq = a.substr(0, g_barcode_length); - std::string umi_seq = a.substr(g_barcode_length, g_umi_length); - - // extract raw barcode and UMI quality string - std::string barcode_quality = b.substr(0, g_barcode_length); - std::string umi_quality = b.substr(g_barcode_length, g_umi_length); - - fillSamRecordCommon(samRecord, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, - barcode_seq, barcode_quality, umi_seq, umi_quality); -} - -std::string barcodeGetter(SamRecord* samRecord, FastQFile* fastQFileI1, - 
FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list) -{ - return std::string(fastQFileR1->myRawSequence.c_str()).substr(0, g_barcode_length); -} - -void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) -{ - cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); -} - -int main(int argc, char** argv) -{ - InputOptionsFastqProcess options = readOptionsFastqProcess(argc, argv); - // number of output bam files, and one writer thread per bam file - int num_writer_threads = get_num_blocks(options); - - g_barcode_length = options.barcode_length; - g_umi_length = options.umi_length; - - mainCommon(options.white_list_file, num_writer_threads, options.output_format, - options.I1s, options.R1s, options.R2s, options.sample_id, - fillSamRecord, barcodeGetter, outputHandler); - return 0; -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/htslib_tagsort.cpp b/tools/scripts/sctools/fastqpreprocessing/src/htslib_tagsort.cpp deleted file mode 100644 index 069c91ad..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/htslib_tagsort.cpp +++ /dev/null @@ -1,487 +0,0 @@ -/** - * @file htslib_tagsort.cpp - * @brief functions for file processing - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ - -constexpr int kThreshold = 30; // qual score threshold - -#include "htslib_tagsort.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -extern "C" { - bam_hdr_t* sam_hdr_read(samFile*); //read header - htsFile* hts_open(const char* fn, const char* mode); -} - - -/* - @brief get the int tag or -1 - -*/ -inline int get_itag_or_default(bam1_t* aln, const char* tagname, int default_value) -{ - uint8_t* p; - int tag_value = -1; - if ((p = bam_aux_get(aln, tagname)) == nullptr) - tag_value = default_value; - else - tag_value = bam_aux2i(p); - - 
return tag_value; -} - -/* - @brief get the string tag or the default - -*/ -inline char* get_Ztag_or_default(bam1_t* aln, const char* tagname, char* default_value) -{ - uint8_t* p; - char* tag_value = nullptr; - if ((p = bam_aux_get(aln, tagname)) == nullptr) - tag_value = default_value; - else - { - tag_value = bam_aux2Z(p); - if (strcmp(tag_value, "-") == 0) - tag_value = default_value; - } - return tag_value; -} - -using TRIPLET = std::tuple; - -using TAGTUPLE = std::tuple< - TRIPLET /* barcode umi and gene_id, not necessarily in that order */, - std::string /* reference */, - std::string /* biotype */, - int /* pos */, - int /*rev strand 1 for yes, 0 otherwise*/, - float /*avg barcode qual score */, - float /* frac of barcode qual score >30 */, - float /*avg qual seq */, - float /*fract of >30 score qual seq*/, - int /*NH*/, - int /*perfect molecule barcode, 1 is yes, 0 otherwise*/, - int /*spliced reads 1 yes, 0 otherwise*/, - int /*is duplicate */, - int /*perfect cell barcode 1 is yes, 0 otherwise*/, - float /* fraction of umi qual score > 30 */ - >; - -enum class TagOrder { BUG, BGU, UBG, UGB, GUB, GBU }; -TRIPLET makeTriplet(std::string barcode, std::string umi, std::string gene_id, TagOrder tag_order) -{ - switch (tag_order) - { - case TagOrder::BUG: return TRIPLET(barcode, umi, gene_id); - case TagOrder::BGU: return TRIPLET(barcode, gene_id, umi); - case TagOrder::UBG: return TRIPLET(umi, barcode, gene_id); - case TagOrder::UGB: return TRIPLET(umi, gene_id, barcode); - case TagOrder::GUB: return TRIPLET(gene_id, umi, barcode); - case TagOrder::GBU: return TRIPLET(gene_id, barcode, umi); - default: crash("no such TagOrder"); return TRIPLET("","",""); - } -} - -void parseOneAlignment(std::vector* tuple_records, bam1_t* aln, - INPUT_OPTIONS_TAGSORT& options, const bam_hdr_t* bam_hdr, - TagOrder tag_order) -{ - // "consts" that the library doesn't allow to be const. 
- char empty[] = ""; - char none[] = "None"; - char nochr[] = "*"; - - // extract the barcodes corrected and corrected - char* barcode = get_Ztag_or_default(aln, options.barcode_tag.c_str(), none); - char* barcode_raw = get_Ztag_or_default(aln, "CR", empty); - - // to be called perfect, the corrected and raw barcodes should match - int perfect_cell_barcode = (strcmp(barcode, barcode_raw) == 0) ? 1 : 0; - - // barcode quality score - char* barcode_qual = get_Ztag_or_default(aln, "CY", empty); - - //average barcode across the query and the fraction of barcodes above threshold - float sum_barcode_qual = 0; - float num_bp_above_threshold = 0; - size_t len = strlen(barcode_qual); - for (unsigned int k = 0; k < len; k++) - { - // barcodes qual strings are in ASCII symbols subtracting 33 gives the phred qual score - uint8_t qual_score = (((uint8_t)barcode_qual[k]) - 33); - sum_barcode_qual += qual_score; - if (qual_score > kThreshold) - num_bp_above_threshold += 1; - } - float avg_cell_barcode_qual = sum_barcode_qual / (float)len; - float cell_barcode_qual_above_threshold = (float)num_bp_above_threshold / (float)len; - - // corrected molecule barcodes (UMIs) - char* umi = get_Ztag_or_default(aln, options.umi_tag.c_str(), none); - // raw molecule barcodes - char* umi_raw = get_Ztag_or_default(aln, "UR", empty); - - // to be called perfect, the corrected and raw molecular barcodes should match - int perfect_molecule_barcode = (strcmp(umi, umi_raw) == 0) ? 
1 : 0; - - // qual score for molecular barcodes - char* umi_qual = get_Ztag_or_default(aln, "UY", empty); - - float sum_umi_qual = 0; - float num_umi_above_threshold = 0; - len = strlen(umi_qual); - for (unsigned int k = 0; k < len; k++) - { - // molecular barcodes qual strings are in ASCII symbols subtracting 33 gives the phred qual score - sum_umi_qual += ((uint8_t)umi_qual[k] -33); - if (((uint8_t)umi_qual[k] - 33) > kThreshold) - num_umi_above_threshold += 1; - } - float frac_umi_qual_above_threshold = (float)num_umi_above_threshold / (float)len; - - char* gene_id = get_Ztag_or_default(aln, options.gene_tag.c_str(), none); - char* location_tag = get_Ztag_or_default(aln, "XF", empty); - - int nh_num = get_itag_or_default(aln, "NH", -1); - - const char* chr = (aln->core.tid == -1) ? nochr : bam_hdr->target_name[aln->core.tid]; - - uint32_t pos = aln->core.pos; // position. - uint32_t isrev = bam_is_rev(aln) ? 1 : 0; // is reverse stand - uint32_t is_duplicate = ((aln->core.flag & BAM_FDUP) != 0) ? 1 : 0; - - // sequence quality score - float avg_sequence_qual = 0, sum_qual = 0; - float qual_above_threshold = 0; - uint8_t* qual_seq = bam_get_qual(aln); // pointer to the qual data - len = aln->core.l_qseq; //length of qual seq. 
- for (unsigned int k = 0; k < len; k++) - { - // the qual string are already in phred scores - sum_qual += qual_seq[k]; - if (qual_seq[k] > kThreshold) - qual_above_threshold += 1; - } - avg_sequence_qual = sum_qual / (float)len; - qual_above_threshold = qual_above_threshold / (float)len; - - uint32_t* cigar = bam_get_cigar(aln); - // see if it is spliced, i.e., N appears in the CIGAR string - uint32_t spliced_read = 0; - for (unsigned int k = 0; k < aln->core.n_cigar; k++) - { - uint32_t op = cigar[k] & BAM_CIGAR_MASK; - if (op == 3 && (cigar[k] >> BAM_CIGAR_SHIFT) != 0) - { - spliced_read = 1; - break; - } - } - - tuple_records->emplace_back( - makeTriplet(barcode, umi, gene_id, tag_order), /* triplet of tags */ - std::string(chr), /* record[0] */ - std::string(location_tag), /* record[1] */ - pos, /* record [2] */ - isrev, /* record[3] */ - avg_cell_barcode_qual, /* record[4] */ - cell_barcode_qual_above_threshold, /* record[5] */ - avg_sequence_qual, /* record[6] */ - qual_above_threshold, /* record[7] */ - nh_num, /* record[8] */ - perfect_molecule_barcode, /* record[9] */ - spliced_read, /* record[10] */ - is_duplicate, /* record[11] */ - perfect_cell_barcode, /* record[12] */ - frac_umi_qual_above_threshold /* record[13] */); -} - -inline bool sortTripletsLex(std::pair const& a, - std::pair const& b) -{ - using std::get; - if (get<0>(a.first) != get<0>(b.first)) - return get<0>(a.first).compare(get<0>(b.first)) < 0; - if (get<1>(a.first) != get<1>(b.first)) - return get<1>(a.first).compare(get<1>(b.first)) < 0; - return get<2>(a.first).compare(get<2>(b.first)) < 0; -} - -// Generates a random alphanumeric string (AZaz09) of a fixed length. 
-constexpr int kStringLen = 40; -std::string randomString() -{ - auto randchar = []() -> char - { - const char charset[] = - "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz"; - const size_t max_index = (sizeof(charset) - 1); - return charset[ rand() % max_index ]; - }; - std::string str(kStringLen, 0); - std::generate_n(str.begin(), kStringLen, randchar); - return str; -} - -/** - * @brief This function takes a vector of tuples of the tags, sorts them - * in the dictionary order of the tags and then writes these in the same - * order to a txt file - * - * @details - * The function take the vector of tags tuples, writes the sorted tuples into - * a file. The filename is generated randomly (enought to avoid collision with other files) - * in the temp folder specified. - * - * @param tuple_records: vector &, reference to a vector of TAGTUPLES - * @return a string for the random file name -*/ -std::string sortAndWriteToPartialTxtFile(std::vector const& tuple_records, - std::string const& tmp_folder) -{ - using std::get; - - std::string tempfile_filename = tmp_folder + "/" + randomString() + ".txt"; - std::ofstream outfile(tempfile_filename); - - // Sort by triplet, maintaining each triplet's pre-sorted index... - std::vector> index_pairs; - for (size_t i = 0; i < tuple_records.size(); i++) - index_pairs.emplace_back(get<0>(tuple_records[i]), i); - std::sort(index_pairs.begin(), index_pairs.end(), sortTripletsLex); - - // ...then write the triplets in sorted order, linked up with the tuple_record - // located at its pre-sorted index, i.e. the record for the triplet. - for (auto& [triplet_ptr, record_index] : index_pairs) - { - // TODO? - // what if you ran out of disk space ???? 
NEED TO add logic - outfile << get<0>(triplet_ptr) /* first tag */ << "\t" - << get<1>(triplet_ptr) /* second tag */ << "\t" - << get<2>(triplet_ptr) /* third tag */ << "\t" - << get<1>(tuple_records[record_index]) /* record[0] */ << "\t" - << get<2>(tuple_records[record_index]) /* record[1] */ << "\t" - << get<3>(tuple_records[record_index]) /* record[2] */ << "\t" - << get<4>(tuple_records[record_index]) /* record[3] */ << "\t" - << get<5>(tuple_records[record_index]) /* record[4] */ << "\t" - << get<6>(tuple_records[record_index]) /* record[5] */ << "\t" - << get<7>(tuple_records[record_index]) /* record[6] */ << "\t" - << get<8>(tuple_records[record_index]) /* record[7] */ << "\t" - << get<9>(tuple_records[record_index]) /* record[8] */ << "\t" - << get<10>(tuple_records[record_index]) /* record[9] */ << "\t" - << get<11>(tuple_records[record_index]) /* record[10] */ << "\t" - << get<12>(tuple_records[record_index]) /* record[11] */ << "\t" - << get<13>(tuple_records[record_index]) /* record[12] */ << "\t" - << get<14>(tuple_records[record_index]) /* record[13] */ << "\n"; - } - - return tempfile_filename; -} - -// Manages worker threads' access to reading the input file. Any worker can take -// any line from the file, but only one thread can be reading at a time. This -// class lets them take turns: they call readAlignments() whenever they want -// more data, and it blocks until they can have it. 
-class AlignmentReader -{ -public: - explicit AlignmentReader(INPUT_OPTIONS_TAGSORT options) : options_(options) - { - if ((sam_file_ptr_ = hts_open(options.bam_input.c_str(),"r")) == nullptr) - crash(options.bam_input + ": cannot open file."); - - bam_hdr_ = sam_hdr_read(sam_file_ptr_); //read header - - allocateAlignmentBuffers(); - } - - ~AlignmentReader() - { - for (unsigned int i = 0; i < options_.nthreads; i++) - { - for (unsigned int k = 0; k < options_.alignments_per_batch; k++) - bam_destroy1(aln_arr_[i][k]); - free(aln_arr_[i]); - } - free(aln_arr_); - sam_hdr_destroy(bam_hdr_); - hts_close(sam_file_ptr_); - } - - // Blocks until it's this thread's turn to read, then reads a batch of alignments. - // Returns a pointer to the alignment ptr array, and number of alignment ptrs in the array. - std::pair readAlignments(int thread_index) - { - const std::lock_guard lock(mutex_); - unsigned int cur_num_read = 0; - while (cur_num_read < options_.alignments_per_batch) - { - if (sam_read1(sam_file_ptr_, bam_hdr_, aln_arr_[thread_index][cur_num_read]) == 0) - cur_num_read++; - else - break; - } - total_aligns_read_ += cur_num_read; - batches_read_++; - std::cout << "Finished reading batch number: " << batches_read_ << std::endl; - return std::make_pair(aln_arr_[thread_index], cur_num_read); - } - - void addToPartialFilenames(std::vector names) - { - const std::lock_guard lock(mutex_); - for (std::string name : names) - partial_filenames_.push_back(name); - } - - std::vector partial_filenames() const { return partial_filenames_; } - bam_hdr_t* bam_hdr() const { return bam_hdr_; } - uint64_t total_aligns_read() const { return total_aligns_read_; } - -private: - void allocateAlignmentBuffers() - { - assert(options_.nthreads <= kMaxTagsortThreads); - std::string msg = "Now allocating alignment buffers. 
If the program crashes " - "here, it probably ran out of memory..."; - std::cout << msg << std::endl; - std::cerr << msg << std::endl; - - aln_arr_ = (bam1_t***)malloc(sizeof(bam1_t**) * options_.nthreads); - for (unsigned int i = 0; i < options_.nthreads; i++) - { - aln_arr_[i] = (bam1_t**)malloc(sizeof(bam1_t*) * options_.alignments_per_batch); - for (unsigned int k = 0; k < options_.alignments_per_batch; k++) - aln_arr_[i][k] = bam_init1(); //initialize an alignment - } - std::string done_msg = "Successfully allocated alignment buffers."; - std::cout << done_msg << std::endl; - std::cerr << done_msg << std::endl; - } - - std::mutex mutex_; - uint64_t total_aligns_read_ = 0; - uint64_t batches_read_ = 0; - INPUT_OPTIONS_TAGSORT options_; - samFile* sam_file_ptr_ = nullptr; - bam_hdr_t* bam_hdr_ = nullptr; - bam1_t*** aln_arr_ = nullptr; - std::vector partial_filenames_; -}; - -void partialSortWorkerThread(int my_thread_index, AlignmentReader* alignment_reader, - TagOrder tag_order, INPUT_OPTIONS_TAGSORT options) -{ - std::vector my_partial_filenames; - bam_hdr_t* bam_hdr = alignment_reader->bam_hdr(); - while (true) - { - std::vector tuple_records; - - auto [aln_ptr_array, alns_length] = alignment_reader->readAlignments(my_thread_index); - if (alns_length == 0) - break; - - for (unsigned int i = 0; i < alns_length; i++) - parseOneAlignment(&tuple_records, aln_ptr_array[i], options, bam_hdr, tag_order); - - my_partial_filenames.push_back( - sortAndWriteToPartialTxtFile(tuple_records, options.temp_folder)); - } - alignment_reader->addToPartialFilenames(my_partial_filenames); -} - -TagOrder getTagOrder(INPUT_OPTIONS_TAGSORT options) -{ - assert(options.tag_order.size() == 3); - // the order of the three tags are define by the order of the supplied input arguments - // tag.order [tag_name] -> order map - if (options.tag_order[options.barcode_tag] == 0 && - options.tag_order[options.gene_tag] == 1 && - options.tag_order[options.umi_tag] == 2) - { - return 
TagOrder::BGU; - } - if (options.tag_order[options.umi_tag] == 0 && - options.tag_order[options.barcode_tag] == 1 && - options.tag_order[options.gene_tag] == 2) - { - return TagOrder::UBG; - } - if (options.tag_order[options.umi_tag] == 0 && - options.tag_order[options.gene_tag] == 1 && - options.tag_order[options.barcode_tag] == 2) - { - return TagOrder::UGB; - } - if (options.tag_order[options.gene_tag] == 0 && - options.tag_order[options.umi_tag] == 1 && - options.tag_order[options.barcode_tag] == 2) - { - return TagOrder::GUB; - } - if (options.tag_order[options.gene_tag] == 0 && - options.tag_order[options.barcode_tag] == 1 && - options.tag_order[options.umi_tag] == 2) - { - return TagOrder::GBU; - } - return TagOrder::BUG; -} - -/** - * @brief From the input bam create a list of txt files with the records (lines) - * sorted according to the * tags - * - * @details - * The input bam file is read chunk by chunk, sorted by the tags and the written - * out as a text file in the sorted manner. 
- * - * @param options: INPUT_OPTIONS_TAGSORT the inputs to the program - * @return a vector containing the file paths of the partial files -*/ -std::vector create_sorted_file_splits_htslib(INPUT_OPTIONS_TAGSORT options) -{ - std::cout << "Running htslib" << std::endl; - AlignmentReader alignment_reader(options); - - TagOrder tag_order = getTagOrder(options); - - std::vector worker_threads; - for (int i = 0; i < options.nthreads; i++) - { - worker_threads.emplace_back(partialSortWorkerThread, - i, &alignment_reader, tag_order, options); - } - for (auto& worker_thread : worker_threads) - worker_thread.join(); - - std::cout << "Read " << alignment_reader.total_aligns_read() << " records in batches of " - << options.alignments_per_batch << std::endl; - - return alignment_reader.partial_filenames(); -} // function - diff --git a/tools/scripts/sctools/fastqpreprocessing/src/htslib_tagsort.h b/tools/scripts/sctools/fastqpreprocessing/src/htslib_tagsort.h deleted file mode 100644 index 9628260e..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/htslib_tagsort.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef __HTSLIB_TAG_SORT__ -#define __HTSLIB_TAG_SORT__ - -/** - * @file htslib_tagsort.h - * @brief Utility functions for input options processing - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ - -#include -#include "input_options.h" -#include "utilities.h" - - -/** - * @brief From the input bam create a list of txt files with the records (lines) - * sorted according to the * tags - * - * @details - * The input bam file is read chunk by chunk, sorted by the tags and the written - * out as a text file in the sorted manner. 
- * - * @param options: INPUT_OPTIONS_TAGSORT the inputs to the program - * @return a vector containing the file paths of the partial files -*/ -std::vector create_sorted_file_splits_htslib(INPUT_OPTIONS_TAGSORT options); - -#endif diff --git a/tools/scripts/sctools/fastqpreprocessing/src/input_options.cpp b/tools/scripts/sctools/fastqpreprocessing/src/input_options.cpp deleted file mode 100644 index 0958cead..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/input_options.cpp +++ /dev/null @@ -1,672 +0,0 @@ -/** - * @file input_options.cpp - * @brief functions for optons and input checking - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ -#include "input_options.h" - -#include -#include -#include -#include -#include -#include - -namespace fs = std::experimental::filesystem; -using std::string; - -int64_t filesize(string const& filename) -{ - FILE* f = fopen(filename.c_str(), "rb"); - - int64_t size = 0; - if (fseek(f, 0, SEEK_END) == 0) - size = ftell(f); - fclose(f); - return size; -} - -void printFileInfo(std::vector const& fastqs, - string const& type) -{ - if (fastqs.size()) - { - std::cout << "INFO " << type << " files:" << std::endl; - for (unsigned int i= 0; i < fastqs.size(); i++) - { - if (fs::exists(fastqs[i].c_str())) - { - std::cout << "\t " << fastqs[i] << " exists, file size " - << filesize(fastqs[i]) << std::endl; - } - else - { - std::cout << "ERROR " << fastqs[i] << " is missing!\n"; - std::cerr << "ERROR " << fastqs[i] << " is missing!\n"; - exit(1); - } - } - } -} - -int64_t get_num_blocks(std::vector const& I1s, - std::vector const& R1s, - std::vector const& R2s, double bam_size) -{ - assert(R1s.size() == R2s.size()); - double tot_size = 0; - for (unsigned int i = 0; i < R1s.size(); i++) - { - assert(I1s.empty() || I1s.size() == R1s.size()); - if (!I1s.empty()) - tot_size += filesize(I1s[i]); - - std::cout << "file " << R1s[i] << " : " << filesize(R1s[i]) << " bytes" << std::endl; - 
tot_size += filesize(R1s[i]); - tot_size += filesize(R2s[i]); - } - - const int GiB = 1024*1024*1024; - return std::ceil((tot_size / GiB) / bam_size); -} - -int64_t get_num_blocks(InputOptionsFastqProcess const& options) -{ - return get_num_blocks(options.I1s, options.R1s, options.R2s, options.bam_size); -} - -int64_t get_num_blocks(INPUT_OPTIONS_FASTQ_READ_STRUCTURE const& options) -{ - return get_num_blocks(options.I1s, options.R1s, options.R2s, options.bam_size); -} - -/** @copydoc readOptionsTagsort */ -INPUT_OPTIONS_TAGSORT readOptionsTagsort(int argc, char** argv) -{ - INPUT_OPTIONS_TAGSORT options; - int c; - int i; - - static struct option long_options[] = - { - /* These options set a flag. */ - {"compute-metric", no_argument, 0, 'm'}, - {"output-sorted-info", no_argument, 0, 'n'}, - /* These options don’t set a flag. - We distinguish them by their indices. */ - {"bam-input", required_argument, 0, 'b'}, - {"gtf-file", required_argument, 0, 'a'}, - {"temp-folder", required_argument, 0, 't'}, - {"sorted-output", required_argument, 0, 'o'}, - {"metric-output", required_argument, 0, 'M'}, - {"alignments-per-thread", required_argument, 0, 'p'}, - {"nthreads", required_argument, 0, 'T'}, - {"barcode-tag", required_argument, 0, 'C'}, - {"umi-tag", required_argument, 0, 'U'}, - {"gene-tag", required_argument, 0, 'G'}, - {"metric-type", required_argument, 0, 'K'}, - {"mitochondrial-gene-names-filename", required_argument, 0, 'g'}, - {0, 0, 0, 0} - }; - - // help messages when the user types -h - const char* help_messages[] = - { - "compute metric, metrics are computed if this option is provided [optional]", - "sorted output file is produced if this option is provided [optional]", - "input bam file [required]", - "gtf file (unzipped) required then metric type is cell [required with metric cell]", - "temp folder for disk sorting [options: default /tmp]", - "sorted output file [optional]", - "metric file, the metrics are output in this file [optional]", - "number of 
alignments per thread [optional: default 1000000], if this number is increased then more RAM is required but reduces the number of file splits", - "number of threads [optional: default 1]", - "barcode-tag the call barcode tag [required]", - "umi-tag the umi tag [required]: the tsv file output is sorted according the tags in the options barcode-tag, umi-tag or gene-tag", - "gene-tag the gene tag [required]", - "metric type, either \"cell\" or \"gene\" [required]", - "file listing gene names, one per line, that the program should care about. [required, may omit if you want mouse or human]" - }; - - - /* getopt_long stores the option index here. */ - int option_index = 0; - int curr_size = 0; - while ((c = getopt_long(argc, argv, - "b:a:t:no:mM:p:T:C:U:G:K:", - long_options, - &option_index)) !=- 1) - { - // process the option or arguments - switch (c) - { - case 'm': - options.compute_metric = true; - break; - case 'n': - options.output_sorted_info = true; - break; - case 0: - /* If this option set a flag, do nothing else now. 
*/ - if (long_options[option_index].flag != 0) - break; - printf("option %s", long_options[option_index].name); - if (optarg) - printf(" with arg %s", optarg); - printf("\n"); - break; - case 'b': - options.bam_input = string(optarg); - break; - case 'a': - options.gtf_file = string(optarg); - break; - case 't': - options.temp_folder = string(optarg); - break; - case 'o': - options.sorted_output_file = string(optarg); - break; - case 'M': - options.metric_output_file = string(optarg); - break; - case 'p': - options.alignments_per_batch = atoi(optarg); - break; - case 'T': - options.nthreads = atoi(optarg); - break; - case 'C': - options.barcode_tag = string(optarg); - curr_size = options.tag_order.size(); - options.tag_order[string(optarg)] = curr_size; - break; - case 'U': - options.umi_tag = string(optarg); - curr_size = options.tag_order.size(); - options.tag_order[string(optarg)] = curr_size; - break; - case 'G': - options.gene_tag = string(optarg); - curr_size = options.tag_order.size(); - options.tag_order[string(optarg)] = curr_size; - break; - case 'K': - options.metric_type = string(optarg); - break; - case 'g': - options.mitochondrial_gene_names_filename = string(optarg); - break; - case '?': - case 'h': - i = 0; - printf("Usage: %s [options] \n", argv[0]); - while (long_options[i].name != 0) - { - printf("\t--%-20s %-25s %-35s\n", long_options[i].name, - long_options[i].has_arg == no_argument? - "no argument" : "required_argument", - help_messages[i]); - i = i + 1; - } - /* getopt_long already printed an error message. 
*/ - exit(0); - break; - default: - abort(); - } - } - - // Check the options - // either metric computation or the sorted tsv file must be produced - if (!options.output_sorted_info && !options.compute_metric) - crash("ERROR: The choice of either the sorted alignment info or metric computation must be specified"); - - if (options.compute_metric && options.metric_output_file.empty()) - crash("ERROR: Must specify --metric-output when specifying --compute-metric"); - - if (options.output_sorted_info && options.sorted_output_file.empty()) - crash("ERROR: Must specify --sorted-output when specifying --output-sorted-info"); - - // metric type must be either of type cell or gene - if (options.metric_type != "cell" && options.metric_type != "gene") - crash("ERROR: --metric-type must either be \"cell\" or \"gene\""); - - // if metric type is cell then the gtf file must be provided - if (options.metric_type == "cell" && options.gtf_file.empty()) - crash("ERROR: The gtf file name must be provided with metric_type \"cell\""); - - // the gtf file should not be gzipped - std::regex reg1(".gz$", std::regex_constants::icase); - if (std::regex_search(options.gtf_file, reg1)) - crash("ERROR: The gtf file must not be gzipped"); - - // bam input file must be there - if (options.bam_input.empty()) - crash("ERROR: Must specify a input file name"); - - // check for input file - if (!fs::exists(options.bam_input.c_str())) - crash("ERROR: bam_input " + options.bam_input + " is missing!"); - - // check for the temp folder - if (!fs::exists(options.temp_folder.c_str())) - crash("ERROR: temp folder " + options.temp_folder + " is missing!"); - - // check for three distinct tags, barcode, umi and gene_id tags - if (options.tag_order.size() != 3) - crash("ERROR: Must have three distinct tags"); - bool seen_tag_index[3] = { false, false, false }; - for (auto [tag, index] : options.tag_order) - { - if (index < 0 || index > 2) - crash("Invalid tag index " + std::to_string(index) + "; must be 0 1 
or 2"); - else - seen_tag_index[index] = true; - } - if (!(seen_tag_index[0] && seen_tag_index[1] && seen_tag_index[2])) - crash("Need tag indices 0 1 and 2"); - - // The size of a set of aligments for in-memory sorting must be positive - if (options.alignments_per_batch < 1000) - crash("ERROR: The number of alignments per thread must be at least 1000"); - - // The number of threads must be between 1 and kMaxTagsortThreads - if (options.nthreads > kMaxTagsortThreads || options.nthreads < 1) - crash("ERROR: The number of threads must be between 1 and " + std::to_string(kMaxTagsortThreads)); - - return options; -} - - -/** @copydoc readOptionsFastqProcess */ -InputOptionsFastqProcess readOptionsFastqProcess(int argc, char** argv) -{ - InputOptionsFastqProcess options; - int c; - int i; - bool verbose_flag = false; - - static struct option long_options[] = - { - /* These options set a flag. */ - {"verbose", no_argument, 0, 'v'}, - /* These options don’t set a flag. - We distinguish them by their indices. */ - {"barcode-length", required_argument, 0, 'b'}, - {"umi-length", required_argument, 0, 'u'}, - {"bam-size", required_argument, 0, 'B'}, - {"sample-id", required_argument, 0, 's'}, - {"I1", required_argument, 0, 'I'}, - {"R1", required_argument, 0, 'R'}, - {"R2", required_argument, 0, 'r'}, - {"white-list", required_argument, 0, 'w'}, - {"output-format", required_argument, 0, 'F'}, - {0, 0, 0, 0} - }; - - // help messages when the user types -h - const char* help_messages[] = - { - "verbose messages ", - "barcode length [required]", - "UMI length [required]", - "output BAM file in GB [optional: default 1 GB]", - "sample id [required]", - "I1 [optional]", - "R1 [required]", - "R2 [required]", - "whitelist (from cellranger) of barcodes [required]", - "output-format : either FASTQ or BAM [required]", - }; - - - /* getopt_long stores the option index here. 
*/ - int option_index = 0; - while ((c = getopt_long(argc, argv, - "b:u:B:s:I:R:r:w:F:v", - long_options, - &option_index)) !=- 1 - ) - { - // process the option or arguments - switch (c) - { - case 'v': - verbose_flag = true; - break; - case 0: - /* If this option set a flag, do nothing else now. */ - if (long_options[option_index].flag != 0) - break; - printf("option %s", long_options[option_index].name); - if (optarg) - printf(" with arg %s", optarg); - printf("\n"); - break; - case 'b': - options.barcode_length = atoi(optarg); - break; - case 'u': - options.umi_length = atoi(optarg); - break; - case 'B': - options.bam_size = atof(optarg); - break; - case 's': - options.sample_id = string(optarg); - break; - case 'I': - options.I1s.push_back(string(optarg)); - break; - case 'R': - options.R1s.push_back(string(optarg)); - break; - case 'r': - options.R2s.push_back(string(optarg)); - break; - case 'w': - options.white_list_file = string(optarg); - break; - case 'F': - options.output_format = string(optarg); - break; - case '?': - case 'h': - i = 0; - printf("Usage: %s [options] \n", argv[0]); - while (long_options[i].name != 0) - { - printf("\t--%-20s %-25s %-35s\n", long_options[i].name, - long_options[i].has_arg == no_argument? - "no argument" : "required_argument", - help_messages[i]); - i = i + 1; - } - /* getopt_long already printed an error message. 
*/ - return options; - default: - abort(); - } - } - - if ((options.R1s.size() != options.R2s.size())) - { - crash("ERROR: Unequal number of R1 and R2 fastq files in input: R1: " + - std::to_string(options.R1s.size()) + ", R2: " + std::to_string(options.R2s.size())); - } - - if (options.R1s.empty()) - crash("ERROR: No R1 file provided"); - - if (options.I1s.size() != options.R1s.size() && !options.I1s.empty()) - crash("ERROR: Must provide as many I1 input files as R1 input files, or else no I1 input files at all."); - - if (options.bam_size <= 0) - crash("ERROR: Size of a bam file (in GB) cannot be negative or 0"); - - if (options.sample_id.empty()) - crash("ERROR: Must provide a sample id or name"); - - if (options.output_format!="FASTQ" && options.output_format!="BAM") - crash("ERROR: output-format must be either FASTQ or BAM"); - - if (options.barcode_length <= 0) - crash("ERROR: Barcode length must be a positive integer"); - - if (options.umi_length <= 0) - crash("ERROR: UMI length must be a positive integer"); - - if (verbose_flag) - { - if (!options.I1s.empty()) - printFileInfo(options.I1s, string("I1")); - if (!options.R1s.empty()) - printFileInfo(options.R1s, string("R1")); - if (!options.R2s.empty()) - printFileInfo(options.R2s, string("R2")); - } - - return options; -} - -INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqSlideseq(int argc, char** argv) -{ - INPUT_OPTIONS_FASTQ_READ_STRUCTURE options; - int c; - int i; - bool verbose_flag = false; - - static struct option long_options[] = - { - /* These options set a flag. */ - {"verbose", no_argument, 0, 'v'}, - /* These options don’t set a flag. - We distinguish them by their indices. 
*/ - {"bam-size", required_argument, 0, 'B'}, - {"read-structure", required_argument, 0, 'S'}, - {"sample-id", required_argument, 0, 's'}, - {"I1", required_argument, 0, 'I'}, - {"R1", required_argument, 0, 'R'}, - {"R2", required_argument, 0, 'r'}, - {"white-list", required_argument, 0, 'w'}, - {"output-format", required_argument, 0, 'F'}, - {0, 0, 0, 0} - }; - - // help messages when the user types -h - const char* help_messages[] = - { - "verbose messages ", - "output BAM file in GB [optional: default 1 GB]", - "read structure [required]", - "sample id [required]", - "I1 [optional]", - "R1 [required]", - "R2 [required]", - "whitelist (from cellranger) of barcodes [required]", - "output-format : either FASTQ or BAM [required]", - }; - - - /* getopt_long stores the option index here. */ - int option_index = 0; - while ((c = getopt_long(argc, argv, - "B:S:s:I:R:r:w:F:v", - long_options, - &option_index)) !=- 1) - { - // process the option or arguments - switch (c) - { - case 'v': - verbose_flag = true; - break; - case 0: - /* If this option set a flag, do nothing else now. */ - if (long_options[option_index].flag != 0) - break; - printf("option %s", long_options[option_index].name); - if (optarg) - printf(" with arg %s", optarg); - printf("\n"); - break; - case 'B': - options.bam_size = atof(optarg); - break; - case 'S': - options.read_structure = string(optarg); - break; - case 's': - options.sample_id = string(optarg); - break; - case 'I': - options.I1s.push_back(string(optarg)); - break; - case 'R': - options.R1s.push_back(string(optarg)); - break; - case 'r': - options.R2s.push_back(string(optarg)); - break; - case 'w': - options.white_list_file = string(optarg); - break; - case 'F': - options.output_format = string(optarg); - break; - case '?': - case 'h': - i = 0; - printf("Usage: %s [options] \n", argv[0]); - while (long_options[i].name != 0) - { - printf("\t--%-20s %-25s %-35s\n", long_options[i].name, - long_options[i].has_arg == no_argument? 
- "no argument" : "required_argument", - help_messages[i]); - i = i + 1; - } - /* getopt_long already printed an error message. */ - return options; - default: - abort(); - } - } - - if ((options.R1s.size() != options.R2s.size())) - { - crash("ERROR: Unequal number of R1 and R2 fastq files in input: R1: " + - std::to_string(options.R1s.size()) + ", R2: " + std::to_string(options.R2s.size())); - } - - if (options.R1s.empty()) - crash("ERROR: No R1 file provided"); - - if (options.I1s.size() != options.R1s.size() && !options.I1s.empty()) - crash("ERROR: Must provide as many I1 input files as R1 input files, or else no I1 input files at all."); - - if (options.bam_size <= 0) - crash("ERROR: Size of a bam file (in GB) cannot be negative or 0"); - - if (options.sample_id.empty()) - crash("ERROR: Must provide a sample id or name"); - - if (options.output_format!="FASTQ" && options.output_format!="BAM") - crash("ERROR: output-format must be either FASTQ or BAM"); - - if (options.read_structure.empty()) - crash("ERROR: Must provide read structures"); - - if (verbose_flag) - { - if (!options.R1s.empty()) - printFileInfo(options.R1s, string("R1")); - if (!options.R2s.empty()) - printFileInfo(options.R2s, string("R2")); - } - - return options; -} - - -INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqMetrics(int argc, char** argv) -{ - INPUT_OPTIONS_FASTQ_READ_STRUCTURE options; - int c; - int i; - bool verbose_flag = false; - - static struct option long_options[] = - { - /* These options set a flag. */ - {"verbose", no_argument, 0, 'v'}, - /* These options don’t set a flag. - We distinguish them by their indices. 
*/ - {"read-structure", required_argument, 0, 'S'}, - {"sample-id", required_argument, 0, 's'}, - {"R1", required_argument, 0, 'R'}, - {"white-list", required_argument, 0, 'w'}, - {0, 0, 0, 0} - }; - - // help messages when the user types -h - const char* help_messages[] = - { - "verbose messages ", - "read structure [required]", - "sample id [required]", - "R1 [required]", - "whitelist of cell/bead barcodes [required]", - }; - - - /* getopt_long stores the option index here. */ - int option_index = 0; - while ((c = getopt_long(argc, argv, - "S:s:R:w:v", - long_options, - &option_index)) !=- 1 - ) - { - // process the option or arguments - switch (c) - { - case 'v': - verbose_flag = 1; - break; - case 0: - /* If this option set a flag, do nothing else now. */ - if (long_options[option_index].flag != 0) - break; - printf("option %s", long_options[option_index].name); - if (optarg) - printf(" with arg %s", optarg); - printf("\n"); - break; - case 'S': - options.read_structure = string(optarg); - break; - case 's': - options.sample_id = string(optarg); - break; - case 'R': - options.R1s.push_back(string(optarg)); - break; - case 'w': - options.white_list_file = string(optarg); - break; - case '?': - case 'h': - i = 0; - printf("Usage: %s [options] \n", argv[0]); - while (long_options[i].name != 0) - { - printf("\t--%-20s %-25s %-35s\n", long_options[i].name, - long_options[i].has_arg == no_argument? - "no argument" : "required_argument", - help_messages[i]); - i = i + 1; - } - /* getopt_long already printed an error message. 
*/ - return options; - default: - abort(); - } - } - - if (options.R1s.empty()) - crash("ERROR: No R1 file provided"); - - if (options.read_structure.empty()) - crash("ERROR: Must provide read structures"); - - if (options.sample_id.empty()) - crash("ERROR: Must provide a sample id or name"); - - if (verbose_flag && !options.R1s.empty()) - printFileInfo(options.R1s, string("R1")); - - return options; -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/input_options.h b/tools/scripts/sctools/fastqpreprocessing/src/input_options.h deleted file mode 100644 index 1379d2c3..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/input_options.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ -#define __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ -/** - * @file input_options.h - * @brief Utility functions for input options processing - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ - -#include "utilities.h" - -#include -#include - -constexpr unsigned int kMaxTagsortThreads = 30; -constexpr unsigned int kDefaultNumAlignsPerThread = 1000000; - -struct INPUT_OPTIONS_FASTQ_READ_STRUCTURE -{ - // I1, R1 and R2 files name - std::vector I1s, R1s, R2s; - - // Bead Barcode list - std::string white_list_file; - - std::string output_format; - - // Bam file size to split by (in GB) - double bam_size = 1.0; - - std::string read_structure; - - std::string sample_id; -}; - - -// Structure to hold input options for fastqprocess -struct InputOptionsFastqProcess -{ - // I1, R1 and R2 files name - std::vector I1s, R1s, R2s; - - // Barcode white list file - std::string white_list_file; - - std::string output_format; - - // chemistry dependent (V2/V3) barcode and UMI length - int barcode_length = -1; - int umi_length = -1; - - // Bam file size to split by (in GB) - double bam_size = 1.0; - - std::string sample_id; -}; - - -// Structure to hold input options for tagsort -struct 
INPUT_OPTIONS_TAGSORT -{ - std::string metric_type; - bool output_sorted_info = false; - bool compute_metric = false; - // name of the bam file - std::string bam_input; - // name of the gtf file - std::string gtf_file; - // temp folder for disk sorting - std::string temp_folder = "/tmp/"; - - std::string metric_output_file; - // sorted tsv output file - std::string sorted_output_file; - - // Size (in number of alignments) of individual chunks to sort in a batch and - // write to a partial file. Approximately 20 million alignments makes 1 GB bam file. - unsigned int alignments_per_batch = kDefaultNumAlignsPerThread; - unsigned int nthreads = 1; - std::string barcode_tag; - std::string umi_tag; - std::string gene_tag; - - // order of the tags to sort by - std::unordered_map tag_order; - - std::string mitochondrial_gene_names_filename; -}; - -InputOptionsFastqProcess readOptionsFastqProcess(int argc, char** argv); - -INPUT_OPTIONS_TAGSORT readOptionsTagsort(int argc, char** argv); - -INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqSlideseq(int argc, char** argv); - -INPUT_OPTIONS_FASTQ_READ_STRUCTURE readOptionsFastqMetrics(int argc, char** argv); - -int64_t get_num_blocks(InputOptionsFastqProcess const& options); -int64_t get_num_blocks(INPUT_OPTIONS_FASTQ_READ_STRUCTURE const& options); - -#endif // __SCTOOLS_FASTQPREPROCESSING_INPUT_OPTIONS_H_ diff --git a/tools/scripts/sctools/fastqpreprocessing/src/metricgatherer.cpp b/tools/scripts/sctools/fastqpreprocessing/src/metricgatherer.cpp deleted file mode 100644 index 102ce293..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/metricgatherer.cpp +++ /dev/null @@ -1,424 +0,0 @@ -/** - * @file metricgatherer.cpp - * @brief functions for file processing - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ -#include "metricgatherer.h" - -template -inline void freeContainer(T& p_container) -{ - T empty; - using std::swap; - swap(p_container, empty); -} - - 
-std::string to_nan(float x) -{ - std::stringstream s; - s << std::setprecision(10) << x; - return x==-1 ? "nan" : s.str(); -} - -void Metrics::clear() -{ - n_reads = 0; - // noise_reads = 0; //# long polymers, N-sequences; NotImplemented - freeContainer(_fragment_histogram); - freeContainer(_molecule_histogram); - - freeContainer(_molecule_barcode_fraction_bases_above_30); - perfect_molecule_barcodes = 0; - freeContainer(_genomic_reads_fraction_bases_quality_above_30); - freeContainer(_genomic_read_quality); - - reads_mapped_exonic = 0; - reads_mapped_intronic = 0; - reads_mapped_utr = 0; - - // alignment uniqueness information - reads_mapped_uniquely = 0; - reads_mapped_multiple = 0; - duplicate_reads = 0; - - // alignment splicing information - spliced_reads = 0; - antisense_reads = 0; - plus_strand_reads = 0; // strand balance - - // higher-order methods, filled in by finalize() when all data is extracted - molecule_barcode_fraction_bases_above_30_mean = -1; - molecule_barcode_fraction_bases_above_30_variance = -1; - genomic_reads_fraction_bases_quality_above_30_mean = -1; - genomic_reads_fraction_bases_quality_above_30_variance = -1; - genomic_read_quality_mean = -1; - genomic_read_quality_variance = -1; - n_molecules = -1; - n_fragments = -1; - reads_per_molecule = -1; - reads_per_fragment = -1; - fragments_per_molecule = -1; - fragments_with_single_read_evidence = -1; - molecules_with_single_read_evidence = -1; -} - - -void Metrics::output_metrics(std::ofstream& fmetric_out) -{ - fmetric_out << std::setprecision(10) << prev_tag << "," - << n_reads << "," - << noise_reads << "," - << perfect_molecule_barcodes << "," - << reads_mapped_exonic << "," - << reads_mapped_intronic << "," - << reads_mapped_utr << "," - << reads_mapped_uniquely << "," - << reads_mapped_multiple << "," - << duplicate_reads << "," - << spliced_reads << "," - << antisense_reads << "," - << molecule_barcode_fraction_bases_above_30_mean << "," - << 
to_nan(molecule_barcode_fraction_bases_above_30_variance) << "," - << genomic_reads_fraction_bases_quality_above_30_mean << "," - << to_nan(genomic_reads_fraction_bases_quality_above_30_variance) << "," - << to_nan(genomic_read_quality_mean) << "," - << to_nan(genomic_read_quality_variance) << "," - << n_molecules << "," - << n_fragments << "," - << reads_per_molecule << "," - << reads_per_fragment << "," - << fragments_per_molecule << "," - << fragments_with_single_read_evidence << "," - << molecules_with_single_read_evidence; -} - - -constexpr unsigned int kOffset = 3; -void Metrics::parse_line(std::string& str, std::ofstream& fmetric_out, - std::unordered_set& mitochondrial_genes, - MetricType metric_type) -{ - char line[1000]; - std::string NONE("None"); - std::size_t len = str.copy(line, str.size(), 0); - line[str.size()]='\0'; - - assert(len < 1000); - - char* c = line; - - unsigned int k = 0; - record[k] = c; - while (*c!='\0') - { - if (*c == '\t') - { - *c='\0'; - record[++k] = c + 1; - } - c++; - } - - assert(k==16); - - std::string current_tag = record[0]; - - // ignore the None gene - if (current_tag.compare(NONE)==0) - return; - - // load the tags - std::string first_tag, second_tag, third_tag; - first_tag = record[0]; - second_tag = record[1]; - third_tag = record[2]; - if (metric_type == MetricType::Gene && first_tag.find(",")!=std::string::npos) - return; - - std::string tags = first_tag + std::string("-") + second_tag + std::string("-") + third_tag; - if (prev_tag.compare(current_tag)!=0 && prev_tag.size()!=0) - { - finalize(mitochondrial_genes); - output_metrics(fmetric_out); - output_metrics_extra(fmetric_out); - clear(); - } - - parse_extra_fields(first_tag, second_tag, third_tag, record); - - n_reads += 1; - - // the tags passed to this function define a molecule, this increments the counter, - // identifying a new molecule only if a new tag combination is observed - - /* updating the molecule histogram with tags */ - if 
(_molecule_histogram.find(tags)==_molecule_histogram.end()) - _molecule_histogram[tags] = 0; - _molecule_histogram[tags] += 1; - - _molecule_barcode_fraction_bases_above_30.update(std::stof(record[kOffset + 13])); - perfect_molecule_barcodes += std::stoi(record[kOffset + 9]); - - _genomic_reads_fraction_bases_quality_above_30.update(std::stof(record[kOffset + 7])); - _genomic_read_quality.update(std::stof(record[kOffset + 6])); - - // the remaining portions deal with aligned reads, so if the read is not mapped, we are - // done with it - if (std::string(record[kOffset + 0]).compare("*")==0) - return; - - // get components that define a unique sequence fragment and increment the histogram - std::string position_str = record[kOffset + 2]; - std::string strand = std::stoi(std::string(record[kOffset + 3]))==1 ? "true" : "false"; - std::string reference = record[kOffset + 0]; - - std::string _ref_pos_str_tags = reference + std::string("\t") + - position_str + std::string("\t") + - strand + std::string("\t") + tags; - std::string ref_pos_str_tags = std::to_string(std::hash {}(_ref_pos_str_tags)); - - /* updating the fragment histogram with tag, strand and pos */ - if (_fragment_histogram.find(ref_pos_str_tags)==_fragment_histogram.end()) - _fragment_histogram[ref_pos_str_tags] = 0; - _fragment_histogram[ref_pos_str_tags] += 1; - - std::string alignment_location = std::string(record[kOffset + 1]); - if (alignment_location == "CODING") - reads_mapped_exonic += 1; - else if (alignment_location == "INTRONIC") - reads_mapped_intronic += 1; - else if (alignment_location == "UTR") - reads_mapped_utr += 1; - - // in futher check if read maps outside window (when we add a gene model) - // and create distances from terminate side (needs gene model) uniqueness - int number_mappings = std::stoi(std::string(record[kOffset + 8])); - - if (number_mappings==1) - reads_mapped_uniquely += 1; - else - reads_mapped_multiple += 1; // without multi-mapping, this number is zero! 
- - duplicate_reads += std::stoi(std::string(record[kOffset + 11])); - - // cigar N field (3) indicates a read is spliced if the value is non-zero - spliced_reads += std::stoi(std::string(record[kOffset + 10])); - - prev_tag = current_tag; -} - - -// Calculate metrics that require information from all molecules of an entity -// ``finalize()`` replaces attributes in-place that were initialized by the constructor as -// ``None`` with a value calculated across all molecule data that has been aggregated. - -void Metrics::finalize(std::unordered_set& mitochondrial_genes) -{ - molecule_barcode_fraction_bases_above_30_mean = - _molecule_barcode_fraction_bases_above_30.getMean(); - - molecule_barcode_fraction_bases_above_30_variance = - _molecule_barcode_fraction_bases_above_30.calculate_variance(); - - genomic_reads_fraction_bases_quality_above_30_mean = - _genomic_reads_fraction_bases_quality_above_30.getMean(); - - genomic_reads_fraction_bases_quality_above_30_variance = - _genomic_reads_fraction_bases_quality_above_30.calculate_variance(); - - genomic_read_quality_mean = _genomic_read_quality.getMean(); - - genomic_read_quality_variance = _genomic_read_quality.calculate_variance(); - - n_molecules = _molecule_histogram.size(); - - n_fragments = _fragment_histogram.size(); - - reads_per_molecule = -1; // float("nan") - if (n_molecules != 0) - reads_per_molecule = n_reads / n_molecules; - - reads_per_fragment = -1; //float("nan") - if (n_fragments != 0) - reads_per_fragment = n_reads / n_fragments; - - fragments_per_molecule = -1; // float("nan") - if (n_molecules != 0) - fragments_per_molecule = n_fragments / n_molecules; - - fragments_with_single_read_evidence = 0; - for (auto const& [key, val] : _fragment_histogram) - if (val == 1) - fragments_with_single_read_evidence++; - - molecules_with_single_read_evidence = 0; - for (auto const& [key, val] : _molecule_histogram) - if (val == 1) - molecules_with_single_read_evidence++; -} - -//////////////// CellMetrics 
//////////////////////// -std::string CellMetrics::getHeader() -{ - std::string s; - for (int i=0; i<24; i++) - s += std::string(",") + common_headers[i]; // TODO ok to start with ,? - for (int i=0; i<11; i++) - s += std::string(",") + cell_specific_headers[i]; - return s; -} - -// Parses a record to extract gene-specific information -void CellMetrics::parse_extra_fields(const std::string& first_tag, - const std::string& second_tag, - const std::string& third_tag, - char** record) -{ - _cell_barcode_fraction_bases_above_30.update(std::stof(record[kOffset + 5])); - perfect_cell_barcodes += std::stoi(record[kOffset + 12]); - - std::string record_str(record[kOffset + 1]); - if (!record_str.empty()) // TODO can the empty check be skipped, or is there a non-empty non-unmapped case? - { - if (record_str == "INTERGENIC") - reads_mapped_intergenic += 1; - } - else - reads_unmapped += 1; - - /* updating the genes histogram with tags */ - if (_genes_histogram.find(third_tag) == _genes_histogram.end()) - _genes_histogram[third_tag] = 0; - _genes_histogram[third_tag] += 1; -} - -void CellMetrics::output_metrics_extra(std::ofstream& fmetric_out) -{ - fmetric_out << std::setprecision(10) - << "," << perfect_cell_barcodes - << "," << reads_mapped_intergenic - << "," << reads_unmapped - << "," << reads_mapped_too_many_loci - << std::setprecision(10) - << "," << to_nan(cell_barcode_fraction_bases_above_30_variance) - << "," << cell_barcode_fraction_bases_above_30_mean - << "," << n_genes - << "," << genes_detected_multiple_observations - << "," << n_mitochondrial_genes - << "," << n_mitochondrial_molecules - << "," << pct_mitochondrial_molecules - << std::endl; -} - -void CellMetrics::finalize(std::unordered_set& mitochondrial_genes) -{ - // call the finalize function in the parent class - Metrics::finalize(mitochondrial_genes); - - cell_barcode_fraction_bases_above_30_mean = - _cell_barcode_fraction_bases_above_30.getMean(); - - cell_barcode_fraction_bases_above_30_variance = - 
_cell_barcode_fraction_bases_above_30.calculate_variance(); - - n_genes = _genes_histogram.size(); - - genes_detected_multiple_observations = 0; - n_mitochondrial_genes = 0; - n_mitochondrial_molecules = 0; - for (auto const& [gene, count] : _genes_histogram) - { - if (count > 1) - genes_detected_multiple_observations++; - if (mitochondrial_genes.find(gene) != mitochondrial_genes.end()) - { - n_mitochondrial_genes++; - n_mitochondrial_molecules += count; - } - } - - if (n_mitochondrial_molecules > 0) - { - int tot_molecules = 0; - for (auto const& [gene, count] : _genes_histogram) - tot_molecules += count; - - // TODO BUG associativity and integer division combine to make this always 0 - pct_mitochondrial_molecules = (n_mitochondrial_molecules/tot_molecules * 100.0); - } - else - pct_mitochondrial_molecules = 0.0; -} - -void CellMetrics::clear() -{ - // call the clear function in the parent class - Metrics::clear(); - - _cell_barcode_fraction_bases_above_30.clear(); - perfect_cell_barcodes = 0; - reads_mapped_intergenic = 0; - reads_unmapped = 0; - reads_mapped_too_many_loci = 0; - _genes_histogram.clear(); - - n_genes = 0; - genes_detected_multiple_observations = 0; - n_mitochondrial_genes = 0; - n_mitochondrial_molecules = 0; - pct_mitochondrial_molecules = 0; -} - - -//////////////// GeneMetrics //////////////////////// -std::string GeneMetrics::getHeader() -{ - std::string s; - for (int i=0; i<24; i++) - s += std::string(",") + common_headers[i]; // TODO ok to start with ,? 
- for (int i=0; i<2; i++) - s += std::string(",") + gene_specific_headers[i]; - return s; -} - -void GeneMetrics::parse_extra_fields(const std::string& first_tag, - const std::string& second_tag, - const std::string& third_tag, - char** record) -{ - // updating the cell histogram with tags - if (_cells_histogram.find(second_tag)==_cells_histogram.end()) - _cells_histogram[second_tag] = 0; - - _cells_histogram[second_tag] += 1; -} - -void GeneMetrics::output_metrics_extra(std::ofstream& fmetric_out) -{ - fmetric_out << "," << number_cells_detected_multiple - << "," << number_cells_expressing - << std::endl; -} - -void GeneMetrics::finalize(std::unordered_set& mitochondrial_genes) -{ - // call the finalize function in the parent class - Metrics::finalize(mitochondrial_genes); - - number_cells_expressing = _cells_histogram.size(); - number_cells_detected_multiple = 0; - for (auto const& [cell, count] : _cells_histogram) - if (count > 1) - number_cells_detected_multiple++; -} - -void GeneMetrics::clear() -{ - // call the clear function in the parent class - Metrics::clear(); - number_cells_detected_multiple = 0; - number_cells_expressing = 0; - freeContainer(_cells_histogram); -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/metricgatherer.h b/tools/scripts/sctools/fastqpreprocessing/src/metricgatherer.h deleted file mode 100644 index 59f0af81..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/metricgatherer.h +++ /dev/null @@ -1,274 +0,0 @@ -#ifndef __METRIC_GATHERER__ -#define __METRIC_GATHERER__ -/** - * @file metricgatherer.h - * @brief functions for file processing - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -enum class MetricType { Cell, Gene }; - -/* - Methods - ------- - update(new_value: float) - incorporate new_value into the online estimate of mean and variance - getMean() - 
return the mean value - calculate_variance() - calculate and return the variance - mean_and_variance() - return both mean and variance -*/ -class OnlineGaussianSufficientStatistic -{ -private: - double _mean_squared_error = 0.0; - double sum_EX2 = 0.0; - double _mean = 0.0; - double _sum = 0.0; - double _count = 0.0; - -public: - void update(double new_value) - { - _count += 1.0; - _sum += new_value; - sum_EX2 += (new_value*new_value); - } - - // return the mean value - double getMean() - { - _mean = _sum/_count; - return _mean; - } - - // calculate and return the variance - double calculate_variance() - { - if (_count < 2) - return -1.0; - return sum_EX2 / (_count - 1) - (_sum/_count) * (_sum / (_count - 1)); - } - - void clear() - { - _mean_squared_error = 0.0; - _mean = 0.0; - _count = 0; - _sum = 0; - sum_EX2 = 0.0; - } -}; - -class Metrics -{ -private: - // count information - int n_reads = 0; - const int noise_reads = 0; //# long polymers, N-sequences; NotImplemented - - std::unordered_map _fragment_histogram; - std::unordered_map _molecule_histogram; - - // molecule information - OnlineGaussianSufficientStatistic _molecule_barcode_fraction_bases_above_30; - - int perfect_molecule_barcodes = 0; - - OnlineGaussianSufficientStatistic _genomic_reads_fraction_bases_quality_above_30; - - OnlineGaussianSufficientStatistic _genomic_read_quality; - - // alignment location information - int reads_mapped_exonic = 0; - int reads_mapped_intronic = 0; - int reads_mapped_utr = 0; - - // in future we can implement this when we have a gene model - // self.reads_mapped_outside_window = 0 # reads should be within 1000 bases of UTR - // self._read_distance_from_termination_site = OnlineGaussianSufficientStatistic() - - // alignment uniqueness information - int reads_mapped_uniquely = 0; - int reads_mapped_multiple = 0; - int duplicate_reads = 0; - - // alignment splicing information - int spliced_reads = 0; - int antisense_reads = 0; - int plus_strand_reads = 0; // strand 
balance - - // higher-order methods, filled in by finalize() when all data is extracted - float molecule_barcode_fraction_bases_above_30_mean = -1; - float molecule_barcode_fraction_bases_above_30_variance = -1; - float genomic_reads_fraction_bases_quality_above_30_mean = -1; - float genomic_reads_fraction_bases_quality_above_30_variance = -1; - float genomic_read_quality_mean = -1; - float genomic_read_quality_variance = -1; - float n_molecules = -1; - float n_fragments = -1; - float reads_per_molecule = -1; - float reads_per_fragment = -1; - float fragments_per_molecule = -1; - int fragments_with_single_read_evidence = -1; - int molecules_with_single_read_evidence = -1; - - // TODO separate these 2 out from the above, all of which gets clear()d - std::string prev_tag; - char* record[20]; - -protected: - std::string common_headers[24] = - { - "n_reads", - "noise_reads", - "perfect_molecule_barcodes", - "reads_mapped_exonic", - "reads_mapped_intronic", - "reads_mapped_utr", - "reads_mapped_uniquely", - "reads_mapped_multiple", - "duplicate_reads", - "spliced_reads", - "antisense_reads", - "molecule_barcode_fraction_bases_above_30_mean", - "molecule_barcode_fraction_bases_above_30_variance", - "genomic_reads_fraction_bases_quality_above_30_mean", - "genomic_reads_fraction_bases_quality_above_30_variance", - "genomic_read_quality_mean", - "genomic_read_quality_variance", - "n_molecules", - "n_fragments", - "reads_per_molecule", - "reads_per_fragment", - "fragments_per_molecule", - "fragments_with_single_read_evidence", - "molecules_with_single_read_evidence" - }; - - -public: - virtual ~Metrics() {} - // get the headers - virtual std::string getHeader() = 0; - - void parse_line(std::string& str, std::ofstream& fmetric_out, - std::unordered_set& mitochondrial_genes, - MetricType metric_type); - - void output_metrics(std::ofstream& fmetric_out); - virtual void output_metrics_extra(std::ofstream& fmetric_out) = 0; - virtual void parse_extra_fields(const std::string& 
first_tag, - const std::string& second_tag, - const std::string& third_tag, - char** record) = 0; - virtual void finalize(std::unordered_set& mitochondrial_genes); - virtual void clear(); -}; - -class CellMetrics: public Metrics -{ -private: - int perfect_cell_barcodes; // The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``) - int reads_mapped_intergenic; // The number of reads mapped to an intergenic region for this cell - - // reads unmapped - int reads_unmapped; - // The number of reads that were mapped to too many loci across the genome and as a - // consequence, are reported unmapped by the aligner - int reads_mapped_too_many_loci; - - // The variance of the fraction of Illumina base calls for the cell barcode sequence that - // are greater than 30, across molecules - float cell_barcode_fraction_bases_above_30_variance; - - // The average fraction of Illumina base calls for the cell barcode sequence that - // are greater than 30, across molecules - float cell_barcode_fraction_bases_above_30_mean; - - int n_genes; //The number of genes detected by this cell - - int genes_detected_multiple_observations; // The number of genes that are observed by more than one read in this cell - int n_mitochondrial_genes; // The number of mitochondrial genes detected by this cell - int n_mitochondrial_molecules; // The number of molecules from mitochondrial genes detected for this cell - int pct_mitochondrial_molecules; // The percentage of molecules from mitoc - - OnlineGaussianSufficientStatistic _cell_barcode_fraction_bases_above_30; - std::unordered_map _genes_histogram; - - std::string cell_specific_headers[11] = - { - "perfect_cell_barcodes", - "reads_mapped_intergenic", - "reads_unmapped", - "reads_mapped_too_many_loci", - "cell_barcode_fraction_bases_above_30_variance", - "cell_barcode_fraction_bases_above_30_mean", - "n_genes", - "genes_detected_multiple_observations", - "n_mitochondrial_genes", - "n_mitochondrial_molecules", - 
"pct_mitochondrial_molecules" - }; - -public: - std::string getHeader() override; - void output_metrics_extra(std::ofstream& fmetric_out) override; - void parse_extra_fields(const std::string& first_tag, - const std::string& second_tag, - const std::string& third_tag, - char** record) override; - - void finalize(std::unordered_set& mitochondrial_genes); - - void clear(); -}; - - -class GeneMetrics: public Metrics -{ -private: - int number_cells_detected_multiple; - int number_cells_expressing; - - std::unordered_map _cells_histogram; - std::string gene_specific_headers[2] = - { - "number_cells_detected_multiple", - "number_cells_expressing" - }; - -public: - GeneMetrics() - { - number_cells_detected_multiple = 0; - number_cells_expressing = 0; - } - -public: - std::string getHeader() override; - void output_metrics_extra(std::ofstream& fmetric_out) override; - void parse_extra_fields(std::string const& first_tag, - std::string const& second_tag, - std::string const& third_tag, - char** record) override; - - void finalize(std::unordered_set& mitochondrial_genes); - void clear(); -}; - -#endif diff --git a/tools/scripts/sctools/fastqpreprocessing/src/samplefastq.cpp b/tools/scripts/sctools/fastqpreprocessing/src/samplefastq.cpp deleted file mode 100644 index 810a6ae2..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/samplefastq.cpp +++ /dev/null @@ -1,105 +0,0 @@ -#include "fastq_common.h" -#include "input_options.h" -#include - -std::vector> parseReadStructure(std::string const& read_structure) -{ - std::vector> ret; - int next_ind = 0; - while (next_ind < read_structure.size()) - { - int type_ind = read_structure.find_first_not_of("0123456789", next_ind); - assert(type_ind != std::string::npos); - char type = read_structure[type_ind]; - int len = std::stoi(read_structure.substr(next_ind, type_ind - next_ind)); - ret.emplace_back(type, len); - next_ind = type_ind + 1; - } - return ret; -} - -std::vector> g_parsed_read_structure; - -void 
fillSamRecordWithReadStructure(SamRecord* sam, FastQFile* fastQFileI1, - FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list) -{ - // check the sequence names matching - std::string a = std::string(fastQFileR1->myRawSequence.c_str()); - std::string b = std::string(fastQFileR1->myQualityString.c_str()); - // extract the raw barcode and UMI 8C18X6C9M1X and raw barcode and UMI quality string - - std::string barcode_seq, barcode_quality, umi_seq, umi_quality; - int cur_ind = 0; - for (auto [tag, length] : g_parsed_read_structure) - { - switch (tag) - { - case 'C': - barcode_seq += a.substr(cur_ind, length); - barcode_quality += b.substr(cur_ind, length); - break; - case 'M': - umi_seq += a.substr(cur_ind, length); - umi_quality += b.substr(cur_ind, length); - break; - default: - break; - } - cur_ind += length; - } - fillSamRecordCommon(sam, fastQFileI1, fastQFileR1, fastQFileR2, has_I1_file_list, - barcode_seq, barcode_quality, umi_seq, umi_quality); -} - -std::string slideseqBarcodeGetter(SamRecord* sam, FastQFile* fastQFileI1, - FastQFile* fastQFileR1, FastQFile* fastQFileR2, - bool has_I1_file_list) -{ - return std::string(sam->getString("CR").c_str()); -} - -void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index) -{ - cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index)); -} - - -int main(int argc, char** argv) -{ - INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqSlideseq(argc, argv); - // number of output bam files, and one writer thread per bam file - int num_writer_threads = get_num_blocks(options); - - std::ofstream outfile_r1("sampled_down.R1"); - if (!outfile_r1) - crash("Failed to open output file sampled_down.R1"); - std::ofstream outfile_r2("sampled_down.R2"); - if (!outfile_r2) - crash("Failed to open output file sampled_down.R2"); - - g_parsed_read_structure = parseReadStructure(options.read_structure); - mainCommon(options.white_list_file, /*num_writer_threads=*/1, 
options.output_format, - options.I1s, options.R1s, options.R2s, options.sample_id, - fillSamRecordWithReadStructure, slideseqBarcodeGetter, - [&outfile_r1, &outfile_r2](WriteQueue* ignored1, SamRecord* sam, int reader_thread_index) - { - if (sam->getStringTag("CB")) - { - // Assumed read structure of 8C18X6C9M1X with a fixed spacer sequence - const char* barcode = sam->getString("CR").c_str(); - const char* quality_score = sam->getString("CY").c_str(); - outfile_r1 << "@" << sam->getReadName() << "\n" - << std::string_view(barcode, 8) << "CTTCAGCGTTCCCGAGAG" << std::string_view(barcode+8, 6) << sam->getString("UR") <<"T\n" - << "+\n" - << std::string_view(quality_score, 8)<<"FFFFFFFFFFFFFFFFFF" << std::string_view(quality_score+8, 6) << sam->getString("UY") <<"F"<< "\n"; - - outfile_r2 << "@" << sam->getReadName() << "\n" - << sam->getSequence() << "\n" - << "+\n" - << sam->getQuality() << "\n"; - } - releaseReaderThreadMemory(reader_thread_index,sam); - }); - return 0; -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/tagsort.cpp b/tools/scripts/sctools/fastqpreprocessing/src/tagsort.cpp deleted file mode 100644 index 59567a74..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/tagsort.cpp +++ /dev/null @@ -1,491 +0,0 @@ -/** - * @file tagsort.cpp - * @brief functions for file processing - * @author Kishori M. 
Konwar - * @date 2021-08-11 - ***********************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "htslib_tagsort.h" -#include "metricgatherer.h" - -constexpr int kDataBufferSize = 1000; - -struct Context -{ - std::vector> data; - - std::vector file_offset; - std::vector data_size; - std::vector ptrs; - std::vector isempty; - int index_ = -1; - int num_active_files = 0; - const int num_parts_; - - Context(unsigned int num_parts) : num_parts_(num_parts) - { - // set the file offsets to 0 - for (int i=0; i < num_parts_; i++) - file_offset.push_back(0); - - // set the isempty for each file to false - for (int i=0; i < num_parts_; i++) - isempty.push_back(false); - - // set a vector of vectors of data for each file - for (int i=0; i < num_parts_; i++) - data.push_back(std::vector()); - - // set the data_size of the buffer for each file to 0 - for (int i=0; i < num_parts_; i++) - data_size.push_back(0); - - // set the pointer to f each buffer to kDataBufferSize - for (int i=0; i < num_parts_; i++) - ptrs.push_back(kDataBufferSize); - } - - void print_status() - { - std::cout << "Contx status " << std::endl; - for (int i=0; i < num_parts_; i++) - { - index_ = i; - std::cout << "\t" << index_ << "\t" << data[index_].size() << "\t" - << data_size[index_] << "\t" << ptrs[index_] << std::endl; - } - } - - void clear() - { - data_size.clear(); - ptrs.clear(); - isempty.clear(); - } -}; - -using QUEUETUPLE = std::tuple; - -inline std::string ltrim(std::string& s) -{ - auto it = find_if_not(s.begin(), s.end(), [](int c) { return isspace(c); }); - s.erase(s.begin(), it); - return s; -} - -// remove the " (quotes) from the beginning and end of the string -// (TODO and the middle; hopefully nobody is trying to use escaped quotes). 
-std::string removeQuotes(std::string& s) -{ - s.erase(std::remove_if(s.begin(), s.end(), [](unsigned char c) - { - return c=='\"'; - }), s.end()); - return s; -} - -std::vector splitStringToFields(std::string const& str, char delim) -{ - std::stringstream splitter(str); - std::vector ret; - for (std::string field; std::getline(splitter, field, delim); ) - ret.push_back(field); - return ret; -} - -class MitochondrialGeneSelector -{ -public: - MitochondrialGeneSelector(std::string const& mitochondrial_gene_names_filename) - { - if (mitochondrial_gene_names_filename.empty()) - { - default_old_behavior_ = true; - return; - } - - std::ifstream input_file(mitochondrial_gene_names_filename); - if (!input_file) - { - crash("ERROR failed to open the mitochondrial gene names file named: " + - mitochondrial_gene_names_filename); - } - for (std::string line; std::getline(input_file, line);) - { - if (line.empty() || line[0] == '#') // skip comment lines - continue; - mito_genes_.insert(line); - } - } - - bool interestedInGeneName(std::string const& gene_name) - { - if (default_old_behavior_) - return std::regex_search(gene_name, std::regex("^mt-", std::regex_constants::icase)); - else - return mito_genes_.find(gene_name) != mito_genes_.end(); - } - -private: - bool default_old_behavior_ = false; - std::unordered_set mito_genes_; -}; - -// TODO function is named "get gene names", and there is something in there called -// "gene name", but it instead returns a set of "gene id"s. correct? -// -// The file at gtf_filename should be unzipped. 
-std::unordered_set get_mitochondrial_gene_names( - std::string const& gtf_filename, std::string const& mitochondrial_gene_names_filename) -{ - std::unordered_set mitochondrial_gene_ids; - - MitochondrialGeneSelector gene_selector(mitochondrial_gene_names_filename); - - std::ifstream input_file(gtf_filename); - if (!input_file) - crash("ERROR failed to open the GTF file named: " + gtf_filename); - - for (std::string line; std::getline(input_file, line);) - { - if (line.empty() || line[0] == '#') // skip comment lines - continue; - - std::vector tabbed_fields = splitStringToFields(line, '\t'); - assert(tabbed_fields.size() > 8); - if (tabbed_fields[2] != "gene") // skip the line unless it is a gene - continue; - // split the semicolon-separated attributes field - std::vector attribs = splitStringToFields(tabbed_fields[8], ';'); - - std::string gene_name; - std::string gene_id; - // now examine each of the attribute name-value pairs - for (std::string attrib : attribs) - { - // each attribute is a space-separated key-value pair - std::vector key_and_val = splitStringToFields(ltrim(attrib), ' '); - if (key_and_val.size() != 2) - crash("Expected 2 fields, found " + std::to_string(key_and_val.size()) + " fields"); - - // the second element in the pair is the value string - std::string& key = key_and_val[0]; - std::string value = removeQuotes(key_and_val[1]); - - if (key == "gene_id") - gene_id = value; - if (key == "gene_name") - gene_name = value; - } - if (gene_name.empty()) - { - crash("Malformed GTF file detected. Record is of type gene but does not " - "have a gene_name in line:\n" + line); - } - - if (gene_selector.interestedInGeneName(gene_name)) - mitochondrial_gene_ids.insert(gene_id); // TODO what if gene_id is empty? 
- } - std::cout << "Number of mitochondrial genes found " << mitochondrial_gene_ids.size() << std::endl; - return mitochondrial_gene_ids; -} - - -/* - * @brief fills the buffer for the files - * - * @param contx is the context of the file - * @return int number of alignments processed -*/ -int fill_buffer(Context& contx, std::vector const& partial_files) -{ - contx.data[contx.index_].clear(); - int k = 0; - int filling_counter = 0; - - std::ifstream input_file(partial_files[contx.index_]); - if (!input_file) - crash("ERROR failed to open the file " + partial_files[contx.index_]); - - input_file.seekg(contx.file_offset[contx.index_]); - - // the order of the loop condition is iportant first make sure if you can accomodate then try to read, - // otherwise it might create a read but never processed - for (std::string line; k < kDataBufferSize && std::getline(input_file, line); k++) - { - contx.data[contx.index_].push_back(line); - filling_counter++; - } - assert(contx.data[contx.index_].size() <= kDataBufferSize); - - contx.file_offset[contx.index_] = input_file.tellg(); - - contx.data_size[contx.index_] = contx.data[contx.index_].size(); - - if (contx.data_size[contx.index_] != 0) - { - contx.ptrs[contx.index_] = 0; - contx.isempty[contx.index_] = false; - } - else - { - contx.ptrs[contx.index_] = kDataBufferSize; - contx.isempty[contx.index_] = true; - } - -#ifdef DEBUG - std::cout << "-->" << std::endl; - for (int m = 0; m < contx.num_parts_; m++) - std::cout << "\t" << m << " : " << contx.data_size[m] << " : " << contx.ptrs[m] << std::endl; -#endif - - return filling_counter; -} - -// TODO if after other refactoring this ends up being the only regex use, then -// probably would be worth switching away from regex here. -// From e.g. 
"A\tB\tC\tD\tE", extract "A\tB\tC" -std::string extractCompTag(std::string& s) -{ - const std::regex rgx("\t"); - const std::sregex_token_iterator end; - std::sregex_token_iterator iter(s.begin(), s.end(), rgx, -1); - std::stringstream comp_tag; - for (auto k = 0; k < 3 && iter != end; ++iter, k++) - { - if (k > 0) - comp_tag << "\t"; - comp_tag << *iter; - } - return comp_tag.str(); -} - -// returns number of alignments processed -int mergeSortedPartialFiles(INPUT_OPTIONS_TAGSORT const& options, - std::vector const& partial_files) -{ - const std::string& sorted_output_file = options.sorted_output_file; - const std::string& metric_type = options.metric_type; - const std::string& metric_output_file = options.metric_output_file; - int filling_counter = 0; - - std::unordered_set mitochondrial_genes; - if (!options.gtf_file.empty()) - { - mitochondrial_genes = get_mitochondrial_gene_names( - options.gtf_file, options.mitochondrial_gene_names_filename); - } - - // input the buffer size and partial files - Context contx(partial_files.size()); - auto cmp = [](const QUEUETUPLE &a, const QUEUETUPLE &b) - { - return std::get<0>(a) > std::get<0>(b); - }; - std::priority_queue, decltype(cmp) > heap(cmp); - - for (int i=0; i < contx.num_parts_; i++) - { - contx.index_ = i; - filling_counter += fill_buffer(contx, partial_files); - } - - // create the heap from the first batch loaded data - contx.num_active_files = 0; - for (int i=0; i< contx.num_parts_; i++) - { - contx.index_ = i; - if (contx.ptrs[i] != kDataBufferSize) - { - heap.push(QUEUETUPLE(extractCompTag(contx.data[i][contx.ptrs[i]]), i, contx.ptrs[i])); - contx.ptrs[i]++; - contx.num_active_files += 1; - } - } - - // now merge by pop an push - std::ofstream fout; - if (options.compute_metric) // TODO i think this is a mistake, and should actually be options.output_sorted_info - fout.open(sorted_output_file); - - // pop and push from the heap - int num_alignments = 0; - int i, j; - - Metrics* metric_gatherer = nullptr; - 
MetricType metric_type_enum = MetricType::Cell; - if (metric_type.compare("cell")==0) - { - metric_gatherer = new CellMetrics; - metric_type_enum = MetricType::Cell; - } - else if (metric_type.compare("gene")==0) - { - metric_gatherer = new GeneMetrics; - metric_type_enum = MetricType::Gene; - } - else - crash("Expected metric_type 'cell' or 'gene', got: " + metric_type); - - metric_gatherer->clear(); - - std::ofstream fmetric_out; - if (options.compute_metric) - { - fmetric_out.open(metric_output_file.c_str()); - fmetric_out << metric_gatherer->getHeader() << std::endl; - } - - // TODO just write directly to fout... don't think the 'binary' is significant, - // but not 100% sure - std::stringstream str(std::stringstream::out | std::stringstream::binary); - std::string prev_comp_tag = ""; - while (!heap.empty()) - { - // read the top - QUEUETUPLE qtuple = heap.top(); - std::string curr_comp_tag = std::get<0>(qtuple); - assert(prev_comp_tag.compare(curr_comp_tag) <= 0); - -#ifdef DEBUG - contx.print_status(); - if (prev_comp_tag.compare(curr_comp_tag) <= 0) - std::cout << "Expected " << prev_comp_tag << "\n\t\t" << curr_comp_tag << std::endl; - else - crash("Anomaly " + prev_comp_tag + "\n\t\t" + curr_comp_tag); -#endif - i = std::get<1>(qtuple); //buffer no - j = std::get<2>(qtuple); //the pointer into the ith buffer array - - heap.pop(); - - // start writing in chunks from the stream buffer - if (num_alignments%kDataBufferSize==0) - { - if (options.output_sorted_info) - { - fout.write(str.str().c_str(), str.str().length()); - str.clear(); - str.str(""); - } - } - - // load into stream buffer - std::string field = contx.data[i][j]; - if (options.output_sorted_info) - str << field << std::endl; - - if (options.compute_metric) - metric_gatherer->parse_line(field, fmetric_out, mitochondrial_genes, metric_type_enum); - num_alignments += 1; - - // if ismpty is true means the file has been fully read - if (!contx.isempty[i] && contx.ptrs[i] == contx.data_size[i]) - { - 
contx.index_ = i; - filling_counter += fill_buffer(contx, partial_files); - } - - // make sure it is not empty - if (contx.data_size[i] > 0) - { - heap.push(QUEUETUPLE(extractCompTag(contx.data[i][contx.ptrs[i]]), i, contx.ptrs[i])); - contx.ptrs[i]++; - } - else // one more file is fully read - contx.num_active_files -= 1; - - if (num_alignments % 1000000 == 0) - std::cout << "num alns read " << num_alignments << std::endl; - - prev_comp_tag = curr_comp_tag; - } - - // process the final line - metric_gatherer->finalize(mitochondrial_genes); - metric_gatherer->output_metrics(fmetric_out); - metric_gatherer->output_metrics_extra(fmetric_out); - delete metric_gatherer; - - // close the metric file - if (options.compute_metric) - fmetric_out.close(); - - // write out the remaining data - if (options.output_sorted_info) - { - fout.write(str.str().c_str(), str.str().length()); - str.str(""); - str.clear(); - } - - // close output files as there is no more to write - if (options.output_sorted_info) - fout.close(); - - std::cout << "Written "<< num_alignments << " alignments in total" << std::endl; - contx.clear(); - return filling_counter; -} - -void warnIfNo_mitochondrial_gene_names_filename(INPUT_OPTIONS_TAGSORT const& options) -{ - if (options.mitochondrial_gene_names_filename.empty()) - { - std::string msg = -"*** WARNING! You did not specify --mitochondrial_gene_names_filename.\n" -"Therefore, we fell back to selecting only genes beginning with 'mt-' (case\n" -"insensitive). Please write a list of all gene names you're interested in into\n" -"a file, and pass the filename with --mitochondrial_gene_names_filename."; - std::cout << msg << std::endl; - std::cerr << msg << std::endl; - } -} - -/* Flag set by ‘--verbose’. 
*/ -int main(int argc, char** argv) -{ - INPUT_OPTIONS_TAGSORT options = readOptionsTagsort(argc, argv); - warnIfNo_mitochondrial_gene_names_filename(options); - - std::cout << "bam input " << options.bam_input << std::endl; - std::cout << "temp folder " << options.temp_folder << std::endl; - std::cout << "sorted output file " << options.sorted_output_file << std::endl; - std::cout << "metric output file " << options.metric_output_file << std::endl; - std::cout << "temp folder " << options.alignments_per_batch << std::endl; - std::cout << "tags:" << std::endl; - - for (auto const& [tag, tag_order_num] : options.tag_order) - std::cout << "\t" << tag << "\t" << tag_order_num << std::endl; - - /* first create a list of sorted, and simplified sorted files */ - std::vector partial_files = create_sorted_file_splits_htslib(options); - - /* now merge the sorted files to create one giant sorted file by using - a head to compare the values based on the tags used */ - std::cout << "Merging " << partial_files.size() << " sorted files!"<< std::endl; - - int filling_counter = mergeSortedPartialFiles(options, partial_files); - - // we no longer need the partial files - for (unsigned int i=0; i < partial_files.size(); i++) - if (remove(partial_files[i].c_str()) != 0) - std::cerr << "Warning: error deleting file " << partial_files[i] << std::endl; - - partial_files.clear(); - std::cout << "Aligments " << filling_counter << " loaded to buffer " << std::endl; - - warnIfNo_mitochondrial_gene_names_filename(options); - return 0; -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/utilities.cpp b/tools/scripts/sctools/fastqpreprocessing/src/utilities.cpp deleted file mode 100644 index 965191ef..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/utilities.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/** - * @file utilities.cpp - * @brief Utility functions for file processing - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ - 
-#include "utilities.h" - -#include -#include - -/** @copydoc readWhiteList */ -WhiteListData readWhiteList(std::string const& white_list_file) -{ - const char ATCG[] = {'A', 'C', 'G', 'T', 'N'}; - - std::ifstream file(white_list_file); - if (!file.is_open()) - crash("Couldn't open whitelist file " + white_list_file); - - WhiteListData white_list_data; - int k = 0; - // read data from file object and put it into string. - for (std::string tp; getline(file, tp); ) - { - white_list_data.barcodes.push_back(tp); - - for (unsigned int i=0; i < tp.size(); i++) - { - for (int j=0; j < 5; j++) - { - char c = tp[i]; - tp[i] = ATCG[j]; - // If the mutation we're writing is already present, we just overwrite - // what was there with the current. - // This is done to have the same values for corrected barcodes - // as in the python implementation. - white_list_data.mutations[tp] = k; - tp[i] = c; - } - } - - // -1 suggests it is already a whitelisted barcode - // This is used, instead of the actual index, because when - // the barcode is seen with -1 then no correction is necessary. - // Avoids lots of map lookups, as most barcodes are not erroneous. 
- white_list_data.mutations[tp] = -1; - k++; - } - - return white_list_data; -} - - -/** @copydoc crashWithPerror */ -void crashWithPerror(std::string msg) -{ - perror(msg.c_str()); - exit(1); -} - -void crash(std::string msg) -{ - std::cout << msg << std::endl; - std::cerr << msg << std::endl; - exit(1); -} diff --git a/tools/scripts/sctools/fastqpreprocessing/src/utilities.h b/tools/scripts/sctools/fastqpreprocessing/src/utilities.h deleted file mode 100644 index b3272d8c..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/src/utilities.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __OPTIMUS_UTILITES__ -#define __OPTIMUS_UTILITES__ - -/** - * @file utilities.h - * @brief Utility functions for file processing - * @author Kishori Konwar - * @date 2021-08-11 - ***********************************************/ - -#include -#include -#include - -// structure for correcting the barcodes -struct WhiteListData -{ - // an unordered map from whitelist barcodes and 1-mutations - // to the index of the correct barcode - std::unordered_map mutations; - // vector of whitelist barcodes - std::vector barcodes; -}; - -/** - * @brief Build barcode correction map white list barcodes & mutations - * - * @details - * A barcode is computed by checking if it is either in the white - * list or 1-mutation away from any white listed barcode. To check - * whether a barcode is correct or to correct it, if 1-mutation away from - * a barcode in the white list, we build a - * a map is created with the barcodes and the 1-mutation. 
The keys are - * barcodes or mutation and the values are index of the crrect barcode - * - * @param whilte_list_file white list file from 10x genomics' cellranger - * @return a stricture containing the barcode/1-mutation barcode to index - * of the correct barcode -*/ -WhiteListData readWhiteList(std::string const& white_list_file); - -/** - * @brief Print system error and exit - * - * @param msg error string to print -*/ -void crashWithPerror(std::string msg); - -void crash(std::string msg); - -#endif diff --git a/tools/scripts/sctools/fastqpreprocessing/utils/big-run.sh b/tools/scripts/sctools/fastqpreprocessing/utils/big-run.sh deleted file mode 100755 index 82a1e3ec..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/utils/big-run.sh +++ /dev/null @@ -1 +0,0 @@ -./fastqproc ../../L8TX/L8TX_180221_01_F12_R1.fastq.gz ../../L8TX/L8TX_180221_01_F12_I1.fastq.gz ../../L8TX/L8TX_180221_01_F12_R2.fastq.gz ../../L8TX/L8TX_171026_01_F03_R1.fastq.gz ../../L8TX/L8TX_171026_01_F03_I1.fastq.gz ../../L8TX/L8TX_171026_01_F03_R2.fastq.gz diff --git a/tools/scripts/sctools/fastqpreprocessing/utils/check_barcode_partition.py b/tools/scripts/sctools/fastqpreprocessing/utils/check_barcode_partition.py deleted file mode 100644 index 572205b9..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/utils/check_barcode_partition.py +++ /dev/null @@ -1,39 +0,0 @@ -import pysam -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument("--bam", nargs="+", dest="bams", help="BAM files") - - -def check_disjoint_cbs(): - global parser - opts = parser.parse_args() - barcodes = {} - tot_alignments = 0 - - for bam in opts.bams: - print("reading " + bam) - barcodes[bam] = {} - with pysam.AlignmentFile(bam, "rb", check_sq=False) as input_alignments: - for alignment in input_alignments: - tot_alignments += 1 - if alignment.has_tag("CB"): - barcodes[bam][alignment.get_tag("CB")] = True - - for bam in opts.bams: - print("checking " + bam) - files = set(opts.bams) - otherbams 
= files.difference(set([bam])) - for cb in barcodes[bam].keys(): - for obam in otherbams: - if cb in barcodes[obam]: - print("not a partition") - return - - print("total alignments : ", tot_alignments) - print("is a partition") - return - - -if __name__ == "__main__": - check_disjoint_cbs() diff --git a/tools/scripts/sctools/fastqpreprocessing/utils/create_fastq.sh b/tools/scripts/sctools/fastqpreprocessing/utils/create_fastq.sh deleted file mode 100755 index 92f00cc9..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/utils/create_fastq.sh +++ /dev/null @@ -1,10 +0,0 @@ -zcat ../../L8TX/L8TX_180221_01_F12_R2.fastq.gz | head -n 4000000 > a_R1.fastq - -gzip a_R1.fastq - -cp a_R1.fastq.gz b_R2.fastq.gz -cp a_R1.fastq.gz b_I1.fastq.gz -cp a_R1.fastq.gz b_R1.fastq.gz - -cp a_R1.fastq.gz a_R2.fastq.gz -cp a_R1.fastq.gz a_I1.fastq.gz diff --git a/tools/scripts/sctools/fastqpreprocessing/utils/example-run.sh b/tools/scripts/sctools/fastqpreprocessing/utils/example-run.sh deleted file mode 100755 index 7006b138..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/utils/example-run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -./fastqprocess --verbose \ - --bam-size 0.001 \ - --barcode-length 16 \ - --umi-length 10 \ - --sample-id L8TX \ - --white-list ../../../data/L8TX/737K-august-2016.txt \ - --I1 ../../../data/L8TX/A_I1.fastq.gz \ - --R1 ../../../data/L8TX/A_R1.fastq.gz \ - --R2 ../../../data/L8TX/A_R2.fastq.gz \ - --I1 ../../../data/L8TX/B_I1.fastq.gz \ - --R1 ../../../data/L8TX/B_R1.fastq.gz \ - --R2 ../../../data/L8TX/B_R2.fastq.gz \ diff --git a/tools/scripts/sctools/fastqpreprocessing/utils/run.sh b/tools/scripts/sctools/fastqpreprocessing/utils/run.sh deleted file mode 100755 index bb61f611..00000000 --- a/tools/scripts/sctools/fastqpreprocessing/utils/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - - -# --tool=memcheck \ -# --leak-check=full \ -# --log-file=valgrind-out.txt \ - -valgrind \ - --tool=massif \ - --time-unit=B \ - ./fastqproc 
a_R1.fastq.gz a_I1.fastq.gz a_R2.fastq.gz \ - b_R1.fastq.gz b_I1.fastq.gz b_R2.fastq.gz diff --git a/tools/scripts/sctools/pull_request_template.md b/tools/scripts/sctools/pull_request_template.md deleted file mode 100644 index 14f16fb2..00000000 --- a/tools/scripts/sctools/pull_request_template.md +++ /dev/null @@ -1,14 +0,0 @@ -### Purpose - - -- No issue is linked to this PR. - -### Changes - - -- No changes. - -### Review Instructions - - -- No instructions. diff --git a/tools/scripts/sctools/readthedocs.yml b/tools/scripts/sctools/readthedocs.yml deleted file mode 100644 index be483081..00000000 --- a/tools/scripts/sctools/readthedocs.yml +++ /dev/null @@ -1,10 +0,0 @@ -# .readthedocs.yml - -build: - image: latest - -python: - version: 3.6 - use_system_site_packages: false # Set to true will let the virtualenv use the pre-installed packages such as numpy, which is not what we want - setup_py_install: false - pip_install: true diff --git a/tools/scripts/sctools/requirements.txt b/tools/scripts/sctools/requirements.txt deleted file mode 100644 index 499fc1c9..00000000 --- a/tools/scripts/sctools/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -crimson==0.5.2 -pandas==0.25.3 -pysam==0.16.0.1 -pytest-cov==2.10.1 -pytest==5.1.1 -scipy==1.5.2 -black==19.3b0 -flake8==3.7.7 -gffutils==0.9 -numpy==1.19.1 -requests==2.20.0 -setuptools==40.4.3 -setuptools_scm==3.1.0 -h5py==2.10.0 -tables==3.4.4 \ No newline at end of file diff --git a/tools/scripts/sctools/security.txt b/tools/scripts/sctools/security.txt deleted file mode 100644 index 2893b57a..00000000 --- a/tools/scripts/sctools/security.txt +++ /dev/null @@ -1,4 +0,0 @@ -If you'd like to report a security issue please contact us. 
- -Contact: security-leads@data.humancellatlas.org - diff --git a/tools/scripts/sctools/setup.py b/tools/scripts/sctools/setup.py deleted file mode 100644 index 3195f55a..00000000 --- a/tools/scripts/sctools/setup.py +++ /dev/null @@ -1,61 +0,0 @@ -from setuptools import setup - -CLASSIFIERS = [ - "Development Status :: 4 - Beta", - "Natural Language :: English", - "License :: OSI Approved :: BSD License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.6", - "Topic :: Scientific/Engineering :: Bio-Informatics", -] - -setup( - name="sctools", - use_scm_version=True, - setup_requires=["setuptools_scm"], - description="Utilities for large-scale distributed single cell" + "data processing", - url="https://github.com/humancellatlas/sctools.git", - author="Ambrose J. Carr", - author_email="mail@ambrosejcarr.com", - package_dir={"": "src"}, - packages=["sctools", "sctools/test", "sctools/metrics"], - install_requires=[ - "gffutils", - "numpy", - "pandas", - "pysam", - "pytest", - "pytest-cov", - "sphinx", - "sphinxcontrib-websupport", - "sphinx_rtd_theme", - "setuptools_scm>=3.1.0", - "setuptools>=40.4.3", - "scipy>=1.0.0", - "crimson>=0.3.0", - ], - entry_points={ - "console_scripts": [ - "AttachBarcodes = sctools.platform:BarcodePlatform." + "attach_barcodes", - "Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes", - "SplitBam = sctools.platform:GenericPlatform.split_bam", - "CalculateGeneMetrics = sctools.platform:GenericPlatform." - + "calculate_gene_metrics", - "CalculateCellMetrics = sctools.platform:GenericPlatform." - + "calculate_cell_metrics", - "MergeGeneMetrics = sctools.platform:GenericPlatform." - + "merge_gene_metrics", - "MergeCellMetrics = sctools.platform:GenericPlatform." - + "merge_cell_metrics", - "CreateCountMatrix = sctools.platform:GenericPlatform." - + "bam_to_count_matrix", - "MergeCountMatrices = sctools.platform:GenericPlatform." 
- + "merge_count_matrices", - "TagSortBam = sctools.platform:GenericPlatform.tag_sort_bam", - "VerifyBamSort = sctools.platform:GenericPlatform.verify_bam_sort", - "GroupQCs = sctools.platform:GenericPlatform.group_qc_outputs", - ] - }, - classifiers=CLASSIFIERS, - include_package_data=True, -) diff --git a/tools/scripts/sctools/src/sctools.egg-info/PKG-INFO b/tools/scripts/sctools/src/sctools.egg-info/PKG-INFO deleted file mode 100644 index a9ff8c25..00000000 --- a/tools/scripts/sctools/src/sctools.egg-info/PKG-INFO +++ /dev/null @@ -1,14 +0,0 @@ -Metadata-Version: 2.1 -Name: sctools -Version: 0.4.1.dev33+g1f28a47 -Summary: Utilities for large-scale distributed single celldata processing -Home-page: https://github.com/humancellatlas/sctools.git -Author: Ambrose J. Carr -Author-email: mail@ambrosejcarr.com -Classifier: Development Status :: 4 - Beta -Classifier: Natural Language :: English -Classifier: License :: OSI Approved :: BSD License -Classifier: Operating System :: OS Independent -Classifier: Programming Language :: Python :: 3.6 -Classifier: Topic :: Scientific/Engineering :: Bio-Informatics -License-File: LICENSE diff --git a/tools/scripts/sctools/src/sctools.egg-info/SOURCES.txt b/tools/scripts/sctools/src/sctools.egg-info/SOURCES.txt deleted file mode 100644 index 3890c5ae..00000000 --- a/tools/scripts/sctools/src/sctools.egg-info/SOURCES.txt +++ /dev/null @@ -1,132 +0,0 @@ -.dockerignore -.flake8 -.gitignore -.pre-commit-config.yaml -Dockerfile -LICENSE -MANIFEST.in -README.rst -docker_build.sh -pull_request_template.md -readthedocs.yml -requirements.txt -security.txt -setup.py -.circleci/config.yml -docs/README.md -docs/source/Makefile -docs/source/conf.py -docs/source/index.rst -docs/source/readme.rst -docs/source/sctools.metrics.rst -docs/source/sctools.rst -docs/source/sctools.test.rst -fastqpreprocessing/.gitignore -fastqpreprocessing/Makefile -fastqpreprocessing/patches/BgzfFileType.cpp.patch -fastqpreprocessing/patches/FastQFile.cpp.patch 
-fastqpreprocessing/patches/Makefile.patch -fastqpreprocessing/patches/general.Makefile.patch -fastqpreprocessing/src/example-run.sh -fastqpreprocessing/src/fastq_common.cpp -fastqpreprocessing/src/fastq_common.h -fastqpreprocessing/src/fastq_metrics.cpp -fastqpreprocessing/src/fastq_metrics.h -fastqpreprocessing/src/fastq_slideseq.cpp -fastqpreprocessing/src/fastqprocess.cpp -fastqpreprocessing/src/htslib_tagsort.cpp -fastqpreprocessing/src/htslib_tagsort.h -fastqpreprocessing/src/input_options.cpp -fastqpreprocessing/src/input_options.h -fastqpreprocessing/src/metricgatherer.cpp -fastqpreprocessing/src/metricgatherer.h -fastqpreprocessing/src/samplefastq.cpp -fastqpreprocessing/src/tagsort.cpp -fastqpreprocessing/src/utilities.cpp -fastqpreprocessing/src/utilities.h -fastqpreprocessing/utils/big-run.sh -fastqpreprocessing/utils/check_barcode_partition.py -fastqpreprocessing/utils/create_fastq.sh -fastqpreprocessing/utils/example-run.sh -fastqpreprocessing/utils/run.sh -src/sctools/__init__.py -src/sctools/bam.py -src/sctools/barcode.py -src/sctools/consts.py -src/sctools/count.py -src/sctools/encodings.py -src/sctools/fastq.py -src/sctools/groups.py -src/sctools/gtf.py -src/sctools/platform.py -src/sctools/reader.py -src/sctools/stats.py -src/sctools.egg-info/PKG-INFO -src/sctools.egg-info/SOURCES.txt -src/sctools.egg-info/dependency_links.txt -src/sctools.egg-info/entry_points.txt -src/sctools.egg-info/requires.txt -src/sctools.egg-info/top_level.txt -src/sctools/metrics/README.md -src/sctools/metrics/__init__.py -src/sctools/metrics/aggregator.py -src/sctools/metrics/gatherer.py -src/sctools/metrics/merge.py -src/sctools/metrics/writer.py -src/sctools/test/__init__.py -src/sctools/test/characterize-cell-testing-data.ipynb -src/sctools/test/characterize-gene-testing-data.ipynb -src/sctools/test/test_bam.py -src/sctools/test/test_barcode.py -src/sctools/test/test_count.py -src/sctools/test/test_encodings.py -src/sctools/test/test_entrypoints.py 
-src/sctools/test/test_fastq.py -src/sctools/test/test_groups.py -src/sctools/test/test_gtf.py -src/sctools/test/test_metrics.py -src/sctools/test/test_platform.py -src/sctools/test/test_stats.py -src/sctools/test/data/1k-august-2016.txt -src/sctools/test/data/cell-gene-umi-queryname-sorted.bam -src/sctools/test/data/cell-sorted-missing-cb.bam -src/sctools/test/data/cell-sorted.bam -src/sctools/test/data/cell_metrics_missing_cb.csv.gz -src/sctools/test/data/chr1.30k_records.gtf.gz -src/sctools/test/data/small-cell-sorted.bam -src/sctools/test/data/small-gene-sorted.bam -src/sctools/test/data/test.bam -src/sctools/test/data/test.gtf -src/sctools/test/data/test.gtf.bz2 -src/sctools/test/data/test.gtf.gz -src/sctools/test/data/test.sam -src/sctools/test/data/test_i7.fastq -src/sctools/test/data/test_i7.fastq.bz2 -src/sctools/test/data/test_i7.fastq.gz -src/sctools/test/data/test_r1.fastq -src/sctools/test/data/test_r1.fastq.bz2 -src/sctools/test/data/test_r1.fastq.gz -src/sctools/test/data/test_r2.bam -src/sctools/test/data/test_r2.fastq -src/sctools/test/data/test_r2.fastq.bz2 -src/sctools/test/data/test_r2.fastq.gz -src/sctools/test/data/test_r2_tagged.bam -src/sctools/test/data/unsorted.bam -src/sctools/test/data/group_metrics/expected_picard_group.csv -src/sctools/test/data/group_metrics/test_hisat2.csv -src/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log -src/sctools/test/data/group_metrics/test_hisat2_trans.csv -src/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log -src/sctools/test/data/group_metrics/test_picard_group.csv -src/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt -src/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt -src/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt -src/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt -src/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt -src/sctools/test/data/group_metrics/test_qc.rna_metrics.txt 
-src/sctools/test/data/group_metrics/test_rsem.cnt -src/sctools/test/data/group_metrics/test_rsem.csv -src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt -src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt -src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt -src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt \ No newline at end of file diff --git a/tools/scripts/sctools/src/sctools.egg-info/dependency_links.txt b/tools/scripts/sctools/src/sctools.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/tools/scripts/sctools/src/sctools.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tools/scripts/sctools/src/sctools.egg-info/entry_points.txt b/tools/scripts/sctools/src/sctools.egg-info/entry_points.txt deleted file mode 100644 index 9bb8c4ee..00000000 --- a/tools/scripts/sctools/src/sctools.egg-info/entry_points.txt +++ /dev/null @@ -1,13 +0,0 @@ -[console_scripts] -Attach10xBarcodes = sctools.platform:TenXV2.attach_barcodes -AttachBarcodes = sctools.platform:BarcodePlatform.attach_barcodes -CalculateCellMetrics = sctools.platform:GenericPlatform.calculate_cell_metrics -CalculateGeneMetrics = sctools.platform:GenericPlatform.calculate_gene_metrics -CreateCountMatrix = sctools.platform:GenericPlatform.bam_to_count_matrix -GroupQCs = sctools.platform:GenericPlatform.group_qc_outputs -MergeCellMetrics = sctools.platform:GenericPlatform.merge_cell_metrics -MergeCountMatrices = sctools.platform:GenericPlatform.merge_count_matrices -MergeGeneMetrics = sctools.platform:GenericPlatform.merge_gene_metrics -SplitBam = sctools.platform:GenericPlatform.split_bam -TagSortBam = sctools.platform:GenericPlatform.tag_sort_bam -VerifyBamSort = sctools.platform:GenericPlatform.verify_bam_sort diff --git a/tools/scripts/sctools/src/sctools.egg-info/requires.txt 
b/tools/scripts/sctools/src/sctools.egg-info/requires.txt deleted file mode 100644 index db00b00b..00000000 --- a/tools/scripts/sctools/src/sctools.egg-info/requires.txt +++ /dev/null @@ -1,13 +0,0 @@ -gffutils -numpy -pandas -pysam -pytest -pytest-cov -sphinx -sphinxcontrib-websupport -sphinx_rtd_theme -setuptools_scm>=3.1.0 -setuptools>=40.4.3 -scipy>=1.0.0 -crimson>=0.3.0 diff --git a/tools/scripts/sctools/src/sctools.egg-info/top_level.txt b/tools/scripts/sctools/src/sctools.egg-info/top_level.txt deleted file mode 100644 index 445d7700..00000000 --- a/tools/scripts/sctools/src/sctools.egg-info/top_level.txt +++ /dev/null @@ -1,3 +0,0 @@ -sctools -sctools/metrics -sctools/test diff --git a/tools/scripts/sctools/src/sctools/__init__.py b/tools/scripts/sctools/src/sctools/__init__.py deleted file mode 100644 index 1fec1fb4..00000000 --- a/tools/scripts/sctools/src/sctools/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# flake8: noqa -from . import bam -from . import encodings -from . import barcode -from . import fastq -from . import gtf -from . import stats -from . import reader -from . import metrics -from . import platform -from . import consts -from . import groups -from pkg_resources import get_distribution, DistributionNotFound - - -try: - __version__ = get_distribution(__name__).version -except DistributionNotFound: - pass diff --git a/tools/scripts/sctools/src/sctools/bam.py b/tools/scripts/sctools/src/sctools/bam.py deleted file mode 100644 index d8477386..00000000 --- a/tools/scripts/sctools/src/sctools/bam.py +++ /dev/null @@ -1,728 +0,0 @@ -""" -Tools for Manipulating SAM/BAM format files -=========================================== - -.. 
currentmodule:: sctools - -This module provides functions and classes to subsample reads from bam files that correspond to -specific chromosomes, split bam files into chunks, assign tags to bam files from paired fastq -records, and iterate over sorted bam files by one or more tags - -This module makes heavy use of the pysam wrapper for HTSlib, a high-performance c-library designed -to manipulate sam files - -Methods -------- -iter_tag_groups function to iterate over reads by an arbitrary tag -iter_cell_barcodes wrapper for iter_tag_groups that iterates over cell barcode tags -iter_genes wrapper for iter_tag_groups that iterates over gene tags -iter_molecules wrapper for iter_tag_groups that iterates over molecule tags -sort_by_tags_and_queryname sort bam by given list of zero or more tags, followed by query name -verify_sort verifies whether bam is correctly sorted by given list of tags, then query name - -Classes -------- -SubsetAlignments class to extract reads specific to requested chromosome(s) -Tagger class to add tags to sam/bam records from paired fastq records -AlignmentSortOrder abstract class to represent alignment sort orders -QueryNameSortOrder alignment sort order by query name -TagSortableRecord class to facilitate sorting of pysam.AlignedSegments -SortError error raised when sorting is incorrect - -References ----------- -htslib : https://github.com/samtools/htslib - -""" - -import functools -from functools import partial, reduce -import math -import os -import warnings -from abc import abstractmethod -from typing import ( - Iterator, - Iterable, - Generator, - List, - Set, - Dict, - Union, - Tuple, - Callable, - Any, - Optional, -) - -import pysam -import shutil -import multiprocessing -import uuid - -from . 
import consts - -# File descriptor to write log messages to -STDERR = 2 - - -class SubsetAlignments: - """Wrapper for pysam/htslib that extracts reads corresponding to requested chromosome(s) - - Parameters - ---------- - alignment_file : str - sam or bam file - open_mode : {'r', 'rb', None}, optional - open mode for pysam.AlignmentFile. 'r' indicates a sam file, 'rb' indicates a bam file, - and None attempts to autodetect based on the file suffix (Default = None) - - Methods - ------- - indices_by_chromosome - returns indices to line numbers containing the requested number of reads for a specified - chromosome - - Notes - ----- - samtools is a good general-purpose tool for that is capable of most subsampling tasks. It is a - good idea to check the samtools documentation when approaching these types of tasks. - - References - ---------- - samtools documentation : http://www.htslib.org/doc/samtools.html - - """ - - def __init__(self, alignment_file: str, open_mode: str = None): - if open_mode is None: - if alignment_file.endswith(".bam"): - open_mode = "rb" - elif alignment_file.endswith(".sam"): - open_mode = "r" - else: - raise ValueError( - f"Could not autodetect file type for alignment_file {alignment_file} (detectable suffixes: " - f".sam, .bam)" - ) - self._file: str = alignment_file - self._open_mode: str = open_mode - - def indices_by_chromosome( - self, n_specific: int, chromosome: str, include_other: int = 0 - ) -> Union[List[int], Tuple[List[int], List[int]]]: - """Return the list of first `n_specific` indices of reads aligned to `chromosome`. - - Parameters - ---------- - n_specific : int - Number of aligned reads to return indices for - chromosome : str - Only reads from this chromosome are considered valid - include_other : int, optional - The number of reads to include that are NOT aligned to chromosome. These can be aligned - or unaligned reads (default = 0). 
- - Returns - ------- - chromosome_indices : List[int] - list of indices to reads aligning to `chromosome` - other_indices : List[int], optional - list of indices to reads NOT aligning to chromosome, only returned if include_other is - not 0. - - """ - - # acceptable chromosomes - valid_chromosomes = [str(i) for i in range(1, 23)] + ["M", "MT", "X", "Y"] - valid_chromosomes.extend(["chr" + v for v in valid_chromosomes]) - - # check chromosome - if isinstance(chromosome, int) and chromosome < 23: - chromosome = str(chromosome) # try to convert - if chromosome not in valid_chromosomes: - warnings.warn( - "chromsome %s not in list of expected chromosomes: %r" - % (chromosome, valid_chromosomes) - ) - - with pysam.AlignmentFile(self._file, self._open_mode) as fin: - chromosome = str(chromosome) - chromosome_indices = [] - other_indices = [] - - for i, record in enumerate(fin): - - if not record.is_unmapped: # record is mapped - if chromosome == record.reference_name: - if len(chromosome_indices) < n_specific: - chromosome_indices.append(i) - elif len(other_indices) < include_other: - other_indices.append(i) - elif len(other_indices) < include_other: # record is not mapped - other_indices.append(i) - - # check termination condition (we have the requisite number of reads - if ( - len(chromosome_indices) == n_specific - and len(other_indices) == include_other - ): - break - - if len(chromosome_indices) < n_specific or len(other_indices) < include_other: - warnings.warn( - "Only %d unaligned and %d reads aligned to chromosome %s were found in" - "%s" - % (len(other_indices), len(chromosome_indices), chromosome, self._file) - ) - - if include_other != 0: - return chromosome_indices, other_indices - else: - return chromosome_indices - - -class Tagger: - """Add tags to a bam file from tag generators. - - Parameters - ---------- - bam_file : str - Bam file that tags are to be added to. 
- - Methods - ------- - tag - tag bam records given tag_generators (often generated from paired bam or fastq files) - # todo this should probably be wrapped up in __init__ to make this more function-like - """ - - def __init__(self, bam_file: str) -> None: - if not isinstance(bam_file, str): - raise TypeError( - f'The argument "bam_file" must be of type str, not {type(bam_file)}' - ) - self.bam_file = bam_file - - # todo add type to tag_generators (make sure it doesn't introduce import issues - def tag(self, output_bam_name: str, tag_generators) -> None: - """Add tags to bam_file. - - Given a bam file and tag generators derived from files sharing the same sort order, - adds tags to the .bam file, and writes the resulting file to output_bam_name. - - Parameters - ---------- - output_bam_name : str - Name of output tagged bam. - tag_generators : List[fastq.TagGenerator] - list of generators that yield fastq.Tag objects - - """ - with pysam.AlignmentFile( - self.bam_file, "rb", check_sq=False - ) as inbam, pysam.AlignmentFile( - output_bam_name, "wb", template=inbam - ) as outbam: - - # zip up all the iterators - for *tag_sets, sam_record in zip(*tag_generators, inbam): - for tag_set in tag_sets: - for tag in tag_set: - sam_record.set_tag(*tag) - outbam.write(sam_record) - - -def get_barcodes_from_bam( - in_bam: str, tags: List[str], raise_missing: bool -) -> Set[str]: - """Get all the distinct barcodes from a bam - - :param in_bam: str - Input bam file. - :param tags: List[str] - Tags in the bam that might contain barcodes. - :param raise_missing: bool - Raise an error if no barcodes can be found. 
- :return: set - A set of barcodes found in the bam - This set will not contain a None value - """ - barcodes = set() - # Get all the Barcodes from the BAM - with pysam.AlignmentFile(in_bam, "rb", check_sq=False) as input_alignments: - for alignment in input_alignments: - barcode = get_barcode_for_alignment(alignment, tags, raise_missing) - # If no provided tag was found on the record that had a non-null value - if barcode is not None: - barcodes.add(barcode) - return barcodes - - -def get_barcode_for_alignment( - alignment: pysam.AlignedSegment, tags: List[str], raise_missing: bool -) -> str: - """ Get the barcode for an Alignment - - :param alignment: pysam.AlignedSegment - An Alignment from pysam. - :param tags: List[str] - Tags in the bam that might contain barcodes. If multiple Tags are passed, will - return the contents of the first tag that contains a barcode. - :param raise_missing: bool - Raise an error if no barcodes can be found. - :return: str - A barcode for the alignment, or None if one is not found and raise_missing is False. - """ - alignment_barcode = None - for tag in tags: - # The non-existent barcode should be the exceptional case, so try/except is faster than if/else - try: - alignment_barcode = alignment.get_tag(tag) - break # Got the key, don't bother getting the next tag - except KeyError: - continue # Try to get the next tag - - if raise_missing and alignment_barcode is None: - raise RuntimeError( - "Alignment encountered that is missing {} tag(s).".format(tags) - ) - - return alignment_barcode - - -def write_barcodes_to_bins( - in_bam: str, tags: List[str], barcodes_to_bins: Dict[str, int], raise_missing: bool -) -> List[str]: - """ Write barcodes to appropriate bins as defined by barcodes_to_bins - - :param in_bam: str - The bam file to read. - :param tags: List[str] - Tags in the bam that might contain barcodes. - :param barcodes_to_bins: Dict[str, int] - A Dict from barcode to bin. 
All barcodes of the same type need to be written to the same bin. - These numbered bins are merged after parallelization so that all alignments with the same - barcode are in the same bam. - :param raise_missing: bool - Raise an error if no barcodes can be found. - :return: A list of paths to the written bins. - """ - # Create all the output files - with pysam.AlignmentFile(in_bam, "rb", check_sq=False) as input_alignments: - - # We need a random int appended to the dirname to make sure input bams with the same name don't clash - dirname = ( - os.path.splitext(os.path.basename(in_bam))[0] + "_" + str(uuid.uuid4()) - ) - os.makedirs(dirname) - - files = [] - bins = list(set(barcodes_to_bins.values())) - filepaths = [] - # barcode_to_bins is a dict of barcodes to ints. The ints are contiguous and are used as indices - # in the files array. The files array is an array of open file handles to write to. - for i in range(len(bins)): - out_bam_name = os.path.join(f"{dirname}", f"{dirname}_{i}.bam") - filepaths.append(out_bam_name) - - open_bam = pysam.AlignmentFile(out_bam_name, "w", template=input_alignments) - files.append(open_bam) - - # Loop over input; check each tag in priority order and partition barcodes into files based - # on the highest priority tag that is identified - for alignment in input_alignments: - barcode = get_barcode_for_alignment(alignment, tags, raise_missing) - if barcode is not None: - # Find or set the file associated with the tag and write the record to the correct file - out_file = files[barcodes_to_bins[barcode]] - out_file.write(alignment) - - for file in files: - file.close() - - return filepaths - - -def merge_bams(bams: List[str]) -> str: - """ Merge input bams using samtools. - - This cannot be a local function within `split` because then Python "cannot pickle a local object". - :param bams: Name of the final bam + bams to merge. - Because of how its called using multiprocessing, the bam basename is the first element of the list. 
- :return: The output bam name. - """ - bam_name = os.path.realpath(bams[0] + ".bam") - bams_to_merge = bams[1:] - pysam.merge("-c", "-p", bam_name, *bams_to_merge) - return bam_name - - -def split( - in_bams: List[str], - out_prefix: str, - tags: List[str], - approx_mb_per_split: float = 1000, - raise_missing: bool = True, - num_processes: int = None, -) -> List[str]: - """split `in_bam` by tag into files of `approx_mb_per_split` - - Parameters - ---------- - in_bams : str - Input bam files. - out_prefix : str - Prefix for all output files; output will be named as prefix_n where n is an integer equal - to the chunk number. - tags : List[str] - The bam tags to split on. The tags are checked in order, and sorting is done based on the - first identified tag. Further tags are only checked if the first tag is missing. This is - useful in cases where sorting is executed over a corrected barcode, but some records only - have a raw barcode. - approx_mb_per_split : float - The target file size for each chunk in mb - raise_missing : bool, optional - if True, raise a RuntimeError if a record is encountered without a tag. Else silently - discard the record (default = True) - num_processes : int, optional - The number of processes to parallelize over. If not set, will use all available processes. 
- - Returns - ------- - output_filenames : List[str] - list of filenames of bam chunks - - Raises - ------ - ValueError - when `tags` is empty - RuntimeError - when `raise_missing` is true and any passed read contains no `tags` - - """ - - if len(tags) == 0: - raise ValueError("At least one tag must be passed") - - if num_processes is None: - num_processes = multiprocessing.cpu_count() - - # find correct number of subfiles to spawn - bam_mb = sum(os.path.getsize(b) * 1e-6 for b in in_bams) - n_subfiles = int(math.ceil(bam_mb / approx_mb_per_split)) - if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN: - warnings.warn( - f"Number of requested subfiles ({n_subfiles}) exceeds " - f"{consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN}; this may cause OS errors by exceeding fid limits" - ) - if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE: - raise ValueError( - f"Number of requested subfiles ({n_subfiles}) exceeds " - f"{consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE}; this will usually cause OS errors, " - f"think about increasing max_mb_per_split." 
- ) - - full_pool = multiprocessing.Pool(num_processes) - - # Get all the barcodes over all the bams - os.write(STDERR, b"Retrieving barcodes from bams\n") - result = full_pool.map( - partial(get_barcodes_from_bam, tags=tags, raise_missing=raise_missing), in_bams - ) - - barcodes_list = list(reduce(lambda set1, set2: set1.union(set2), result)) - os.write(STDERR, b"Retrieved barcodes from bams\n") - - # Create the barcodes to bin mapping - os.write(STDERR, b"Allocating bins\n") - barcodes_to_bins_dict = {} - - # barcodes_list will always contain non-None elements from get_barcodes_from_bam - if len(barcodes_list) <= n_subfiles: - for barcode_index in range(len(barcodes_list)): - barcodes_to_bins_dict[barcodes_list[barcode_index]] = barcode_index - else: - for barcode_index in range(len(barcodes_list)): - file_index = barcode_index % n_subfiles - barcodes_to_bins_dict[barcodes_list[barcode_index]] = file_index - - # Split the bams by barcode in parallel - os.write(STDERR, b"Splitting the bams by barcode\n") - # Samtools needs a thread for compression, so we leave half the given processes open. - write_pool_processes = math.ceil(num_processes / 2) if num_processes > 2 else 1 - write_pool = multiprocessing.Pool(write_pool_processes) - scattered_split_result = write_pool.map( - partial( - write_barcodes_to_bins, - tags=list(tags), - raise_missing=raise_missing, - barcodes_to_bins=barcodes_to_bins_dict, - ), - in_bams, - ) - - bin_indices = list(set(barcodes_to_bins_dict.values())) - # Create a list of lists, where the first element of every sub-list is the name of the final output bam - bins = list([f"{out_prefix}_{index}"] for index in bin_indices) - - # A shard is the computation of writing barcodes to bins - # Gather all the files for each bin into the same sub-list. 
- for shard_index in range(len(scattered_split_result)): - shard = scattered_split_result[shard_index] - for file_index in range(len(shard)): - bins[file_index].append(shard[file_index]) - - write_pool.close() - - # Recombine the binned bams - os.write(STDERR, b"Merging temporary bam files\n") - merged_bams = full_pool.map(partial(merge_bams), bins) - - os.write(STDERR, b"deleting temporary files\n") - for paths in scattered_split_result: - shutil.rmtree(os.path.dirname(paths[0])) - - full_pool.close() - - return merged_bams - - -# todo change this to throw away "None" reads instead of appending them if we are filtering them -def iter_tag_groups( - tag: str, bam_iterator: Iterator[pysam.AlignedSegment], filter_null: bool = False -) -> Generator: - """Iterates over reads and yields them grouped by the provided tag value - - Parameters - ---------- - tag : str - BAM tag to group over - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - filter_null : bool, optional - If False, all reads that lack the requested tag are yielded together. Else, all reads - that lack the tag will be discarded (default = False). 
- - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique value of tag - current_tag : str - the tag that reads in the group all share - - """ - - # get first read and tag set - reads = [next(bam_iterator)] - try: - current_tag = reads[0].get_tag(tag) - except KeyError: - current_tag = None # null tag is a category that gets emitted - - # now iterate over alignment sets - for alignment in bam_iterator: - try: - next_tag = alignment.get_tag(tag) - except KeyError: - next_tag = None # null tag is a category that we will emit - if next_tag == current_tag: - reads.append(alignment) - else: - # only yield if the tag is non-null or filter_null is false - if not filter_null or current_tag is not None: - yield iter(reads), current_tag - # reset to next group - reads = [alignment] - current_tag = next_tag - - if not filter_null or current_tag is not None: - yield iter(reads), current_tag - - -def iter_molecule_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: - """Iterate over all the molecules of a bam file sorted by molecule. - - Parameters - ---------- - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique molecule barcode tag - current_tag : str - the molecule barcode that records in the group all share - - """ - return iter_tag_groups( - tag=consts.MOLECULE_BARCODE_TAG_KEY, bam_iterator=bam_iterator - ) - - -def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: - """Iterate over all the cells of a bam file sorted by cell. 
- - Parameters - ---------- - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique cell barcode tag - current_tag : str - the cell barcode that reads in the group all share - - """ - return iter_tag_groups(tag=consts.CELL_BARCODE_TAG_KEY, bam_iterator=bam_iterator) - - -def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator: - """Iterate over all the cells of a bam file sorted by gene. - - Parameters - ---------- - bam_iterator : Iterator[pysam.AlignedSegment] - open bam file that can be iterated over - - Yields - ------ - grouped_by_tag : Iterator[pysam.AlignedSegment] - reads sharing a unique gene name tag - current_tag : str - the gene id that reads in the group all share - - """ - return iter_tag_groups(tag=consts.GENE_NAME_TAG_KEY, bam_iterator=bam_iterator) - - -def get_tag_or_default( - alignment: pysam.AlignedSegment, tag_key: str, default: Optional[str] = None -) -> Optional[str]: - """Extracts the value associated to `tag_key` from `alignment`, and returns a default value - if the tag is not present.""" - try: - return alignment.get_tag(tag_key) - except KeyError: - return default - - -class AlignmentSortOrder: - """The base class of alignment sort orders.""" - - @property - @abstractmethod - def key_generator(self) -> Callable[[pysam.AlignedSegment], Any]: - """Returns a callable function that calculates a sort key from given pysam.AlignedSegment.""" - raise NotImplementedError - - -class QueryNameSortOrder(AlignmentSortOrder): - """Alignment record sort order by query name.""" - - @staticmethod - def get_sort_key(alignment: pysam.AlignedSegment) -> str: - return alignment.query_name - - @property - def key_generator(self): - return QueryNameSortOrder.get_sort_key - - def __repr__(self) -> str: - return "query_name" - - -@functools.total_ordering -class TagSortableRecord(object): - """Wrapper for 
pysam.AlignedSegment that facilitates sorting by tags and query name.""" - - def __init__( - self, - tag_keys: Iterable[str], - tag_values: Iterable[str], - query_name: str, - record: pysam.AlignedSegment = None, - ) -> None: - self.tag_keys = tag_keys - self.tag_values = tag_values - self.query_name = query_name - self.record = record - - @classmethod - def from_aligned_segment( - cls, record: pysam.AlignedSegment, tag_keys: Iterable[str] - ) -> "TagSortableRecord": - """Create a TagSortableRecord from a pysam.AlignedSegment and list of tag keys""" - assert record is not None - tag_values = [get_tag_or_default(record, key, "") for key in tag_keys] - query_name = record.query_name - return cls(tag_keys, tag_values, query_name, record) - - def __lt__(self, other: object) -> bool: - if not isinstance(other, TagSortableRecord): - return NotImplemented - self.__verify_tag_keys_match(other) - for (self_tag_value, other_tag_value) in zip(self.tag_values, other.tag_values): - if self_tag_value < other_tag_value: - return True - elif self_tag_value > other_tag_value: - return False - return self.query_name < other.query_name - - def __eq__(self, other: object) -> bool: - # TODO: Add more error checking - if not isinstance(other, TagSortableRecord): - return NotImplemented - self.__verify_tag_keys_match(other) - for (self_tag_value, other_tag_value) in zip(self.tag_values, other.tag_values): - if self_tag_value != other_tag_value: - return False - return self.query_name == other.query_name - - def __verify_tag_keys_match(self, other) -> None: - if self.tag_keys != other.tag_keys: - format_str = "Cannot compare records using different tag lists: {0}, {1}" - raise ValueError(format_str.format(self.tag_keys, other.tag_keys)) - - def __str__(self) -> str: - return self.__repr__() - - def __repr__(self) -> str: - format_str = "TagSortableRecord(tags: {0}, tag_values: {1}, query_name: {2}" - return format_str.format(self.tag_keys, self.tag_values, self.query_name) - - -def 
sort_by_tags_and_queryname( - records: Iterable[pysam.AlignedSegment], tag_keys: Iterable[str] -) -> Iterable[pysam.AlignedSegment]: - """Sorts the given bam records by the given tags, followed by query name. - If no tags are given, just sorts by query name. - """ - tag_sortable_records = ( - TagSortableRecord.from_aligned_segment(r, tag_keys) for r in records - ) - sorted_records = sorted(tag_sortable_records) - aligned_segments = (r.record for r in sorted_records) - return aligned_segments - - -def verify_sort(records: Iterable[TagSortableRecord], tag_keys: Iterable[str]) -> None: - """Raise AssertionError if the given records are not correctly sorted by the given tags and query name""" - # Setting tag values and query name to empty string ensures first record will never be less than old_record - old_record = TagSortableRecord( - tag_keys=tag_keys, tag_values=["" for _ in tag_keys], query_name="", record=None - ) - i = 0 - for record in records: - i += 1 - if not record >= old_record: - msg = "Records {0} and {1} are not in correct order:\n{1}:{2} \nis less than \n{0}:{3}" - raise SortError(msg.format(i - 1, i, record, old_record)) - old_record = record - - -class SortError(Exception): - pass diff --git a/tools/scripts/sctools/src/sctools/barcode.py b/tools/scripts/sctools/src/sctools/barcode.py deleted file mode 100644 index f26aac24..00000000 --- a/tools/scripts/sctools/src/sctools/barcode.py +++ /dev/null @@ -1,379 +0,0 @@ -""" -Nucleotide Barcode Manipulation Tools -===================================== - -.. currentmodule:: sctools - -This module contains tools to characterize oligonucleotide barcodes and a simple hamming-base -error-correction approach which corrects barcodes within a specified distance of a "whitelist" of -expected barcodes. 
- -Classes -------- -Barcodes Class to characterize a set of barcodes -ErrorsToCorrectBarcodesMap Class to carry out error correction routines - -""" - -import itertools -from collections import Counter -from typing import Mapping, Iterator, List, Tuple, Iterable - -import numpy as np -import pysam - -from . import consts -from .encodings import TwoBit -from .stats import base4_entropy - - -class Barcodes: - """Container for a set of nucleotide barcodes. - - Contained barcodes are encoded in 2bit representation for fast operations. Instances of this - class can optionally be constructed from an iterable where barcodes can be present multiple - times. In these cases, barcodes are analyzed based on their observed frequencies. - - Parameters - ---------- - barcodes: Mapping[str, int] - dictionary-like mapping barcodes to the number of times they were observed - barcode_length: int - the length of all barcodes in the set. Different-length barcodes are not supported. - - See Also - -------- - sctools.encodings.TwoBit - - """ - - def __init__(self, barcodes: Mapping[str, int], barcode_length: int): - if not isinstance(barcodes, Mapping): - raise TypeError( - 'The argument "barcodes" must be a dict-like object mapping barcodes to counts' - ) - self._mapping: Mapping[str, int] = barcodes - - if not isinstance(barcode_length, int) and barcode_length > 0: - raise ValueError('The argument "barcode_length" must be a positive integer') - self._barcode_length: int = barcode_length - - def __contains__(self, item) -> bool: - return item in self._mapping - - def __iter__(self) -> Iterator[str]: - return iter(self._mapping) - - def __len__(self) -> int: - return len(self._mapping) - - def __getitem__(self, item) -> int: - return self._mapping[item] - - def summarize_hamming_distances(self) -> Mapping[str, float]: - """Returns descriptive statistics on hamming distances between pairs of barcodes. 
- - Returns - ------- - descriptive_statistics : Mapping[str, float] - minimum, 25th percentile, median, 75th percentile, maximum, and average hamming - distance between all pairs of barcodes - - References - ---------- - https://en.wikipedia.org/wiki/Hamming_distance - - """ - distances: List = [] - - for a, b in itertools.combinations(self, 2): - distances.append(TwoBit.hamming_distance(a, b)) - - keys: Tuple = ( - "minimum", - "25th percentile", - "median", - "75th percentile", - "maximum", - "average", - ) - values: List = list(np.percentile(distances, [0, 25, 50, 75, 100])) - values.append(np.mean(distances)) - - return dict(zip(keys, values)) - - def base_frequency(self, weighted=False) -> np.ndarray: - """return the frequency of each base at each position in the barcode set - - Notes - ----- - weighting is currently not supported, and must be set to False or base_frequency will raise - NotImplementedError # todo fix - - Parameters - ---------- - weighted: bool, optional - if True, each barcode is counted once for each time it was observed (default = False) - - Returns - ------- - frequencies : np.array - barcode_length x 4 2d numpy array - - Raises - ------ - NotImplementedError - if weighted is True - - """ - base_counts_by_position: np.ndarray = np.zeros( - (self._barcode_length, 4), dtype=np.uint64 - ) - - keys: np.ndarray = np.fromiter(self._mapping.keys(), dtype=np.uint64) - - for i in reversed(range(self._barcode_length)): - binary_base_representations, counts = np.unique( - keys & 3, return_counts=True - ) - if weighted: - raise NotImplementedError - else: - base_counts_by_position[i, binary_base_representations] = counts - - # finished with this nulceotide, move two bits forward to the next one - keys >>= 2 - - return base_counts_by_position - - def effective_diversity(self, weighted=False) -> np.ndarray: - """Returns the effective base diversity of the barcode set by position. 
- - maximum diversity for each position is 1, and represents a perfect split of 25% per base at - a given position. - - Parameters - ---------- - weighted : bool, optional - if True, each barcode is counted once for each time it was observed (default = False) - - Returns - ------- - effective_diversity : np.array[float] - 1-d array of size barcode_length containing floats in [0, 1] - - """ - return base4_entropy(self.base_frequency(weighted=weighted)) - - @classmethod - def from_whitelist(cls, file_: str, barcode_length: int): - """Creates a barcode set from a whitelist file. - - Parameters - ---------- - file_ : str - location of the whitelist file. Should be formatted one barcode per line. Barcodes - should be encoded in plain text (UTF-8, ASCII), not bit-encoded. Each barcode will be - assigned a count of 1. - barcode_length : int - Length of the barcodes in the file. - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - - """ - tbe = TwoBit(barcode_length) - with open(file_, "rb") as f: - return cls( - Counter(tbe.encode(barcode[:-1]) for barcode in f), barcode_length - ) - - @classmethod - def from_iterable_encoded(cls, iterable: Iterable[int], barcode_length: int): - """Construct an ObservedBarcodeSet from an iterable of encoded barcodes. - - Parameters - ---------- - iterable : Iterable[int] - iterable of barcodes encoded in TwoBit representation - barcode_length : int - the length of the barcodes in `iterable` - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - """ - return cls(Counter(iterable), barcode_length=barcode_length) - - @classmethod - def from_iterable_strings(cls, iterable: Iterable[str], barcode_length: int): - """Construct an ObservedBarcodeSet from an iterable of string barcodes. 
- - Parameters - ---------- - iterable : Iterable[str] - iterable of barcodes encoded in TwoBit representation - barcode_length : int - the length of the barcodes in `iterable` - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - """ - tbe: TwoBit = TwoBit(barcode_length) - return cls( - Counter(tbe.encode(b.encode()) for b in iterable), - barcode_length=barcode_length, - ) - - @classmethod - def from_iterable_bytes(cls, iterable: Iterable[bytes], barcode_length: int): - """Construct an ObservedBarcodeSet from an iterable of bytes barcodes. - - Parameters - ---------- - iterable : Iterable[bytes] - iterable of barcodes in bytes representation - barcode_length : int - the length of the barcodes in `iterable` - - Returns - ------- - barcodes : Barcodes - class object containing barcodes from a whitelist file - """ - tbe: TwoBit = TwoBit(barcode_length) - return cls( - Counter(tbe.encode(b) for b in iterable), barcode_length=barcode_length - ) - - -class ErrorsToCorrectBarcodesMap: - """Correct any barcode that is within one hamming distance of a whitelisted barcode - - Parameters - ---------- - errors_to_barcodes : Mapping[str, str] - dict-like mapping 1-base errors to the whitelist barcode that they could be generated from - - Methods - ------- - get_corrected_barcode(barcode: str) - Return a barcode if it is whitelist, or the corrected version if within edit distance 1 - correct_bam(bam_file: str, output_bam_file: str) - correct barcodes in a bam file, given a whitelist - - References - ---------- - https://en.wikipedia.org/wiki/Hamming_distance - - """ - - def __init__(self, errors_to_barcodes: Mapping[str, str]): - if not isinstance(errors_to_barcodes, Mapping): - raise TypeError( - f'The argument "errors_to_barcodes" must be a mapping of erroneous barcodes to correct ' - f"barcodes, not {type(errors_to_barcodes)}" - ) - self._map = errors_to_barcodes - - def get_corrected_barcode(self, barcode: str) -> str: - 
"""Return a barcode if it is whitelist, or the corrected version if within edit distance 1 - - Parameters - ---------- - barcode : str - the barcode to return the corrected version of. If the barcode is in the whitelist, - the input barcode is returned unchanged. - - Returns - ------- - corrected_barcode : str - corrected version of the barcode - - Raises - ------ - KeyError - if the passed barcode is not within 1 hamming distance of any whitelist barcode - - References - ---------- - https://en.wikipedia.org/wiki/Hamming_distance - - """ - return self._map[barcode] - - @staticmethod - def _prepare_single_base_error_hash_table( - barcodes: Iterable[str], - ) -> Mapping[str, str]: - """Generate a map of correct barcodes and single base error codes to whitelist barcodes - - Parameters - ---------- - barcodes : Iterable[str] - :param Iterable barcodes: iterable of string barcodes - :return dict: mapping between erroneous barcodes with single-base mutations and the barcode - they were generated from - """ - error_map = {} - for barcode in barcodes: - - # include correct barcode - error_map[barcode] = barcode - - # include all single-base errors - for i, nucleotide in enumerate(barcode): - errors = set("ACGTN") - errors.discard(nucleotide) - for e in errors: - error_map[barcode[:i] + e + barcode[i + 1 :]] = barcode - return error_map - - @classmethod - def single_hamming_errors_from_whitelist(cls, whitelist_file: str): - """Factory method to generate instance of class from a file containing "correct" barcodes. - - Parameters - ---------- - whitelist_file : str - Text file containing barcode per line. 
- - Returns - ------- - errors_to_barcodes_map : ErrorsToCorrectBarcodesMap - instance of cls, built from whitelist - - """ - with open(whitelist_file, "r") as f: - return cls( - cls._prepare_single_base_error_hash_table((line[:-1] for line in f)) - ) - - def correct_bam(self, bam_file: str, output_bam_file: str) -> None: - """Correct barcodes in a (potentially unaligned) bamfile, given a whitelist. - - Parameters - ---------- - bam_file : str - BAM format file in same order as the fastq files - output_bam_file : str - BAM format file containing cell, umi, and sample tags. - - """ - with pysam.AlignmentFile(bam_file, "rb") as fin, pysam.AlignmentFile( - output_bam_file, "wb", template=fin - ) as fout: - for alignment in fin: - try: - tag = self.get_corrected_barcode(alignment.get_tag("CR")) - except KeyError: # pass through the uncorrected barcode. - tag = alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY) - alignment.set_tag( - tag=consts.CELL_BARCODE_TAG_KEY, value=tag, value_type="Z" - ) - fout.write(alignment) diff --git a/tools/scripts/sctools/src/sctools/consts.py b/tools/scripts/sctools/src/sctools/consts.py deleted file mode 100644 index e07980cb..00000000 --- a/tools/scripts/sctools/src/sctools/consts.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Global constants -================ - -.. currentmodule:: sctools - -This module contains global constants, such as various barcoded BAM tags, and sctools-specific -constants. 
-""" - -# BAM tag constants - -RAW_SAMPLE_BARCODE_TAG_KEY = "SR" -QUALITY_SAMPLE_BARCODE_TAG_KEY = "SY" - -MOLECULE_BARCODE_TAG_KEY = "UB" -RAW_MOLECULE_BARCODE_TAG_KEY = "UR" -QUALITY_MOLECULE_BARCODE_TAG_KEY = "UY" - -CELL_BARCODE_TAG_KEY = "CB" -RAW_CELL_BARCODE_TAG_KEY = "CR" -QUALITY_CELL_BARCODE_TAG_KEY = "CY" - -GENE_NAME_TAG_KEY = "GE" -NUMBER_OF_HITS_TAG_KEY = "NH" - -ALIGNMENT_LOCATION_TAG_KEY = "XF" -INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTRONIC" -CODING_ALIGNMENT_LOCATION_TAG_VALUE = "CODING" -UTR_ALIGNMENT_LOCATION_TAG_VALUE = "UTR" -INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE = "INTERGENIC" - -# bam.py constants - -MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500 -MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000 - - -# modes of the count matrix runs -SINGLE_CELL_COUNT_MATRIX = 0 -SINGLE_NUCLEI_COUNT_MATRIX = 1 diff --git a/tools/scripts/sctools/src/sctools/count.py b/tools/scripts/sctools/src/sctools/count.py deleted file mode 100644 index b8d2e740..00000000 --- a/tools/scripts/sctools/src/sctools/count.py +++ /dev/null @@ -1,400 +0,0 @@ -""" -Construct Count Matrices -======================== - -This module defines methods that enable (optionally) distributed construction of count matrices. -This module outputs coordinate sparse matrices that are converted to CSR matrices prior to delivery -for compact storage, and helper functions to convert this format into other commonly used formats. - -Methods -------- -from_sorted_tagged_bam( - bam_file: str, annotation_file: str, cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag: str=consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag: str=consts.GENE_NAME_TAG_KEY, open_mode: str='rb') -from_mtx(matrix_mtx: str, row_index_file: str, col_index_file: str) - - -Notes ------ -Memory usage of this module can be roughly approximated by the chunk_size parameter in Optimus. -The memory usage is equal to approximately 6*8 bytes per molecules in the file. 
-""" - -import itertools -import operator -from typing import List, Dict, Tuple, Set, Optional, Generator - -import numpy as np -import pysam -import scipy.sparse as sp -from scipy.io import mmread - -from sctools import consts, bam - - -class CountMatrix: - def __init__( - self, matrix: sp.csr_matrix, row_index: np.ndarray, col_index: np.ndarray - ): - self._matrix = matrix - self._row_index = row_index - self._col_index = col_index - - @property - def matrix(self): - return self._matrix - - @property - def row_index(self): - return self._row_index - - @property - def col_index(self): - return self._col_index - - @staticmethod - def _get_alignments_grouped_by_query_name_generator( - bam_file: str, - cell_barcode_tag: str, - molecule_barcode_tag: str, - open_mode: str = "rb", - ) -> Generator[ - Tuple[str, Optional[str], Optional[str], List[pysam.AlignedSegment]], None, None - ]: - """Iterates through a query_name-sorted BAM file, groups all alignments with the same query name - - Parameters - ---------- - bam_file : str - input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that - order - cell_barcode_tag : str - Tag that specifies the cell barcode for each read. - molecule_barcode_tag : str - Tag that specifies the molecule barcode for each read. 
- - Returns - ------- - a generator for tuples (query_name, cell_barcode, molecule_barcode, alignments) - """ - with pysam.AlignmentFile(bam_file, mode=open_mode) as bam_records: - for (query_name, grouper) in itertools.groupby( - bam_records, key=lambda record: record.query_name - ): - alignments: List[pysam.AlignedSegment] = list(grouper) - cell_barcode: Optional[str] = bam.get_tag_or_default( - alignments[0], cell_barcode_tag - ) - molecule_barcode: Optional[str] = bam.get_tag_or_default( - alignments[0], molecule_barcode_tag - ) - yield query_name, cell_barcode, molecule_barcode, alignments - - """Looks through a list of gene locations to find the one that the given read_start ovelaps - - Parameters - ---------- - _gene_locations: Array - array with gene start end locations and names - search_start: - index of gene to start searching from - search_end: - index of gene up to which to search to - read_start: - position at which the read starts at - - Returns - ------- - name of gene with overlap or None if no overlap is found - - """ - - @classmethod - def binary_overlap(cls, _gene_locations, search_start, search_end, read_start): - while search_start <= search_end: - current_gene_index = int((search_start + search_end) / 2) - if ( - _gene_locations[current_gene_index][0][0] - < read_start - < _gene_locations[current_gene_index][0][1] - ): - return _gene_locations[current_gene_index][1] - elif _gene_locations[current_gene_index][0][0] < read_start: - search_start = current_gene_index + 1 - else: - search_end = current_gene_index - 1 - return None - - # todo add support for generating a matrix of invalid barcodes - # todo add support for splitting spliced and unspliced reads - # todo add support for generating a map of cell barcodes - # todo add the option for stringent checks on the input (e.g. 
BAM sort order) - # todo once the stringent checks are in place, safely move on to the hashset-free implementation - @classmethod - def from_sorted_tagged_bam( - cls, - bam_file: str, - gene_name_to_index: Dict[str, int], - chromosomes_gene_locations_extended: Dict[str, List[tuple]] = None, - cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag: str = consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag: str = consts.GENE_NAME_TAG_KEY, - open_mode: str = "rb", - ) -> "CountMatrix": - """Generate a count matrix from a sorted, tagged bam file - - Notes - ----- - - Input bam file must be sorted by query name. - - - The sort order of the input BAM file is not strictly checked. If the input BAM file not sorted - by query_name, the output counts will be wrong without any warnings being issued. - - This method returns counts that correspond to both spliced and unspliced reads. - - Description of the algorithm - ---------------------------- - The implemented counting strategy is intended to closely match that of CellRanger 2.1.1 - (see the references). The following pseudo-code describes the counting algorithm: - - for each query_name (i.e. 
unique sequenced read): - - if only a single alignment exists, _consider_ the read - - if multiple alignments exist, - - if a unique gene name is associated to all alignments that have a gene name tag, - _consider_ the read; otherwise, the read is useless and neglect it - - if the read is to be _considered_, - - if the triple (cell barcode, molecule barcode, gene name) is not encountered before, - count it as evidence for a unique transcript; otherwise, consider the read as duplicate - and neglect it - - Parameters - ---------- - bam_file : str - input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that - order - chromosomes_gene_locations_extended : dict - Location of genes by chromosome - (default = None) - cell_barcode_tag : str, optional - Tag that specifies the cell barcode for each read. Reads without this tag will be ignored - (default = consts.CELL_BARCODE_TAG_KEY) - molecule_barcode_tag : str, optional - Tag that specifies the molecule barcode for each read. Reads without this tag will be - ignored (default = consts.MOLECULE_BARCODE_TAG_KEY) - gene_name_tag - Tag that specifies the gene name for each read. Reads without this tag will be ignored - (default = consts.GENE_NAME_TAG_KEY) - gene_name_to_index : dict - A map from gene names to their counts matrix column index - open_mode : {'r', 'rb'}, optional - indicates that the passed file is a bam file ('rb') or sam file ('r') (default = 'rb'). - - Returns - ------- - count_matrix : CountMatrix - cells x genes sparse count matrix in compressed sparse row format (cells are compressed) - - Notes - ----- - All matrices produced by this function called on different BAM chunks that share the same annotation - file can be concatenated using the scipy sparse vstack function, since by definition, the cell barcodes - contained in different BAM chunks are mutually exclusive. 
for example: - - >>> import scipy.sparse as sp - >>> A = sp.coo_matrix([[1, 2], [3, 4]]).tocsr() - >>> B = sp.coo_matrix([[5, 6]]).tocsr() - >>> sp.vstack([A, B]).toarray() - array([[1, 2], - [3, 4], - [5, 6]]) - - See Also - -------- - samtools sort (-t parameter): - C library that can sort files as required. - http://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS - - TagSortBam.CellSortBam: - WDL task that accomplishes the sorting necessary for this module. - https://github.com/HumanCellAtlas/skylab/blob/master/library/tasks/TagSortBam.wdl - - Relevant parmalinks to the counting algorithm in CellRanger: - [1] https://github.com/10XGenomics/cellranger/blob/aba5d379169ff0d4bee60e3d100df35752b90383/mro/stages/counter/ - attach_bcs_and_umis/__init__.py - [2] https://github.com/10XGenomics/cellranger/blob/aba5d379169ff0d4bee60e3d100df35752b90383/lib/rust/ - annotate_reads/src/main.rs - """ - # map the gene from reach record to an index in the sparse matrix - n_genes = len(gene_name_to_index) - - # track which tuples (cell_barcode, molecule_barcode, gene_name) we've encountered so far - observed_cell_molecule_gene_set: Set[Tuple[str, str, str]] = set() - - # COO sparse matrix entries - data: List[int] = [] - cell_indices: List[int] = [] - gene_indices: List[int] = [] - - # track which cells we've seen, and what the current cell number is - n_cells = 0 - cell_barcode_to_index: Dict[str, int] = {} - - grouped_records_generator = cls._get_alignments_grouped_by_query_name_generator( - bam_file, cell_barcode_tag, molecule_barcode_tag, open_mode=open_mode - ) - - for ( - query_name, - cell_barcode, - molecule_barcode, - input_alignments, - ) in grouped_records_generator: - - # modify alignments to include the gene name to the alignments to INTRONIC regions - alignments = input_alignments - - # only keep queries w/ well-formed UMIs - gene_name = None - if cell_barcode is None or molecule_barcode is None: - continue - - if len(alignments) == 1: - primary_alignment = 
alignments[0] - if ( - primary_alignment.has_tag(gene_name_tag) - and primary_alignment.has_tag("XF") - and primary_alignment.get_tag("XF") != "INTERGENIC" - ): - gene_name = primary_alignment.get_tag(gene_name_tag) - # overlaps multiple genes, drop query, and unfortunately there only one - # one alignment for this query - if len(gene_name.split(",")) != 1: - continue - else: - continue # drop query - else: # multi-map - implicated_gene_names: Set[str] = set() - for alignment in alignments: - if ( - alignment.has_tag(gene_name_tag) - and alignment.has_tag("XF") - and alignment.get_tag("XF") != "INTERGENIC" - ): - # consider its gene name only if it has only gene name - gene_name = alignment.get_tag(gene_name_tag) - if len(gene_name.split(",")) == 1: - implicated_gene_names.add(alignment.get_tag(gene_name_tag)) - - if len(implicated_gene_names) == 1: # only one gene - gene_name = implicated_gene_names.__iter__().__next__() - else: - continue # drop query - - if gene_name is None: - continue - - if ( - cell_barcode, - molecule_barcode, - gene_name, - ) in observed_cell_molecule_gene_set: - continue # optical/PCR duplicate -> drop query - else: - observed_cell_molecule_gene_set.add( - (cell_barcode, molecule_barcode, gene_name) - ) - - # find the indices that this molecule should correspond to - gene_index = gene_name_to_index[gene_name] - - # if we've seen this cell before, get its index, else set it - try: - cell_index = cell_barcode_to_index[cell_barcode] - except KeyError: - cell_index = n_cells - cell_barcode_to_index[cell_barcode] = n_cells - n_cells += 1 - - # record the molecule data - data.append(1) # one count of this molecule - cell_indices.append(cell_index) - gene_indices.append(gene_index) - - # convert into coo_matrix - coordinate_matrix = sp.coo_matrix( - (data, (cell_indices, gene_indices)), - shape=(n_cells, n_genes), - dtype=np.uint32, - ) - - # convert to a csr sparse matrix and return - col_index = np.asarray( - [ - k - for k, v in sorted( - 
gene_name_to_index.items(), key=operator.itemgetter(1) - ) - ] - ) - row_index = np.asarray( - [ - k - for k, v in sorted( - cell_barcode_to_index.items(), key=operator.itemgetter(1) - ) - ] - ) - - return cls(coordinate_matrix.tocsr(), row_index, col_index) - - def save(self, prefix: str) -> None: - sp.save_npz(prefix + ".npz", self._matrix, compressed=True) - np.save(prefix + "_row_index.npy", self._row_index) - np.save(prefix + "_col_index.npy", self._col_index) - - @classmethod - def load(cls, prefix: str) -> "CountMatrix": - matrix = sp.load_npz(prefix + ".npz") - row_index = np.load(prefix + "_row_index.npy") - col_index = np.load(prefix + "_col_index.npy") - return cls(matrix, row_index, col_index) - - @classmethod - def merge_matrices(cls, input_prefixes: str) -> "CountMatrix": - col_indices = [np.load(p + "_col_index.npy") for p in input_prefixes] - row_indices = [np.load(p + "_row_index.npy") for p in input_prefixes] - matrices = [sp.load_npz(p + ".npz") for p in input_prefixes] - - matrix: sp.csr_matrix = sp.vstack(matrices, format="csr") - # todo test that col_indices are all same shape - col_index = col_indices[0] - row_index = np.concatenate(row_indices) - return cls(matrix, row_index, col_index) - - @classmethod - def from_mtx( - cls, matrix_mtx: str, row_index_file: str, col_index_file: str - ) -> "CountMatrix": - """ - - Parameters - ---------- - matrix_mtx : str - file containing count matrix in matrix market sparse format - row_index_file : str - newline delimited row index file - col_index_file : str - newline delimited column index file - - Returns - ------- - CountMatrix - instance of class - """ - matrix: sp.csr_matrix = mmread(matrix_mtx).tocsr() - with open(row_index_file, "r") as fin: - row_index = np.array(fin.readlines()) - with open(col_index_file, "r") as fin: - col_index = np.array(fin.readlines()) - return cls(matrix, row_index, col_index) diff --git a/tools/scripts/sctools/src/sctools/encodings.py 
b/tools/scripts/sctools/src/sctools/encodings.py deleted file mode 100644 index 85f1cef8..00000000 --- a/tools/scripts/sctools/src/sctools/encodings.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -Compressed Barcode Encoding Methods -=================================== - -.. currentmodule:: sctools - -This module defines several classes to encode DNA sequences in memory-efficient forms, using 2 bits -to encode bases of a 4-letter DNA alphabet (ACGT) or 3 bits to encode a 5-letter DNA alphabet -that includes the ambiguous call often included by Illumina base calling software (ACGTN). The -classes also contain several methods useful for efficient querying and manipulation of the encoded -sequence. - -Classes -------- -Encoding Encoder base class -ThreeBit Three bit DNA encoder / decoder -TwoBit Two bit DNA encoder / decoder - -""" - -import random -from typing import Mapping, AnyStr, Set - - -class Encoding: - """ - - Attributes - ---------- - encoding_map : TwoBitEncodingMap - Class that mimics a Mapping[bytes, str] where bytes must be a single byte encoded character - (encoder) - decoding_map : Mapping[int, bytes] - Dictionary that maps integers to bytes human-readable representations (decoder) - bits_per_base : int - number of bits used to encode each base - - Methods - ------- - encode(bytes_encoded: bytes) - encode a DNA string in a compressed representation - decode(integer_encoded: int) - decode a compressed DNA string into a human readable bytes format - gc_content(integer_encoded: int) - calculate the GC content of an encoded DNA string - hamming_distance(a: int, b: int) - calculate the hamming distance between two encoded DNA strings - - """ - - encoding_map: Mapping[AnyStr, int] = NotImplemented - decoding_map: Mapping[int, AnyStr] = NotImplemented - bits_per_base: int = NotImplemented - - @classmethod - def encode(cls, bytes_encoded: bytes) -> int: - """Encode a DNA bytes string. 
- - Parameters - ---------- - bytes_encoded : bytes - bytes DNA string - - Returns - ------- - encoded : int - Encoded DNA sequence - - """ - raise NotImplementedError - - def decode(self, integer_encoded: int) -> bytes: - """Decode a DNA bytes string. - - Parameters - ---------- - integer_encoded : bytes - Integer encoded DNA string - - Returns - ------- - decoded : bytes - Bytes decoded DNA sequence - - """ - raise NotImplementedError - - def gc_content(self, integer_encoded: int) -> int: - """Return the number of G or C nucleotides in `integer_encoded` - - Parameters - ---------- - integer_encoded : int - Integer encoded DNA string - - Returns - ------- - gc_content, int - number of bases in `integer_encoded` input that are G or C. - - """ - raise NotImplementedError - - @staticmethod - def hamming_distance(a, b) -> int: - """Calculate the hamming distance between two DNA sequences - - The hamming distance counts the number of bases that are not the same nucleotide - - Parameters - ---------- - a, b : int - integer encoded - - - Returns - ------- - d : int - hamming distance between a and b - """ - raise NotImplementedError - - -class TwoBit(Encoding): - """Encode a DNA sequence using a 2-bit encoding. - - Two-bit encoding uses 0 for an encoded nucleotide. As such, it cannot distinguish between - the end of sequence and trailing A nucleotides, and thus decoding these strings requires - knowledge of their length. Therefore, it is only appropriate for encoding fixed sequence - lengths - - In addition, in order to encode in 2-bit, N-nucleotides must be randomized to one of A, C, - G, and T. 
- - Parameters - ---------- - sequence_length : int - number of nucleotides that are being encoded - - """ - - __doc__ += Encoding.__doc__ - - def __init__(self, sequence_length: int): - self.sequence_length: int = sequence_length - - class TwoBitEncodingMap: - """Dict-like class that maps bytes to 2-bit integer representations - - Generates random nucleotides for ambiguous nucleotides e.g. N - - """ - - map_ = { - ord("A"): 0, - ord("C"): 1, - ord("T"): 2, - ord("G"): 3, - ord("a"): 0, - ord("c"): 1, - ord("t"): 2, - ord("g"): 3, - } - - iupac_ambiguous: Set[int] = {ord(c) for c in "MRWSYKVHDBNmrwsykvhdbn"} - - def __getitem__(self, byte: int) -> int: - try: - return self.map_[byte] - except KeyError: - if byte not in self.iupac_ambiguous: - raise KeyError(f"{chr(byte)} is not a valid IUPAC nucleotide code") - return random.randint(0, 3) - - encoding_map: TwoBitEncodingMap = TwoBitEncodingMap() - decoding_map: Mapping[int, bytes] = {0: b"A", 1: b"C", 2: b"T", 3: b"G"} - bits_per_base: int = 2 - - @classmethod - def encode(cls, bytes_encoded: bytes) -> int: - encoded = 0 - for character in bytes_encoded: - encoded <<= 2 - encoded += cls.encoding_map[character] - return encoded - - def decode(self, integer_encoded: int) -> bytes: - decoded = b"" - for _ in range(self.sequence_length): - decoded = self.decoding_map[integer_encoded & 3] + decoded - integer_encoded >>= 2 - return decoded - - def gc_content(self, integer_encoded: int) -> int: - i = 0 - for _ in range(self.sequence_length): - i += integer_encoded & 1 - integer_encoded >>= 2 - return i - - @staticmethod - def hamming_distance(a: int, b: int) -> int: - difference = a ^ b - d_hamming = 0 - while difference: - if difference & 3: - d_hamming += 1 - difference >>= 2 - return d_hamming - - -class ThreeBit(Encoding): - """Encode a DNA sequence using a 3-bit encoding. 
- - Since no bases are encoded as 0, an empty triplet is interpreted as the end of the encoded - string; Three-bit encoding can be used to encode and decode strings without knowledge of their - length. - - """ - - __doc__ += Encoding.__doc__ - - def __init__(self, *args, **kwargs): - """ - Notes - ----- - args and kwargs are not used, but allow ThreeBit to be initialized the same way as TwoBit, - despite not requiring a sequence length parameter. - - """ - pass - - class ThreeBitEncodingMap: - """Dict-like class that maps bytes to 3-bit integer representations - - All IUPAC ambiguous codes are treated as "N" - - """ - - # C: 1, A: 2, G: 3, T: 4, N: 6; # note, not using 0 - map_ = { - ord("C"): 1, - ord("A"): 2, - ord("G"): 3, - ord("T"): 4, - ord("N"): 6, - ord("c"): 1, - ord("a"): 2, - ord("g"): 3, - ord("t"): 4, - ord("n"): 6, - } - - def __getitem__(self, byte: int) -> int: - try: - return self.map_[byte] - except KeyError: - return 6 # any non-standard nucleotide gets "N" - - encoding_map: ThreeBitEncodingMap = ThreeBitEncodingMap() - decoding_map: Mapping[int, bytes] = {1: b"C", 2: b"A", 3: b"G", 4: b"T", 6: b"N"} - bits_per_base: int = 3 - - @classmethod - def encode(cls, bytes_encoded: bytes) -> int: - encoded = 0 - for character in bytes_encoded: - encoded <<= 3 - encoded += cls.encoding_map[character] - return encoded - - @classmethod - def decode(cls, integer_encoded: int) -> bytes: - decoded = b"" - while integer_encoded: - decoded = cls.decoding_map[integer_encoded & 7] + decoded - integer_encoded >>= 3 - return decoded - - @classmethod - def gc_content(cls, integer_encoded: int) -> int: - i = 0 - while integer_encoded: - i += integer_encoded & 1 - integer_encoded >>= 3 - return i - - @staticmethod - def hamming_distance(a: int, b: int) -> int: - difference = a ^ b - d_hamming = 0 - while difference: - if difference & 7: - d_hamming += 1 - difference >>= 3 - return d_hamming diff --git a/tools/scripts/sctools/src/sctools/fastq.py 
b/tools/scripts/sctools/src/sctools/fastq.py deleted file mode 100644 index c6749de0..00000000 --- a/tools/scripts/sctools/src/sctools/fastq.py +++ /dev/null @@ -1,404 +0,0 @@ -""" -Efficient Fastq Iterators and Representations -============================================= - -.. currentmodule:: sctools - -This module implements classes for representing fastq records, reading and writing them, and -extracting parts of fastq sequence for transformation into bam format tags - -Methods -------- -extract_barcode(record, embedded_barcode) - extract a barcode, defined by `embedded_barcode` from `record` - -Classes -------- -Record Represents fastq records (input as bytes) -StrRecord Represents fastq records (input as str) -Reader Opens and iterates over fastq files -EmbeddedBarcodeGenerator Generates barcodes from a fastq file -BarcodeGeneratorWithCorrectedCellBarcodes Generates (corrected) barcodes from a fastq file - -References ----------- -https://en.wikipedia.org/wiki/FASTQ_format - -""" - -from collections import namedtuple -from typing import Iterable, AnyStr, Iterator, Union, Tuple - -from . import reader, consts -from .barcode import ErrorsToCorrectBarcodesMap - - -# todo the inheritance pattern of this class is a bit confusing, particularly the str vs. bytes -# in the daughter classes -class Record: - """Fastq Record. 
- - Parameters - ---------- - record : Iterable[bytes] - Iterable of 4 bytes strings that comprise a fastq record - - Attributes - ---------- - name : bytes - fastq record name - sequence : bytes - fastq nucleotide sequence - name2 : bytes - second fastq record name field (rarely used) - quality : bytes - base call quality for each nucleotide in sequence - - Methods - ------- - average_quality() - The average quality of the fastq record - - """ - - __slots__ = ["_name", "_sequence", "_name2", "_quality"] - - def __init__(self, record: Iterable[AnyStr]): - # use the setter functions - self.name, self.sequence, self.name2, self.quality = record - - @property - def name(self) -> AnyStr: - return self._name - - @name.setter - def name(self, value): - """fastq record name""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ name must be bytes") - elif not value.startswith(b"@"): - raise ValueError("FASTQ name must start with @") - else: - self._name = value - - @property - def sequence(self) -> AnyStr: - return self._sequence - - @sequence.setter - def sequence(self, value): - """FASTQ nucleotide sequence""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ sequence must be str or bytes") - else: - self._sequence = value - - @property - def name2(self) -> AnyStr: - return self._name2 - - @name2.setter - def name2(self, value): - """second FASTQ record name field (rarely used)""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ name2 must be str or bytes") - else: - self._name2 = value - - @property - def quality(self) -> AnyStr: - return self._quality - - @quality.setter - def quality(self, value): - """FASTQ record base call quality scores""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ quality must be str or bytes") - else: - self._quality = value - - def __bytes__(self): - return b"".join((self.name, self.sequence, self.name2, self.quality)) - - def __str__(self): - return b"".join((self.name, 
self.sequence, self.name2, self.quality)).decode() - - def __repr__(self): - return "Name: %s\nSequence: %s\nName2: %s\nQuality: %s\n" % ( - self.name, - self.sequence, - self.name2, - self.quality, - ) - - def __len__(self): - return len(self.sequence) - - def average_quality(self) -> float: - """return the average quality of this record""" - # -33 due to solexa/illumina phred conversion - return sum(c for c in self.quality[:-1]) / (len(self.quality) - 1) - 33 - - -class StrRecord(Record): - """Fastq Record. - - Parameters - ---------- - record : Iterable[str] - Iterable of 4 bytes strings that comprise a FASTQ record - - Attributes - ---------- - name : str - FASTQ record name - sequence : str - FASTQ nucleotide sequence - name2 : str - second FASTQ record name field (rarely used) - quality : str - base call quality for each nucleotide in sequence - - Methods - ------- - average_quality() - The average quality of the FASTQ record - - """ - - def __bytes__(self): - return "".join((self.name, self.sequence, self.name2, self.quality)).encode() - - def __str__(self): - return "".join((self.name, self.sequence, self.name2, self.quality)) - - # todo is this method necessary? - @property - def name(self) -> str: - return self._name - - @name.setter - def name(self, value): - """FASTQ record name""" - if not isinstance(value, (bytes, str)): - raise TypeError("FASTQ name must be str or bytes") - if not value.startswith("@"): - raise ValueError("FASTQ name must start with @") - else: - self._name = value - - def average_quality(self) -> float: - """return the average quality of this record""" - b = self.quality[:-1].encode() - return ( - sum(c for c in b) / len(b) - 33 - ) # -33 due to solexa/illumina phred conversion - - -class Reader(reader.Reader): - """Fastq Reader that defines some special methods for reading and summarizing FASTQ data. 
- - Simple reader class that exposes an __iter__ and __len__ method - - Examples - -------- - #todo add examples - - See Also - -------- - sctools.reader.Reader - - References - ---------- - https://en.wikipedia.org/wiki/FASTQ_format - - """ - - @staticmethod - def _record_grouper(iterable): - """Groups contents of an iterator, yielding 4 objects at a time instead of one - - This is a somewhat complex python function. It creates 4 iterators on the same iterable; - each moves the pointer to the position in the iterable forward when called, yielding 4 - objects at a time - - Returns - ------- - grouped_iterator : Iterator[Str], Iterator[Str], Iterator[Str], Iterator[Str] - - """ - args = [iter(iterable)] * 4 - return zip(*args) - - def __iter__(self) -> Iterator[Tuple[str]]: - """Iterate over a FASTQ file, returning records - - Yields - ------ - fastq_record : Tuple[str] - tuple of length 4 containing the name, sequence, name2, and quality for a FASTQ record - - """ - record_type = StrRecord if self._mode == "r" else Record - for record in self._record_grouper(super().__iter__()): - yield record_type(record) - - -# namedtuple that defines the start and end position of a barcode sequence and provides the name -# for both a quality and sequence tag -EmbeddedBarcode = namedtuple("Tag", ["start", "end", "sequence_tag", "quality_tag"]) - - -def extract_barcode( - record, embedded_barcode -) -> Tuple[Tuple[str, str, str], Tuple[str, str, str]]: - """Extracts barcodes from a FASTQ record at positions defined by an EmbeddedBarcode object. 
- - Parameters - ---------- - record : FastqRecord - Record to extract from - embedded_barcode : EmbeddedBarcode - Defines the barcode start and end positions and the tag name for the sequence and quality - tags - - Returns - ------- - sequence_tag : Tuple[str, str, 'Z'] - sequence tag identifier, sequence, SAM tag type ('Z' implies a string tag) - quality_tag : Tuple[str, str, 'Z'] - quality tag identifier, quality, SAM tag type ('Z' implies a string tag) - - """ - seq = record.sequence[embedded_barcode.start : embedded_barcode.end] - qual = record.quality[embedded_barcode.start : embedded_barcode.end] - return ( - (embedded_barcode.sequence_tag, seq, "Z"), - (embedded_barcode.quality_tag, qual, "Z"), - ) - - -# todo the reader subclasses need better docs -class EmbeddedBarcodeGenerator(Reader): - """Generate barcodes from a FASTQ file(s) from positions defined by EmbeddedBarcode(s) - - Extracted barcode objects are produced in a form that is consumable by pysam's bam and sam - set_tag methods. - - Parameters - ---------- - embedded_barcodes : Iterable[EmbeddedBarcode] - tag objects defining start and end of the sequence containing the tag, and the tag - identifiers for sequence and quality tags - fastq_files : str | List, optional - FASTQ file or files to be read. (default = sys.stdin) - mode : {'r', 'rb'}, optional - open mode for FASTQ files. If 'r', return string. If 'rb', return bytes (default = 'r') - - """ - - def __init__(self, fastq_files, embedded_barcodes, *args, **kwargs): - super().__init__(files=fastq_files, *args, **kwargs) - self.embedded_barcodes = embedded_barcodes - - def __iter__(self): - """iterates over barcodes extracted from FASTQ""" - for record in super().__iter__(): # iterates records; we extract barcodes. 
- barcodes = [] - for barcode in self.embedded_barcodes: - barcodes.extend(extract_barcode(record, barcode)) - yield barcodes - - -# todo the reader subclasses need better docs -class BarcodeGeneratorWithCorrectedCellBarcodes(Reader): - """Generate barcodes from FASTQ file(s) from positions defined by EmbeddedBarcode(s) - - Extracted barcode objects are produced in a form that is consumable by pysam's bam and sam - set_tag methods. In this class, one EmbeddedBarcode must be defined as an - `embedded_cell_barcode`, which is checked against a whitelist and error corrected during - generation - - Parameters - ---------- - fastq_files : str | List, optional - FASTQ file or files to be read. (default = sys.stdin) - mode : {'r', 'rb'}, optional - open mode for fastq files. If 'r', return string. If 'rb', return bytes (default = 'r') - whitelist : str - whitelist file containing "correct" cell barcodes for an experiment - embedded_cell_barcodes : EmbeddedBarcode - EmbeddedBarcode containing information about the position and names of cell barcode tags - other_embedded_barcodes : Iterable[EmbeddedBarcode], optional - tag objects defining start and end of the sequence containing the tag, and the tag - identifiers for sequence and quality tags (default = None) - - Methods - ------- - extract_cell_barcode(record: Record, cb: str) - - """ - - def __init__( - self, - fastq_files: Union[str, Iterable[str]], - embedded_cell_barcode: EmbeddedBarcode, - whitelist: str, - other_embedded_barcodes: Iterable[EmbeddedBarcode] = tuple(), - *args, - **kwargs - ): - - super().__init__(files=fastq_files, *args, **kwargs) - if isinstance(other_embedded_barcodes, (list, tuple)): - self.embedded_barcodes = other_embedded_barcodes - else: - raise TypeError( - "if passed, other_embedded_barcodes must be a list or tuple" - ) - - self._error_mapping = ErrorsToCorrectBarcodesMap.single_hamming_errors_from_whitelist( - whitelist - ) - self.embedded_cell_barcode = embedded_cell_barcode - - def 
__iter__(self): - """iterates over barcodes extracted from fastq""" - for record in super().__iter__(): # iterates records; we extract barcodes. - barcodes = [] - - barcodes.extend( - self.extract_cell_barcode(record, self.embedded_cell_barcode) - ) - for barcode in self.embedded_barcodes: - barcodes.extend(extract_barcode(record, barcode)) - - yield barcodes - - def extract_cell_barcode(self, record: Tuple[str], cb: EmbeddedBarcode): - """Extract a cell barcode from a fastq record - - Parameters - ---------- - record : Tuple[str] - fastq record comprised of four strings: name, sequence, name2, and quality - cb : EmbeddedBarcode - defines the position and tag identifier for a call barcode - - Returns - ------- - sequence_tag : Tuple[str, str, 'Z'] - raw sequence tag identifier, sequence, SAM tag type ('Z' implies a string tag) - quality_tag : Tuple[str, str, 'Z'] - quality tag identifier, quality, SAM tag type ('Z' implies a string tag) - corrected_tag : Optional[Tuple[str, str, 'Z']] - Whitelist verified sequence tag. Only present if the raw sequence tag is in the - whitelist or within 1 hamming distance of one of its barcodes - - """ - seq_tag, qual_tag = extract_barcode(record, cb) - try: - corrected_cb = self._error_mapping.get_corrected_barcode(seq_tag[1]) - return seq_tag, qual_tag, (consts.CELL_BARCODE_TAG_KEY, corrected_cb, "Z") - except KeyError: - return seq_tag, qual_tag diff --git a/tools/scripts/sctools/src/sctools/groups.py b/tools/scripts/sctools/src/sctools/groups.py deleted file mode 100644 index 2a3592f2..00000000 --- a/tools/scripts/sctools/src/sctools/groups.py +++ /dev/null @@ -1,195 +0,0 @@ -""" -Group QC outputs - -""" - -from crimson import picard -import os -import pandas as pd - - -def write_aggregated_picard_metrics_by_row(file_names, output_name): - """Command line entrypoint to parse, aggreagete and write Picard row metrics. - Parameters - ---------- - args: - file_names: array of files. 
the basename of inputs should be formated - as 'samplename_qc',such as - "samplename_qc.alignment_summary_metrics.txt" and "samplename_qc.insert_size_metrics.txt" - output_name: prefix of output file name without extension. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - # initial output - metrics = {} - d = pd.DataFrame() - for file_name in file_names: - cell_id = os.path.basename(file_name).split("_qc")[0] - metrics[cell_id] = {} - parsed = picard.parse(file_name) - class_name = parsed["metrics"]["class"].split(".")[2] - # Alignment metrics return multiple lines, - # but only output PAIRED-READS/third line - contents = parsed["metrics"]["contents"] - if class_name == "AlignmentSummaryMetrics": - # parse out PE, R1 and R2. If the reads are unpaired, the contents - # will be a single dict rather than a list of dicts. - if isinstance(contents, dict): - contents = [contents] - rows = {} - for m in contents: - cat = m["CATEGORY"] - rows.update( - { - k + "." + cat: v - for k, v in m.items() - if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"] - } - ) - # sometimes(very rare), insertion metrics also return multiple lines - # results to include TANDEM repeats. but we only output the first line. - elif class_name == "InsertSizeMetrics": - # if the element counts is less than 21, - # it means insertion metrics returns multiple line results. - if len(contents) < 21: - rows = contents[0] - else: - rows = contents - else: - # other metrics(so far) only return one line results. - rows = contents - metrics[cell_id].update( - { - k: rows[k] - for k in rows - if k not in ["SAMPLE", "LIBRARY", "READ_GROUP", "CATEGORY"] - } - ) - df = pd.DataFrame.from_dict(metrics, orient="columns") - df.insert(0, "Class", class_name) - d = d.append(df) - d_T = d.T - d_T.to_csv(output_name + ".csv") - - -def write_aggregated_picard_metrics_by_table(file_names, output_name): - """Command line entrypoint to parse and write Picard table metrics. 
- Parameters - ---------- - args: - file_names: array of files.the basename of inputs should be formated as 'samplename_qc' - output_name: prefix of output file name. the basename of outputs - includes the Picard metrics class name. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - for file_name in file_names: - cell_id = os.path.basename(file_name).split("_qc")[0] - class_name = os.path.basename(file_name).split(".")[1] - parsed = picard.parse(file_name) - dat = pd.DataFrame.from_dict(parsed["metrics"]["contents"]) - dat.insert(0, "Sample", cell_id) - dat.to_csv(output_name + "_" + class_name + ".csv", index=False) - - -def write_aggregated_qc_metrics(file_names, output_name): - """Command line entrypoint to merge Picard metrics along with RSEM and HISAT2 log - Parameters - ---------- - args: - file_names: array of files,such as Picard row metric, hisat2 metrics. - output_name: prefix of output file name. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - df = pd.DataFrame() - for file_name in file_names: - dat = pd.read_csv(file_name, index_col=0) - print(dat.index) - print(df.head()) - df = pd.concat([df, dat], axis=1, join="outer") - df.to_csv(output_name + ".csv", index=True) - - -def parse_hisat2_log(file_names, output_name): - """Command line entrypoint parse, aggreagete and write HISAT2 logs - Parameters - ---------- - args: - file_names: array of HISAT2 log files. Basename of file indicates - the alignment references 'samplename_qc.log' indicates the genome reference and - 'samplename_rsem.log' indicates the transcriptome reference alignment. - output_name: prefix of output file name. - Returns - ---------- - return: 0 - return if the program completes successfully. 
- """ - metrics = {} - tag = "NONE" - for file_name in file_names: - if "_qc" in file_name: - cell_id = os.path.basename(file_name).split("_qc")[0] - tag = "HISAT2G" - elif "_rsem" in file_name: - cell_id = os.path.basename(file_name).split("_rsem")[0] - tag = "HISAT2T" - with open(file_name) as f: - dat = f.readlines() - d = [x.strip().split(":") for x in dat] - # remove the first row of each section. - d.pop(0) - metrics[cell_id] = {x[0]: x[1].strip().split(" ")[0] for x in d} - df = pd.DataFrame.from_dict(metrics, orient="columns") - df.insert(0, "Class", tag) - df_T = df.T - df_T.to_csv(output_name + ".csv") - - -def parse_rsem_cnt(file_names, output_name): - """Command line entrypoint parse, aggreagete and write RSEM cnt - Parameters - ---------- - args: - file_names: array of RSEM cnt files. The basename of inputs should be - 'samplename_rsem.cnt' - output_name: prefix of output file name. - Returns - ---------- - return: 0 - return if the program completes successfully. - """ - metrics = {} - for file_name in file_names: - cell_id = os.path.basename(file_name).split("_rsem")[0] - i = 0 - with open(file_name) as f: - while i < 3: - if i == 0: - [N0, N1, N2, N_tot] = f.readline().strip().split(" ") - elif i == 1: - [n_unique, n_multi, n_uncertain] = f.readline().strip().split(" ") - elif i == 2: - [n_hits, read_type] = f.readline().strip().split(" ") - i = i + 1 - metrics[cell_id] = { - "unalignable reads": N0, - "alignable reads": N1, - "filtered reads": N2, - "total reads": N_tot, - "unique aligned": n_unique, - "multiple mapped": n_multi, - "total alignments": n_hits, - "strand": read_type, - "uncertain reads": n_uncertain, - } - df = pd.DataFrame.from_dict(metrics, orient="columns") - df.insert(0, "Class", "RSEM") - df_T = df.T - df_T.to_csv(output_name + ".csv") diff --git a/tools/scripts/sctools/src/sctools/gtf.py b/tools/scripts/sctools/src/sctools/gtf.py deleted file mode 100644 index 7f574a9e..00000000 --- a/tools/scripts/sctools/src/sctools/gtf.py 
+++ /dev/null @@ -1,446 +0,0 @@ -""" -GTF Records and Iterators -========================= - -.. currentmodule:: sctools - -This module defines a GTF record class and a Reader class to iterate over GTF-format files - -Classes -------- -Record Data class that exposes GTF record fields by name -Reader GTF file reader that yields GTF Records - -References ----------- -https://useast.ensembl.org/info/website/upload/gff.html -""" - -import logging -import string -import re -from typing import List, Dict, Generator, Iterable, Union, Set - -from . import reader - -_logger = logging.getLogger(__name__) - - -class GTFRecord: - """Data class for storing and interacting with GTF records - - Subclassed to produce exon, transcript, and gene-specific record types. - A GTF record has 8 fixed fields which are followed by optional fields separated by ;\t, which - are stored by this class in the attributes field and accessible by get_attribute. Fixed fields - are accessible by name. - - Parameters - ---------- - record : str - an unparsed GTF record - - Attributes - ---------- - seqname : str - The name of the sequence (often chromosome) this record is found on. - chromosome : str - Synonym for seqname. - source : str - The group responsible for generating this annotation. - feature : str - The type of record (e.g. gene, exon, ...). - start : str - The start position of this feature relative to the beginning of seqname. - end : str - The end position of this feature relative to the beginning of seqname.... - score : str - The annotation score. Rarely used. 
- strand : {'+', '-'} - The strand of seqname that this annotation is found on - frame : {'0', '1', '2'} - '0' indicates that the first base of the feature is the first base of a codon, - '1' that the second base is the first base of a codon, and so on - size : int - the number of nucleotides spanned by this feature - - Methods - ------- - get_attribute(key: str) - attempt to retrieve a variable field with name equal to `key` - set_attribute(key: str, value: str) - set variable field `key` equal to `value`. Overwrites `key` if already present. - - """ - - __slots__ = ["_fields", "_attributes"] - - _del_letters: str = string.ascii_letters - _del_non_letters: str = "".join( - set(string.printable).difference(string.ascii_letters) - ) - - def __init__(self, record: str): - fields: List[str] = record.strip(";\n").split("\t") - - self._fields: List[str] = fields[:8] - - self._attributes: Dict[str, str] = {} - for field in fields[8].split(";"): - try: - key, _, value = field.strip().partition(" ") - self._attributes[key] = value.strip('"') - except Exception: - raise RuntimeError( - f'Error parsing field "{field}" of GTF record "{record}"' - ) - - def __repr__(self): - return "" % self.__str__() - - def __bytes__(self): - return self.__str__().encode() - - def __str__(self): - return "\t".join(self._fields) + self._format_attribute() + "\n" - - def __hash__(self) -> int: - return hash(self.__str__()) - - def _format_attribute(self): - return " ".join('%s "%s";' % (k, v) for k, v in self._attributes.items()) - - @property - def seqname(self) -> str: - return self._fields[0] - - @property - def chromosome(self) -> str: - return self._fields[0] # synonym for seqname - - @property - def source(self) -> str: - return self._fields[1] - - @property - def feature(self) -> str: - return self._fields[2] - - @property - def start(self) -> int: - return int(self._fields[3]) - - @property - def end(self) -> int: - return int(self._fields[4]) - - @property - def score(self) -> str: - 
return self._fields[5] - - @property - def strand(self) -> str: - return self._fields[6] - - @property - def frame(self) -> str: - return self._fields[7] - - @property - def size(self) -> int: - size = self.end - self.start - if size < 0: - raise ValueError(f"Invalid record: negative size {size} (start > end)") - else: - return size - - def get_attribute(self, key) -> str: - """access an item from the attribute field of a GTF file. - - Parameters - ---------- - key : str - Item to retrieve - - Returns - ------- - value : str - Contents of variable attribute `key` - - Raises - ------ - KeyError - if there is no variable attribute `key` associated with this record - - """ - return self._attributes.get(key) - - def set_attribute(self, key, value) -> None: - """Set variable attribute `key` equal to `value` - - If attribute `key` is already set for this record, its contents are overwritten by `value` - - Parameters - ---------- - key : str - attribute name - value : str - attribute content - - """ - self._attributes[key] = value - - def __eq__(self, other): - return hash(self) == hash(other) - - def __ne__(self, other): - return not self.__eq__(other) - - -class Reader(reader.Reader): - """GTF file iterator - - Parameters - ---------- - files : Union[str, List], optional - File(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r'). - header_comment_char : str, optional - lines beginning with this character are skipped (default = '#') - - Methods - ------- - filter(retain_types: Iterable[str]) - Iterate over a GTF file, only yielding records in `retain_types`. 
def get_mitochondrial_gene_names(
    files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#"
) -> Set[str]:
    """Collect the gene ids of the mitochondrial genes found in GTF file(s).

    Only records whose feature type is ``gene`` are inspected. A gene is taken to be
    mitochondrial when its ``gene_name`` attribute starts with ``mt-`` (case-insensitive).

    Parameters
    ----------
    files : Union[str, List], optional
        File(s) to read. If '-', read sys.stdin (default = '-')
    mode : {'r', 'rb'}, optional
        Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r').
    header_comment_char : str, optional
        lines beginning with this character are skipped (default = '#')

    Returns
    -------
    Set(str)
        The ``gene_id`` attribute of every mitochondrial gene record

    Raises
    ------
    ValueError
        If a ``gene`` record has no ``gene_name`` attribute

    """
    gene_records = Reader(files, mode, header_comment_char).filter(
        retain_types=["gene"]
    )

    mito_ids: Set[str] = set()
    for gene_record in gene_records:
        name = gene_record.get_attribute("gene_name")
        identifier = gene_record.get_attribute("gene_id")

        if name is None:
            raise ValueError(
                f"Malformed GTF file detected. Record is of type gene but does not have a "
                f'"gene_name" field: {gene_record}'
            )

        # guard clause: anything that is not named mt-* is ignored
        if re.match("^mt-", name, re.IGNORECASE) is None:
            continue

        # adding to a set is idempotent, so repeated ids collapse on their own
        mito_ids.add(identifier)

    return mito_ids
def extract_extended_gene_names(
    files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#"
) -> Dict[str, List[tuple]]:
    """Extract gene locations from GTF file(s), grouped by chromosome.

    Only records whose feature type is ``gene`` are used. When a gene name occurs more than
    once on the same chromosome, the first occurrence is kept and every later one is reported
    through ``_resolve_multiple_gene_names`` and skipped.

    Parameters
    ----------
    files : Union[str, List], optional
        File(s) to read. If '-', read sys.stdin (default = '-')
    mode : {'r', 'rb'}, optional
        Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r').
    header_comment_char : str, optional
        lines beginning with this character are skipped (default = '#')

    Returns
    -------
    Dict[str, List[tuple]]
        A dictionary of chromosome names mapping to a List of tuples, each containing
        a range as the first element and a gene name as the second, sorted by range.
        Dict[str, List(Tuple((start,end), gene)))

    Raises
    ------
    ValueError
        If a ``gene`` record has no ``gene_name`` attribute

    """
    # chromosome -> {gene name -> (start, end)}
    chromosome_to_genes: Dict[str, Dict[str, tuple]] = dict()
    for record in Reader(files, mode, header_comment_char).filter(
        retain_types=["gene"]
    ):
        gene_name = record.get_attribute("gene_name")
        if gene_name is None:
            raise ValueError(
                f"Malformed GTF file detected. Record is of type gene but does not have a "
                f'"gene_name" field: {record}'
            )

        genes_on_chromosome = chromosome_to_genes.setdefault(record.chromosome, dict())

        # find gene collisions: the outer dict is keyed by chromosome, so the gene name has
        # to be looked up in the per-chromosome map, not in the outer dict
        if gene_name in genes_on_chromosome:
            _resolve_multiple_gene_names(gene_name)
            continue

        genes_on_chromosome[gene_name] = (record.start, record.end)

    # For each chromosome invert the map into [((start, end), gene_name), ...] sorted by range
    gene_locations: Dict[str, List[tuple]] = dict()
    for chromosome, genes in chromosome_to_genes.items():
        gene_locations[chromosome] = sorted(
            ((location, name) for name, location in genes.items()),
            key=lambda entry: entry[0],
        )
    return gene_locations
Record is of type gene but does not have a " - f'"gene_name" field: {record}' - ) - if record.chromosome not in gene_name_to_start_end: - gene_name_to_start_end[record.chromosome] = dict() - - if gene_name not in gene_name_to_start_end[record.chromosome]: - gene_name_to_start_end[record.chromosome][gene_name] = [] - - gene_name_to_start_end[record.chromosome][gene_name].append( - (record.start, record.end) - ) - - gene_locations_exons = dict() - # For each chromosome invert the map to be in List[( (start,end), genename )] and sort it by start - for chromosome in gene_name_to_start_end: - gene_locations_exons[chromosome] = [ - (locs, key) for key, locs in gene_name_to_start_end[chromosome].items() - ] - # Sort by starting location - gene_locations_exons[chromosome].sort(key=lambda x: x[0]) - return gene_locations_exons diff --git a/tools/scripts/sctools/src/sctools/metrics/README.md b/tools/scripts/sctools/src/sctools/metrics/README.md deleted file mode 100644 index 8ee554ae..00000000 --- a/tools/scripts/sctools/src/sctools/metrics/README.md +++ /dev/null @@ -1,59 +0,0 @@ -## Metric Processing -This module implements a metric suite that generates information on data quality at the level of -both cells and genes. This QC information aligns with the cells and genes that make up the -expression matrix, providing easy access to information that the user can examine to make decisions -about which cells or genes are of adequate quality to include in downstream processing. - -Metric processing in sctools can be run on large individual files, but also implements a map-reduce -architecture execution at production scale. Specifically, the workflow is as follows: - -1. Chunk the input bam file using `SplitBam`, which generates several chunks, each of which is -guaranteed to contain all data for any cell it contains -2. 
Sort each chunk by cell, gene, and molecule tags to ensure that all the reads associated with -a molecule are stored sequentially by cell (`CalculateCellMetrics`) or by gene -(`CalculateGeneMetrics`) -3. For each cell or gene, parse the information by molecule, which typically loads fewer than -10,000 records into memory at a time. -4. Merge data across chunks using `MergeCellMetrics` or `MergeGeneMetrics`. - -This map-reduce approach is currently implemented by the -[HCA 3' pipeline](https://github.com/HumanCellAtlas/skylab/blob/master/pipelines/optimus/Optimus.wdl), -but an abbreviated WDL could be made in the future which would contain: - -``` -1. SplitBamByCellBarcode -2. scatter[CalculateMetrics] -3. MergeMetrics -``` - -## Implementation Details: - -This module implements 4 base classes that carry out metric processing. These are: - -``` -MetricAggregator: - - CellMetricAggregator - - GeneMetricAggregator - -MetricGatherer: - - CellMetricGatherer - - GeneMetricGatherer - -MetricCSVWriter - -MergeMetrics: - - MergeCellMetrics - - MergeGeneMetrics -``` -MetricGatherer defines generator functions to group records into molecules, the bam parsing pattern -necessary to process data iteratively. - -MetricAggregator stores the information for a unit of the relevant data (cell, gene), -and processes all the records with the `.parse_molecule()` method. - -When all records of a single unit (cell, gene) have been processed, `.finalize()` is called to -calculate any higher-order metrics (for example, the variance in quality scores across reads of the -cell or gene), and it is written to file by `MetricCSVWriter`. - -MergeMetrics merges multiple metric outputs from the scattered chunks. This is a trivial -concatenation in the case of cell metrics, and a more complex merge in the case of gene metrics.
diff --git a/tools/scripts/sctools/src/sctools/metrics/__init__.py b/tools/scripts/sctools/src/sctools/metrics/__init__.py deleted file mode 100644 index 9ba20677..00000000 --- a/tools/scripts/sctools/src/sctools/metrics/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa -from . import aggregator -from . import gatherer -from . import merge diff --git a/tools/scripts/sctools/src/sctools/metrics/aggregator.py b/tools/scripts/sctools/src/sctools/metrics/aggregator.py deleted file mode 100644 index 2d85199d..00000000 --- a/tools/scripts/sctools/src/sctools/metrics/aggregator.py +++ /dev/null @@ -1,595 +0,0 @@ -""" -Sequence Metric Aggregators -=========================== - -.. currentmodule:: sctools.metrics - -This module provides classes useful for aggregating metric information for individual cells or -genes. These classes consume BAM files that have been pre-sorted such that all sequencing reads -that correspond to the molecules of a cell (CellMetrics) or the molecules of a gene (GeneMetrics) -are yielded sequentially. - -Classes -------- - -.. autosummary:: - :toctree: generated/ - - MetricAggregatorBase Aggregator Base Class - GeneMetrics Class to iteratively calculate metrics for a gene (by molecule) - CellMetrics Class to iteratively calculate metrics for a cell (by molecule) - -Notes ------ -This module can be rewritten with dataclass when python 3.7 stabilizes, see -https://www.python.org/dev/peps/pep-0557/ - - -See Also --------- -sctools.metrics.gatherer -sctools.metrics.merge -sctools.metrics.writer - -""" - - -from typing import Iterable, Tuple, Counter, List, Sequence - -import numpy as np -import pysam - -from sctools import consts -from sctools.stats import OnlineGaussianSufficientStatistic - - -class MetricAggregator: - """Metric Aggregator Base Class - - The ``MetricAggregator`` class defines a set of metrics that can be extracted from an - aligned bam file. It defines all the metrics that are general across genes and cells. 
This - class is subclassed by ``GeneMetrics`` and ``CellMetrics``, which define data-specific metrics - in the ``parse_extra_fields`` method. An instance of ``GeneMetrics`` or ``CellMetrics`` is - instantiated for each gene or molecule in a bam file, respectively. - - Attributes - ---------- - n_reads : int - The number of reads associated with this entity - noise_reads : int, NotImplemented - Number of reads that are categorized by 10x genomics cellranger as "noise". Refers to - long polymers, or reads with high numbers of N (ambiguous) nucleotides - perfect_molecule_barcodes : int - The number of reads with molecule barcodes that have no errors (cell barcode tag == raw barcode tag) - reads_mapped_exonic : int - The number of reads for this entity that are mapped to exons - reads_mapped_intronic : int - The number of reads for this entity that are mapped to introns - reads_mapped_utr : int - The number of reads for this entity that are mapped to 3' untranslated regions (UTRs) - reads_mapped_uniquely : int - The number of reads mapped to a single unambiguous location in the genome - reads_mapped_multiple : int - The number of reads mapped to multiple genomic positions with equal confidence - # todo make sure equal confidence is accurate - duplicate_reads : int - The number of reads that are duplicates (see README.md for defition of a duplicate) - spliced_reads : int - The number of reads that overlap splicing junctions - antisense_reads : int - The number of reads that are mapped to the antisense strand instead of the transcribed - strand - molecule_barcode_fraction_bases_above_30_mean : float - The average fraction of bases in molecule barcodes that receive quality scores greater than - 30 across the reads of this entity - molecule_barcode_fraction_bases_above_30_variance : float - The variance in the fraction of bases in molecule barcodes that receive quality scores - greater than 30 across the reads of this entity - 
genomic_reads_fraction_bases_quality_above_30_mean : float - The average fraction of bases in the genomic read that receive quality scores greater than - 30 across the reads of this entity (included for 10x cell ranger count comparison) - genomic_reads_fraction_bases_quality_above_30_variance : float - The variance in the fraction of bases in the genomic read that receive quality scores - greater than 30 across the reads of this entity (included for 10x cell ranger count - comparison) - genomic_read_quality_mean : float - Average quality of Illumina base calls in the genomic reads corresponding to this entity - genomic_read_quality_variance : float - Variance in quality of Illumina base calls in the genomic reads corresponding to this - entity - n_molecules : float - Number of molecules corresponding to this entity. See README.md for the definition of a - Molecule - n_fragments : float - Number of fragments corresponding to this entity. See README.md for the definition of a - Fragment - reads_per_molecule : float - The average number of reads associated with each molecule in this entity - reads_per_fragment : float - The average number of reads associated with each fragment in this entity - fragments_per_molecule : float - The average number of fragments associated with each molecule in this entity - fragments_with_single_read_evidence : int - The number of fragments associated with this entity that are observed by only one read - molecules_with_single_read_evidence : int - The number of molecules associated with this entity that are observed by only one read - - Methods - ------- - parse_extra_fields(tags, record), NotImplemented - Abstract method that must be implemented by subclasses. Called by ``parse_molecule()`` - to gather information for subclass-specific metrics - parse_molecule(tags, record) - Extract information from a set of sequencing reads that correspond to a molecule and store - the data in the MetricAggregator class. 
- finalize() - Some metrics cannot be calculated until all the information for an entity has been - aggregated, for example, the number of `fragments_per_molecule`. Finalize calculates all - such higher-order metrics - - """ - - def __init__(self): - - # type definitions - Chromosome: int - Strand: bool # reverse = True, see pysam.AlignedSegment.is_reverse - Position: int - Fragment: Tuple[Chromosome, Position, Strand] # noqa: F821 - - # count information - self.n_reads: int = 0 - self.noise_reads: int = 0 # long polymers, N-sequences; NotImplemented - self._fragment_histogram: Counter[Fragment] = Counter() # noqa: F821 - self._molecule_histogram: Counter[str] = Counter() - - # molecule information - self._molecule_barcode_fraction_bases_above_30 = ( - OnlineGaussianSufficientStatistic() - ) - self.perfect_molecule_barcodes = 0 - - self._genomic_reads_fraction_bases_quality_above_30 = ( - OnlineGaussianSufficientStatistic() - ) - self._genomic_read_quality = OnlineGaussianSufficientStatistic() - - # alignment location information - self.reads_mapped_exonic = 0 - self.reads_mapped_intronic = 0 - self.reads_mapped_utr = 0 - - # todo implement this once we have a gene model - # self.reads_mapped_outside_window = 0 # reads should be within 1000 bases of UTR - # self._read_distance_from_termination_site = OnlineGaussianSufficientStatistic() - - # alignment uniqueness information - self.reads_mapped_uniquely = 0 - self.reads_mapped_multiple = 0 - self.duplicate_reads = 0 - - # alignment splicing information - self.spliced_reads = 0 - self.antisense_reads = 0 - self._plus_strand_reads = 0 # strand balance # todo implement property here - - # higher-order methods, filled in by finalize() when all data is extracted - self.molecule_barcode_fraction_bases_above_30_mean: float = None - self.molecule_barcode_fraction_bases_above_30_variance: float = None - self.genomic_reads_fraction_bases_quality_above_30_mean: float = None - 
self.genomic_reads_fraction_bases_quality_above_30_variance: float = None - self.genomic_read_quality_mean: float = None - self.genomic_read_quality_variance: float = None - self.n_molecules: float = None - self.n_fragments: float = None - self.reads_per_molecule: float = None - self.reads_per_fragment: float = None - self.fragments_per_molecule: float = None - self.fragments_with_single_read_evidence: int = None - self.molecules_with_single_read_evidence: int = None - - @staticmethod - def _quality_string_to_numeric(quality_sequence: Iterable[str]) -> List[int]: - """Convert an HTSlib ASCII quality string to an integer representation. - - Parameters - ---------- - quality_sequence : Iterable[str] - An iterable of Illumina base call qualities in ASCII encoding - - Returns - ------- - numeric_qualities : List[int] - A list of Illumina base call qualities converted to integers - - """ - return [ - ord(c) - 33 for c in quality_sequence - ] # todo look up if this is accurate - - @staticmethod - def _quality_above_threshold( - threshold: int, quality_sequence: Sequence[int] - ) -> float: - """Calculate the fraction of bases called with a quality above ``threshold``. - - Parameters - ---------- - threshold: int - The quality threshold - quality_sequence: Sequence[int] - A sequence of Illumina base qualities - - Returns - ------- - fraction : float - The fraction of bases in ``quality_sequence`` with quality greater than ``threshold`` - - """ - return sum(1 for base in quality_sequence if base > threshold) / len( - quality_sequence - ) - - def _is_noise(self, record: pysam.AlignedSegment) -> bool: - return NotImplemented # todo required because 10x measures this - - def parse_molecule( - self, tags: Sequence[str], records: Iterable[pysam.AlignedSegment] - ) -> None: - """Parse information from all records of a molecule. - - The parsed information is stored in the MetricAggregator in-place. 
- - Parameters - ---------- - tags : Sequence[str] - all the tags that define this molecule. one of {[CB, GE, UB], [GE, CB, UB]} - records : Iterable[pysam.AlignedSegment] - the sam records associated with the molecule - - """ - for record in records: - - # todo think about how I could use the duplicate tag to reduce computation; duplicates - # should normally come in order in a sorted file - - # extract sub-class-specific information - self.parse_extra_fields(tags=tags, record=record) - - self.n_reads += 1 - # self.noise_reads += self.is_noise(record) # todo implement me - - # the tags passed to this function define a molecule, this increments the counter, - # identifying a new molecule only if a new tag combination is observed - self._molecule_histogram[tags] += 1 - - self._molecule_barcode_fraction_bases_above_30.update( - self._quality_above_threshold( - 30, - self._quality_string_to_numeric( - record.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY) - ), - ) - ) - - # we should be tolerant and handle it if the pysam.AlignedSegment.get_tag - # cannot retrieve the data by a tag since it's not a fatal error - try: - self.perfect_molecule_barcodes += record.get_tag( - consts.RAW_MOLECULE_BARCODE_TAG_KEY - ) == record.get_tag(consts.MOLECULE_BARCODE_TAG_KEY) - except KeyError: - # An error occurred while retrieving the data from the optional alighment section, which - # indicates that the read did not have a corrected UMI sequence. In the future we would like to - # keep track of these reads. 
- pass - - self._genomic_reads_fraction_bases_quality_above_30.update( - self._quality_above_threshold(30, record.query_alignment_qualities) - ) - - mean_alignment_quality: float = np.mean(record.query_alignment_qualities) - self._genomic_read_quality.update(mean_alignment_quality) - - # the remaining portions deal with aligned reads, so if the read is not mapped, we are - # done with it - if record.is_unmapped: - continue - - # get components that define a unique sequence fragment and increment the histogram - position: int = record.pos - strand: bool = record.is_reverse - reference: int = record.reference_id - self._fragment_histogram[reference, position, strand, tags] += 1 - - alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY) - if alignment_location == consts.CODING_ALIGNMENT_LOCATION_TAG_VALUE: - self.reads_mapped_exonic += 1 - elif alignment_location == consts.INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE: - self.reads_mapped_intronic += 1 - elif alignment_location == consts.UTR_ALIGNMENT_LOCATION_TAG_VALUE: - self.reads_mapped_utr += 1 - - # todo check if read maps outside window (needs gene model) - # todo create distances from terminate side (needs gene model) - - # uniqueness - number_mappings = record.get_tag(consts.NUMBER_OF_HITS_TAG_KEY) - if number_mappings == 1: - self.reads_mapped_uniquely += 1 - else: - self.reads_mapped_multiple += ( - 1 # todo without multi-mapping, this number is zero! 
- ) - - if record.is_duplicate: - self.duplicate_reads += 1 - - # cigar N field (3) indicates a read is spliced if the value is non-zero - cigar_stats, num_blocks = record.get_cigar_stats() - if cigar_stats[3]: - self.spliced_reads += 1 - - # todo figure out antisense and make this notation clearer; info likely in dropseqtools - self._plus_strand_reads += not record.is_reverse - - def parse_extra_fields( - self, tags: Sequence[str], record: pysam.AlignedSegment - ) -> None: - """Defined by subclasses to extract class-specific information from molecules""" - raise NotImplementedError - - def finalize(self) -> None: - """Calculate metrics that require information from all molecules of an entity - - ``finalize()`` replaces attributes in-place that were initialized by the constructor as - ``None`` with a value calculated across all molecule data that has been aggregated. - - """ - - self.molecule_barcode_fraction_bases_above_30_mean: float = self._molecule_barcode_fraction_bases_above_30.mean - - self.molecule_barcode_fraction_bases_above_30_variance: float = self._molecule_barcode_fraction_bases_above_30.calculate_variance() - - self.genomic_reads_fraction_bases_quality_above_30_mean: float = self._genomic_reads_fraction_bases_quality_above_30.mean - - self.genomic_reads_fraction_bases_quality_above_30_variance: float = self._genomic_reads_fraction_bases_quality_above_30.calculate_variance() - - self.genomic_read_quality_mean: float = self._genomic_read_quality.mean - - self.genomic_read_quality_variance: float = self._genomic_read_quality.calculate_variance() - - self.n_molecules: int = len(self._molecule_histogram.keys()) - - self.n_fragments: int = len(self._fragment_histogram.keys()) - - try: - self.reads_per_molecule: float = self.n_reads / self.n_molecules - except ZeroDivisionError: - self.reads_per_molecule: float = float("nan") - - try: - self.reads_per_fragment: float = self.n_reads / self.n_fragments - except ZeroDivisionError: - self.reads_per_fragment: 
class CellMetrics(MetricAggregator):
    """Cell Metric Aggregator

    Aggregator that captures metric information about a cell by parsing all of the molecules in
    an experiment that were annotated with a specific cell barcode, as recorded in the ``CB`` tag.

    Attributes
    ----------
    perfect_cell_barcodes : int
        The number of reads whose cell barcodes contain no errors (tag ``CB`` == ``CR``)
    reads_mapped_intergenic : int
        The number of reads mapped to an intergenic region for this cell
    reads_unmapped : int
        The number of reads for which no alignment location tag could be retrieved
    reads_mapped_too_many_loci : int
        The number of reads that were mapped to too many loci across the genome and as a
        consequence, are reported unmapped by the aligner
    cell_barcode_fraction_bases_above_30_variance : float
        The variance of the fraction of Illumina base calls for the cell barcode sequence that
        are greater than 30, across molecules
    cell_barcode_fraction_bases_above_30_mean : float
        The average fraction of Illumina base calls for the cell barcode sequence that
        are greater than 30, across molecules
    n_genes : int
        The number of genes detected by this cell
    genes_detected_multiple_observations : int
        The number of genes that are observed by more than one read in this cell
    n_mitochondrial_genes: int
        The number of mitochondrial genes detected by this cell
    n_mitochondrial_molecules: int
        The number of molecules from mitochondrial genes detected for this cell
    pct_mitochondrial_molecules: float
        The percentage of molecules from mitochondrial genes detected for this cell

    """

    extra_docs = """
    Examples
    --------
    # todo implement me

    See Also
    --------
    GeneMetrics

    """

    __doc__ += MetricAggregator.__doc__ + extra_docs

    def __init__(self):
        super().__init__()

        # NOTE: the order of the attribute assignments below is the order in which vars()
        # reports them, so do not reorder them casually

        # barcode quality data
        self._cell_barcode_fraction_bases_above_30 = OnlineGaussianSufficientStatistic()
        self.perfect_cell_barcodes = 0  # inv: fraction cells with errors

        # track non-transcriptomic reads
        self.reads_mapped_intergenic = 0
        self.reads_unmapped = 0
        self.reads_mapped_too_many_loci = 0

        self._genes_histogram = Counter()

        # todo think about whether we can build molecule models that map to things that aren't genes
        # i.e. to intergenic regions or intronic regions. This could be a part of multi-mapping
        # self.molecules_mapped_intergenic = 0

        # higher-order metrics, filled in by finalize()
        self.cell_barcode_fraction_bases_above_30_variance: float = None
        self.cell_barcode_fraction_bases_above_30_mean: float = None
        self.n_genes: int = None
        self.genes_detected_multiple_observations: int = None
        self.n_mitochondrial_genes: int = None
        self.n_mitochondrial_molecules: int = None
        self.pct_mitochondrial_molecules: float = None

    def finalize(self, mitochondrial_genes=None):
        """Calculate metrics that require information from all molecules of this cell

        Parameters
        ----------
        mitochondrial_genes : container of str, optional
            Gene identifiers to count as mitochondrial; they are compared against the gene tag
            recorded for each parsed record. If omitted, no gene is treated as mitochondrial.

        """
        # None sentinel instead of a mutable default argument shared between calls
        if mitochondrial_genes is None:
            mitochondrial_genes = frozenset()

        super().finalize()

        self.cell_barcode_fraction_bases_above_30_mean: float = (
            self._cell_barcode_fraction_bases_above_30.mean
        )

        self.cell_barcode_fraction_bases_above_30_variance: float = (
            self._cell_barcode_fraction_bases_above_30.calculate_variance()
        )

        # NOTE(review): records without a gene are stored under the key None, which is
        # counted here like any other gene - confirm that this is intended
        self.n_genes: int = len(self._genes_histogram.keys())

        self.genes_detected_multiple_observations: int = sum(
            1 for v in self._genes_histogram.values() if v > 1
        )

        self.n_mitochondrial_genes: int = sum(
            1 for g in self._genes_histogram.keys() if g in mitochondrial_genes
        )

        # NOTE(review): the histogram is incremented once per call to parse_extra_fields, so
        # the "molecule" totals below are sums of those per-call counts - confirm the unit
        self.n_mitochondrial_molecules: int = sum(
            c for g, c in self._genes_histogram.items() if g in mitochondrial_genes
        )

        if self.n_mitochondrial_molecules:
            tot_molecules = sum(self._genes_histogram.values())
            self.pct_mitochondrial_molecules = (
                self.n_mitochondrial_molecules / tot_molecules * 100.0
            )
        else:
            self.pct_mitochondrial_molecules = 0.00

    def parse_extra_fields(
        self, tags: Sequence[str], record: pysam.AlignedSegment
    ) -> None:
        """Parses a record to extract cell-specific information

        Cell-specific metric data is stored in-place in the MetricAggregator

        Parameters
        ----------
        tags : Sequence[str]
            The tags that define this molecule; the gene is read from ``tags[2]``
        record : pysam.AlignedSegment
            SAM record to be parsed

        """
        self._cell_barcode_fraction_bases_above_30.update(
            self._quality_above_threshold(
                30,
                self._quality_string_to_numeric(
                    record.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY)
                ),
            )
        )

        # Exclude reads that do not have a CB tag from the perfect_cell_barcodes count
        if record.has_tag(consts.CELL_BARCODE_TAG_KEY):
            raw_cell_barcode_tag = record.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY)
            cell_barcode_tag = record.get_tag(consts.CELL_BARCODE_TAG_KEY)
            self.perfect_cell_barcodes += raw_cell_barcode_tag == cell_barcode_tag

        # a record without the alignment location tag is counted as unmapped
        try:
            alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY)
        except KeyError:
            self.reads_unmapped += 1
        else:
            if alignment_location == consts.INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE:
                self.reads_mapped_intergenic += 1

        # todo track reads_mapped_too_many_loci after multi-alignment is done
        self._genes_histogram[tags[2]] += 1  # note that no gene == None
- - Attributes - ---------- - number_cells_detected_multiple : int - The number of cells which observe more than one read of this gene - number_cells_expressing : int - The number of cells that detect this gene - - """ - - extra_docs = """ - Examples - -------- - # todo implement me - - See Also - -------- - CellMetrics - - """ - - __doc__ += MetricAggregator.__doc__ + extra_docs - - def __init__(self): - super().__init__() - - self._cells_histogram = Counter() - # todo we don't tag exon right now. Not sure if we want to or not - # self._exon_histogram = Counter() - - self.number_cells_detected_multiple: int = None - self.number_cells_expressing: int = None - - def finalize(self): - super().finalize() - - self.number_cells_expressing: int = len(self._cells_histogram.keys()) - - self.number_cells_detected_multiple: int = sum( - 1 for c in self._cells_histogram.values() if c > 1 - ) - - def parse_extra_fields( - self, tags: Sequence[str], record: pysam.AlignedSegment - ) -> None: - """Parses a record to extract cell-specific information - - Cell-specific metric data is stored in-place in the MetricAggregator - - Parameters - ---------- - tags : Sequence[str] - The CB, UB and GE tags that define this molecule - record : pysam.AlignedSegment - SAM record to be parsed - - """ - self._cells_histogram[tags[1]] += 1 diff --git a/tools/scripts/sctools/src/sctools/metrics/gatherer.py b/tools/scripts/sctools/src/sctools/metrics/gatherer.py deleted file mode 100644 index 91f7287f..00000000 --- a/tools/scripts/sctools/src/sctools/metrics/gatherer.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -Sequence Metric Gatherers -========================= - -..currentmodule:: sctools.metrics - -This module defines classes to gather metrics across the cells or genes of an experiment and write -them to gzip-compressed csv files - -Classes -------- - -.. 
autosummary:: - :toctree: generated/ - - MetricGatherer Gatherer Base Class - GatherCellMetrics Class to gather metrics on all cells in an experiment - GatherGeneMetrics Class to gather metrics on all genes in an experiment - -See Also --------- -sctools.metrics.aggregator -sctools.metrics.merge -sctools.metrics.writer - -""" - -from contextlib import closing - -import pysam -from typing import Set - -from sctools.bam import iter_cell_barcodes, iter_genes, iter_molecule_barcodes -from sctools.metrics.aggregator import CellMetrics, GeneMetrics -from sctools.metrics.writer import MetricCSVWriter - - -class MetricGatherer: - """Gathers Metrics from an experiment - - Because molecules tend to have relatively small numbers of reads, the memory footprint of - this method is typically small (tens of megabytes). - - Parameters - ---------- - bam_file : str - the bam file containing the reads that metrics should be calculated from. Can be a chunk - of cells or an entire experiment - output_stem : str - the file stem for the gzipped csv output - - Methods - ------- - extract_metrics - extracts metrics from ``bam_file`` and writes them to output_stem.csv.gz - - """ - - def __init__( - self, - bam_file: str, - output_stem: str, - mitochondrial_gene_ids: Set[str] = set(), - compress: bool = True, - ): - self._bam_file = bam_file - self._output_stem = output_stem - self._compress = compress - self._mitochondrial_gene_ids = mitochondrial_gene_ids - - @property - def bam_file(self) -> str: - """the bam file that metrics are generated from""" - return self._bam_file - - def extract_metrics(self, mode="rb") -> None: - """extract metrics from the provided bam file and write the results to csv. - - Parameters - ---------- - mode : {'r', 'rb'}, default 'rb' - the open mode for pysam.AlignmentFile. 'r' indicates the input is a sam file, and 'rb' - indicates a bam file. 
- - """ - raise NotImplementedError - - -class GatherCellMetrics(MetricGatherer): - - extra_docs = """ - Notes - ----- - ``bam_file`` must be sorted by gene (``GE``), molecule (``UB``), and cell (``CB``), where gene - varies fastest. - - Examples - -------- - >>> from sctools.metrics.gatherer import GatherCellMetrics - >>> import os, tempfile - - >>> # example data - >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam' - >>> temp_dir = tempfile.mkdtemp() - >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True) - >>> g.extract_metrics() - - See Also - -------- - GatherGeneMetrics - - """ - - __doc__ += extra_docs - - def extract_metrics(self, mode: str = "rb") -> None: - """Extract cell metrics from self.bam_file - - Parameters - ---------- - mode : str, optional - Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb'). - - """ - # open the files - with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing( - MetricCSVWriter(self._output_stem, self._compress) - ) as cell_metrics_output: - - # write the header - cell_metrics_output.write_header(vars(CellMetrics())) - - # break up the bam file into sub-iterators over cell barcodes - for cell_iterator, cell_tag in iter_cell_barcodes( - bam_iterator=bam_iterator - ): - metric_aggregator = CellMetrics() - - # break up cell barcodes by molecule barcodes - for molecule_iterator, molecule_tag in iter_molecule_barcodes( - bam_iterator=cell_iterator - ): - - # break up molecule barcodes by gene ids - for gene_iterator, gene_tag in iter_genes( - bam_iterator=molecule_iterator - ): - - # process the data - metric_aggregator.parse_molecule( - tags=(cell_tag, molecule_tag, gene_tag), - records=gene_iterator, - ) - - # write a record for each cell - metric_aggregator.finalize( - mitochondrial_genes=self._mitochondrial_gene_ids - ) - cell_metrics_output.write(cell_tag, vars(metric_aggregator)) - - -class GatherGeneMetrics(MetricGatherer): - - extra_docs = 
""" - Notes - ----- - ``bam_file`` must be sorted by molecule (``UB``), cell (``CB``), and gene (``GE``), where - molecule varies fastest. - - Examples - -------- - >>> from sctools.metrics.gatherer import GatherCellMetrics - >>> import os, tempfile - - >>> # example data - >>> bam_file = os.path.abspath(__file__) + '../test/data/test.bam' - >>> temp_dir = tempfile.mkdtemp() - >>> g = GatherCellMetrics(bam_file=bam_file, output_stem=temp_dir + 'test', compress=True) - >>> g.extract_metrics() - - See Also - -------- - GatherGeneMetrics - - """ - - __doc__ += extra_docs - - def extract_metrics(self, mode: str = "rb") -> None: - """Extract gene metrics from self.bam_file - - Parameters - ---------- - mode : str, optional - Open mode for self.bam. 'r' -> sam, 'rb' -> bam (default = 'rb'). - - """ - # open the files - with pysam.AlignmentFile(self.bam_file, mode=mode) as bam_iterator, closing( - MetricCSVWriter(self._output_stem, self._compress) - ) as gene_metrics_output: - - # write the header - gene_metrics_output.write_header(vars(GeneMetrics())) - - # break up the bam file into sub-iterators over gene ids - for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator): - metric_aggregator = GeneMetrics() - - # in case of multi-genes ignore as in the counting stage - if gene_tag and len(gene_tag.split(",")) > 1: - continue - - # break up gene ids by cell barcodes - for cell_iterator, cell_tag in iter_cell_barcodes( - bam_iterator=gene_iterator - ): - - # break up cell barcodes by molecular barcodes - for molecule_iterator, molecule_tag in iter_molecule_barcodes( - bam_iterator=cell_iterator - ): - - # process the data - metric_aggregator.parse_molecule( - tags=(gene_tag, cell_tag, molecule_tag), - records=molecule_iterator, - ) - - # write a record for each gene id - metric_aggregator.finalize() - gene_metrics_output.write(gene_tag, vars(metric_aggregator)) diff --git a/tools/scripts/sctools/src/sctools/metrics/merge.py 
b/tools/scripts/sctools/src/sctools/metrics/merge.py deleted file mode 100644 index aa4d4831..00000000 --- a/tools/scripts/sctools/src/sctools/metrics/merge.py +++ /dev/null @@ -1,191 +0,0 @@ -""" -Merge Sequence Metrics -====================== - -..currentmodule:: sctools.metrics - -This module defines classes to merge multiple metrics files that have been gathered from bam files -containing disjoint sets of cells. This is a common use pattern, as sequencing datasets are often -chunked to enable horizontal scaling using scatter-gather patterns. - -Classes -------- -MergeMetrics Merge Metrics base class -MergeCellMetrics Class to merge cell metrics -MergeGeneMetrics Class to merge gene metrics - -See Also --------- -sctools.metrics.gatherer -sctools.metrics.aggregator -sctools.metrics.writer - -""" - -from typing import List, Sequence - -import pandas as pd -import numpy as np - - -class MergeMetrics: - """Merges multiple metrics files into a single gzip compressed csv file - - Parameters - ---------- - metric_files : Sequence[str] - metrics files to merge - output_file : str - file name for the merged output - - Methods - ------- - execute - merge metrics files - # todo this should probably be wrapped into __init__ to make this more like a function - - """ - - def __init__(self, metric_files: Sequence[str], output_file: str): - self._metric_files = metric_files - if not output_file.endswith(".csv.gz"): - output_file += ".csv.gz" - self._output_file = output_file - - def execute(self) -> None: - raise NotImplementedError # merge the metrics - - -class MergeCellMetrics(MergeMetrics): - def execute(self) -> None: - """Concatenate input cell metric files - - Since bam files that metrics are calculated from contain disjoint sets of cells, cell - metrics can simply be concatenated together. 
- - """ - metric_dataframes: List[pd.DataFrame] = [ - pd.read_csv(f, index_col=0) for f in self._metric_files - ] - concatenated_frame: pd.DataFrame = pd.concat(metric_dataframes, axis=0) - concatenated_frame.to_csv(self._output_file, compression="gzip") - - -class MergeGeneMetrics(MergeMetrics): - def execute(self) -> None: - """Merge input gene metric files - - The bam files that metrics are calculated from contain disjoint sets of cells, each - of which can measure the same genes. - As a result, the metric values must be summed (count based metrics) averaged over - (fractional, averge, or variance metrics) or recalculated (metrics that depend on other - metrics). - - """ - - count_data_to_sum = [ - "n_reads", - "noise_reads", - "perfect_molecule_barcodes", - "reads_mapped_exonic", - "reads_mapped_intronic", - "reads_mapped_utr", - "reads_mapped_uniquely", - "reads_mapped_multiple", - "duplicate_reads", - "spliced_reads", - "antisense_reads", - "n_molecules", - "n_fragments", - "fragments_with_single_read_evidence", - "molecules_with_single_read_evidence", - "number_cells_detected_multiple", - "number_cells_expressing", - ] - - sum_operations = {c: "sum" for c in count_data_to_sum} - - def weighted_average(data_frame: pd.DataFrame) -> pd.Series: - """Calculate the average of each metric, weighted by number of reads per chunk - - Parameters - ---------- - data_frame : pd.DataFrame - chunks x metrics data frame - - Returns - ------- - weighted_average_metrics : pd.Series - The average of each metric across chunks, weighted by the number of reads per chunk - - """ - weights = data_frame["n_reads"].values - - columns_to_average_by_read = [ - "molecule_barcode_fraction_bases_above_30_mean", - "molecule_barcode_fraction_bases_above_30_variance", - "genomic_reads_fraction_bases_quality_above_30_mean", - "genomic_reads_fraction_bases_quality_above_30_variance", - "genomic_read_quality_mean", - "genomic_read_quality_variance", - ] - - return pd.Series( - { - c: 
np.average(data_frame[c], weights=weights) - for c in columns_to_average_by_read - } - ) - - def recalculate_operation(data_frame) -> pd.DataFrame: - """Recalculate metrics that are dependent on other metric values - - Other metrics should be merged before this function is executed - - Parameters - ---------- - data_frame : pd.DataFrame - chunks x metrics data frame - - Returns - ------- - recalculated_metrics : pd.DataFrame - data frame containing recalculated metrics - - """ - return pd.DataFrame( - data={ - "reads_per_molecule": data_frame["n_reads"] - / data_frame["n_molecules"], - "fragments_per_molecule": data_frame["n_fragments"] - / data_frame["n_molecules"], - "reads_per_fragment": data_frame["n_reads"] - / data_frame["n_fragments"], - } - ) - - # pick one file as a nucleus and merge each subsequent dataframe into it - nucleus = pd.read_csv(self._metric_files[0], index_col=0) - for filename in self._metric_files[1:]: - leaf = pd.read_csv(filename, index_col=0) - - # concatenate this leaf with the nucleus metrics file - concatenated = pd.concat([nucleus, leaf], axis=0) - - # group all duplicate gene names together - grouped = concatenated.groupby(level=0, axis=0) - - # execute the merging operations - summed_columns = grouped.agg(sum_operations) - averaged_columns = grouped.apply(weighted_average) - - # stitch the columns back together, add the metrics that need to be recalculated - merged = pd.concat([summed_columns, averaged_columns], axis=1) - recalculated_columns = recalculate_operation(merged) - merged = pd.concat([merged, recalculated_columns], axis=1) - - # set as nucleus and continue - nucleus = merged - - # write the data - nucleus.to_csv(self._output_file, compression="gzip") diff --git a/tools/scripts/sctools/src/sctools/metrics/writer.py b/tools/scripts/sctools/src/sctools/metrics/writer.py deleted file mode 100644 index 2379418c..00000000 --- a/tools/scripts/sctools/src/sctools/metrics/writer.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -Metric 
Writers -============== - -..currentmodule:: sctools.metrics - -This module defines a class to write metrics to csv as the data is generated, cell by cell or gene -by gene. This strategy keeps memory usage low, as no more than a single molecule's worth of sam -records and one cell or gene's worth of metric data are in-memory at a time. - -Classes -------- -MetricCSVWriter Class to write metrics to file - -See Also --------- -sctools.metrics.gatherer -sctools.metrics.aggregator -sctools.metrics.merge - -""" -from typing import TextIO, List, Mapping, Any -from numbers import Number -import gzip - - -class MetricCSVWriter: - """Writes metric information iteratively to (optionally compressed) csv. - - Parameters - ---------- - output_stem : str - File stem for the output file. - compress : bool, optional - Whether or not to compress the output file (default = True). - - Methods - ------- - write_header - Write the metric header to file. - write - Write an array of cell or gene metrics to file. - close - Close the metric file. - - """ - - def __init__(self, output_stem: str, compress=True): - - # check and fix extension: - if compress: - if not output_stem.endswith(".csv.gz"): - output_stem += ".csv.gz" - else: - if not output_stem.endswith(".csv"): - output_stem += ".csv" - self._filename: str = output_stem - - # open the file - if compress: - self._open_fid: TextIO = gzip.open(self._filename, "wt") - else: - self._open_fid: TextIO = open(self._filename, "w") - self._header: List[str] = None - - @property - def filename(self) -> str: - """filename with correct suffix added""" - return self._filename - - def write_header(self, record: Mapping[str, Any]) -> None: - """Write the metric keys to file, producing the header line of the csv file. - - Parameters - ---------- - record : Mapping[str, Any] - Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance, - producing a dictionary of keys to metric values. 
- - """ - self._header = list(key for key in record.keys() if not key.startswith("_")) - self._open_fid.write("," + ",".join(self._header) + "\n") - - def write(self, index: str, record: Mapping[str, Number]) -> None: - """Write the array of metric values for a cell or gene to file. - - Parameters - ---------- - index : str - The name of the cell or gene that these metrics summarize - record : Mapping[str, Number] - Output of ``vars()`` called on an sctools.metrics.aggregator.MetricAggregator instance, - producing a dictionary of keys to metric values. - - """ - ordered_fields = [str(record[k]) for k in self._header] - - # genes and cells can be None, call repr to convert to string when this induces a TypeError - try: - self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n") - except TypeError: - index = repr(index) - self._open_fid.write(index + "," + ",".join(ordered_fields) + "\n") - - def close(self) -> None: - """Close the metrics file.""" - self._open_fid.close() diff --git a/tools/scripts/sctools/src/sctools/platform.py b/tools/scripts/sctools/src/sctools/platform.py deleted file mode 100644 index 460f26ac..00000000 --- a/tools/scripts/sctools/src/sctools/platform.py +++ /dev/null @@ -1,1126 +0,0 @@ -""" -Command Line Interface for SC Tools: -==================================== - -.. currentmodule:: sctools - -This module defines the command line interface for SC Tools. Tools are separated into those that -are specific to particular chemistries (e.g. Smart-seq 2) or experimental platforms (e.g. 10x -Genomics v2) and those that are general across any sequencing experiment. 
- -Currently, only general modules and those used for 10x v2 are implemented - -Classes -------- -GenericPlatform Class containing all general command line utilities -TenXV2 Class containing 10x v2 specific command line utilities - -""" - -import argparse -from typing import Iterable, List, Dict, Set, Optional, Sequence -from itertools import chain - -import pysam -from sctools import fastq, bam, metrics, count, consts, gtf, groups - - -class GenericPlatform: - """Platform-agnostic command line functions available in SC Tools. - - Platform-Agnostic Methods - ------------------------- - tag_sort_bam(): - sort a bam file by zero or more tags and then by queryname - verify_bam_sort(): - verifies whether bam file is correctly sorted by given list of zero or more tags, then queryname - split_bam() - split a bam file into subfiles of equal size - calculate_gene_metrics() - calculate information about genes captured by a sequencing experiment - calculate_cell_metrics() - calculate information about cells captured by a sequencing experiment - merge_gene_metrics() - merge multiple gene metrics files into a single output - merge_cell_metrics() - merge multiple cell metrics files into a single output - bam_to_count() - construct a compressed sparse row count file from a tagged, aligned bam file - merge_count_matrices() - merge multiple csr-format count matrices into a single csr matrix - group_qc_outputs() - aggregate Picard, HISAT2 and RSME QC statisitics - """ - - @classmethod - def tag_sort_bam(cls, args: Iterable = None) -> int: - """Command line entrypoint for sorting a bam file by zero or more tags, followed by queryname. - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). 
The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - description = "Sorts bam by list of zero or more tags, followed by query name" - parser = argparse.ArgumentParser(description=description) - parser.add_argument("-i", "--input_bam", required=True, help="input bamfile") - parser.add_argument("-o", "--output_bam", required=True, help="output bamfile") - parser.add_argument( - "-t", - "--tags", - nargs="+", - action="append", - help="tag(s) to sort by, separated by space, e.g. -t CB GE UB", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - tags = cls.get_tags(args.tags) - with pysam.AlignmentFile(args.input_bam, "rb") as f: - header = f.header - records = f.fetch(until_eof=True) - sorted_records = bam.sort_by_tags_and_queryname(records, tags) - with pysam.AlignmentFile(args.output_bam, "wb", header=header) as f: - for record in sorted_records: - f.write(record) - - return 0 - - @classmethod - def verify_bam_sort(cls, args: Iterable = None) -> int: - """Command line entrypoint for verifying bam is properly sorted by zero or more tags, followed by queryname. - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - description = "Verifies whether bam is sorted by the list of zero or more tags, followed by query name" - parser = argparse.ArgumentParser(description=description) - parser.add_argument("-i", "--input_bam", required=True, help="input bamfile") - parser.add_argument( - "-t", - "--tags", - nargs="+", - action="append", - help="tag(s) to use to verify sorting, separated by space, e.g. 
-t CB GE UB", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - tags = cls.get_tags(args.tags) - with pysam.AlignmentFile(args.input_bam, "rb") as f: - aligned_segments = f.fetch(until_eof=True) - sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(r, tags) - for r in aligned_segments - ) - bam.verify_sort(sortable_records, tags) - - print( - "{0} is correctly sorted by {1} and query name".format(args.input_bam, tags) - ) - return 0 - - @classmethod - def get_tags(cls, raw_tags: Optional[Sequence[str]]) -> Iterable[str]: - if raw_tags is None: - raw_tags = [] - # Flattens into single list when tags specified like -t A -t B -t C - return [t for t in chain.from_iterable(raw_tags)] - - @classmethod - def split_bam(cls, args: Iterable = None) -> int: - """Command line entrypoint for splitting a bamfile into subfiles of equal size. - - prints filenames of chunks to stdout - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-b", "--bamfile", nargs="+", required=True, help="input bamfile" - ) - parser.add_argument( - "-p", "--output-prefix", required=True, help="prefix for output chunks" - ) - parser.add_argument( - "-s", - "--subfile-size", - required=False, - default=1000, - type=float, - help="approximate size target for each subfile (in MB)", - ) - parser.add_argument( - "--num-processes", - required=False, - default=None, - type=int, - help="Number of processes to parallelize over", - ) - parser.add_argument( - "-t", - "--tags", - nargs="+", - help="tag(s) to split bamfile over. 
Tags are checked sequentially, " - "and tags after the first are only checked if the first tag is " - "not present.", - ) - parser.set_defaults(raise_missing=True) - parser.add_argument( - "--drop-missing", - action="store_false", - help="drop records without tag specified by -t/--tag (default " - "behavior is to raise an exception", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - filenames = bam.split( - args.bamfile, - args.output_prefix, - args.tags, - approx_mb_per_split=args.subfile_size, - raise_missing=args.drop_missing, - num_processes=args.num_processes, - ) - - print(" ".join(filenames)) - return 0 - - @classmethod - def calculate_gene_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for calculating gene metrics from a sorted bamfile. - - Writes metrics to .csv - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input-bam", required=True, help="Input bam file name." - ) - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - gene_metric_gatherer = metrics.gatherer.GatherGeneMetrics( - args.input_bam, args.output_filestem - ) - gene_metric_gatherer.extract_metrics() - return 0 - - @classmethod - def calculate_cell_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for calculating cell metrics from a sorted bamfile. 
- - Writes metrics to .csv - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", "--input-bam", required=True, help="Input bam file name." - ) - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - parser.add_argument( - "-a", - "--gtf-annotation-file", - required=False, - default=None, - help="gtf annotation file that bam_file was aligned against", - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - # load mitochondrial gene ids from the annotation file - mitochondrial_gene_ids: Set(str) = set() - if args.gtf_annotation_file: - mitochondrial_gene_ids = gtf.get_mitochondrial_gene_names( - args.gtf_annotation_file - ) - - cell_metric_gatherer = metrics.gatherer.GatherCellMetrics( - args.input_bam, args.output_filestem, mitochondrial_gene_ids - ) - cell_metric_gatherer.extract_metrics() - return 0 - - @classmethod - def merge_gene_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for merging multiple gene metrics files. - - Merges multiple metrics inputs into a single metrics file that matches the shape and - order of the generated count matrix. - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). 
The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument("metric_files", nargs="+", help="Input metric files") - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - merge = metrics.merge.MergeGeneMetrics(args.metric_files, args.output_filestem) - merge.execute() - return 0 - - @classmethod - def merge_cell_metrics(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for merging multiple cell metrics files. - - Merges multiple metrics inputs into a single metrics file that matches the shape and - order of the generated count matrix. - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument("metric_files", nargs="+", help="Input metric files") - parser.add_argument( - "-o", "--output-filestem", required=True, help="Output file stem." - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - merge = metrics.merge.MergeCellMetrics(args.metric_files, args.output_filestem) - merge.execute() - return 0 - - @classmethod - def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for constructing a count matrix from a tagged bam file. - - Constructs a count matrix from an aligned bam file sorted by cell barcode, molecule - barcode, and gene id. 
- - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.set_defaults( - cell_barcode_tag=consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag=consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag=consts.GENE_NAME_TAG_KEY, - sn_rna_seq_mode=False, - ) - parser.add_argument("-b", "--bam-file", help="input_bam_file", required=True) - parser.add_argument( - "-o", "--output-prefix", help="file stem for count matrix", required=True - ) - parser.add_argument( - "-a", - "--gtf-annotation-file", - required=True, - help="gtf annotation file that bam_file was aligned against", - ) - parser.add_argument( - "-c", - "--cell-barcode-tag", - help=f"tag that identifies the cell barcode (default = {consts.CELL_BARCODE_TAG_KEY})", - ) - parser.add_argument( - "-m", - "--molecule-barcode-tag", - help=f"tag that identifies the molecule barcode (default = {consts.MOLECULE_BARCODE_TAG_KEY})", - ) - parser.add_argument( - "-g", - "--gene-id-tag", - help=f"tag that identifies the gene name (default = {consts.GENE_NAME_TAG_KEY})", - ) - - parser.add_argument( - "-n", - "--sn-rna-seq-mode", - action="store_true", - help=f"snRNA Seq mode (default = False)", - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - # assume bam file unless the file explicitly has a sam suffix - open_mode = "r" if args.bam_file.endswith(".sam") else "rb" - - # load gene names from the annotation file - gene_name_to_index: Dict[str, int] = gtf.extract_gene_names( - args.gtf_annotation_file - ) - - # For snRNA-seq we need the extended gene information - if args.sn_rna_seq_mode: - gene_locations = gtf.extract_extended_gene_names(args.gtf_annotation_file) 
- else: - gene_locations = None - - matrix = count.CountMatrix.from_sorted_tagged_bam( - bam_file=args.bam_file, - gene_name_to_index=gene_name_to_index, - chromosomes_gene_locations_extended=gene_locations, - cell_barcode_tag=args.cell_barcode_tag, - molecule_barcode_tag=args.molecule_barcode_tag, - gene_name_tag=args.gene_id_tag, - open_mode=open_mode, - ) - matrix.save(args.output_prefix) - - return 0 - - @classmethod - def merge_count_matrices(cls, args: Iterable[str] = None) -> int: - """Command line entrypoint for constructing a count matrix from a tagged bam file. - - Constructs a count matrix from an aligned bam file sorted by cell barcode, molecule - barcode, and gene id. - - Parameters - ---------- - args : Iterable[str], optional - Arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--input-prefixes", - nargs="+", - help="prefix for count matrices to be concatenated. e.g. test_counts " - "for test_counts.npz, test_counts_col_index.npy, and test_counts_" - "row_index.npy", - ) - parser.add_argument( - "-o", "--output-stem", help="file stem for merged csr matrix", required=True - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - count_matrix = count.CountMatrix.merge_matrices(args.input_prefixes) - count_matrix.save(args.output_stem) - - return 0 - - @classmethod - def group_qc_outputs(cls, args: Iterable[str] = None) -> int: - """Commandline entrypoint for parsing picard metrics files, hisat2 and rsem statistics log files. - Parameters - ---------- - args: - file_names: array of files - output_name: prefix of output file name. - metrics_type: Picard, PicardTable, HISAT2, RSEM and Core. 
- Returns - ---------- - return: 0 - return if the program completes successfully. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", - "--file_names", - dest="file_names", - nargs="+", - required=True, - help="a list of files to be parsed out.", - ) - parser.add_argument( - "-o", - "--output_name", - dest="output_name", - required=True, - help="The output file name", - ) - parser.add_argument( - "-t", - "--metrics_type", - dest="metrics_type", - choices=["Picard", "PicardTable", "Core", "HISAT2", "RSEM"], - required=True, - help="a list of string to represent metrics types,such Picard, PicardTable, HISAT2,RSEM, Core", - ) - - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - - if args.metrics_type == "Picard": - groups.write_aggregated_picard_metrics_by_row( - args.file_names, args.output_name - ) - elif args.metrics_type == "PicardTable": - groups.write_aggregated_picard_metrics_by_table( - args.file_names, args.output_name - ) - elif args.metrics_type == "Core": - groups.write_aggregated_qc_metrics(args.file_names, args.output_name) - elif args.metrics_type == "HISAT2": - groups.parse_hisat2_log(args.file_names, args.output_name) - elif args.metrics_type == "RSEM": - groups.parse_rsem_cnt(args.file_names, args.output_name) - return 0 - - -class TenXV2(GenericPlatform): - """Command Line Interface for 10x Genomics v2 RNA-sequencing programs - - This class defines several methods that are created as CLI tools when sctools is installed - (see setup.py) - - Attributes - ---------- - cell_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the cell barcode and the tags to - assign the sequence and quality of the cell barcode - molecule_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the molecule barcode and the tags - to assign the sequence and quality of the molecule barcode - sample_barcode : fastq.EmbeddedBarcode - A 
data class that defines the start and end position of the sample barcode and the tags - to assign the sequence and quality of the sample barcode - - Methods - ------- - attach_barcodes() - Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse - (r2) bam file - - """ - - # 10x contains three barcodes embedded within sequencing reads. The below objects define the - # start and end points of those barcodes relative to the start of the sequence, and the - # GA4GH standard tags that the extracted barcodes should be labeled with in the BAM file. - cell_barcode = fastq.EmbeddedBarcode( - start=0, - end=16, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - molecule_barcode = fastq.EmbeddedBarcode( - start=16, - end=26, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - sample_barcode = fastq.EmbeddedBarcode( - start=0, - end=8, - quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY, - ) - - @classmethod - def _tag_bamfile( - cls, - input_bamfile_name: str, - output_bamfile_name: str, - tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator], - ) -> None: - """Adds tags from fastq file(s) to a bam file. - - Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from - `input_bamfile_name`, and writes the result to `output_bamfile_name` - - Parameters - ---------- - input_bamfile_name : str - input bam - output_bamfile_name : str - output bam - tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] - Iterable of generators that yield barcodes from fastq files - - """ - bam_tagger = bam.Tagger(input_bamfile_name) - bam_tagger.tag(output_bamfile_name, tag_generators) - - @classmethod - def _make_tag_generators( - cls, r1, i1=None, whitelist=None - ) -> List[fastq.EmbeddedBarcodeGenerator]: - """Create tag generators from fastq files. 
- - Tag generators are iterators that run over fastq records, they extract and yield all of the - barcodes embedded in each fastq record. For 10x, this means extracting the cell, umi, and - optionally, the sample barcode. - - Parameters - ---------- - r1 : str - forward fastq file - i1 : str, optional - index fastq file - whitelist : str, optional - A file that contains a list of acceptable cell barcodes - - Returns - ------- - tag_generators, List[EmbeddedBarcodeGenerator] - EmbeddedBarcodeGenerators containing barcodes from 10x fastq records - - """ - tag_generators = [] - - # generator for cell and molecule barcodes - if whitelist is not None: - tag_generators.append( - fastq.BarcodeGeneratorWithCorrectedCellBarcodes( - fastq_files=r1, - embedded_cell_barcode=cls.cell_barcode, - whitelist=whitelist, - other_embedded_barcodes=[cls.molecule_barcode], - ) - ) - else: - tag_generators.append( - fastq.EmbeddedBarcodeGenerator( - fastq_files=r1, - embedded_barcodes=[cls.cell_barcode, cls.molecule_barcode], - ) - ) - - # generator for sample barcodes - if i1 is not None: - tag_generators.append( - fastq.EmbeddedBarcodeGenerator( - fastq_files=i1, embedded_barcodes=[cls.sample_barcode] - ) - ) - return tag_generators - - @classmethod - def attach_barcodes(cls, args=None): - """Command line entrypoint for attaching barcodes to a bamfile. - - Parameters - ---------- - args : Iterable[str], optional - arguments list, for testing (see test/test_entrypoints.py for example). The default - value of None, when passed to `parser.parse_args` causes the parser to - read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--r1", - required=True, - help="read 1 fastq file for a 10x genomics v2 experiment", - ) - parser.add_argument( - "--u2", - required=True, - help="unaligned bam containing cDNA fragments. 
Can be converted from fastq read 2" - "using picard FastqToSam", - ) - parser.add_argument( - "--i1", - default=None, - help="(optional) i7 index fastq file for a 10x genomics experiment", - ) - parser.add_argument( - "-o", "--output-bamfile", required=True, help="filename for tagged bam" - ) - parser.add_argument( - "-w", - "--whitelist", - default=None, - help="optional cell barcode whitelist. If provided, corrected barcodes " - "will also be output when barcodes are observed within 1ED of a " - "whitelisted barcode", - ) - if args is not None: - args = parser.parse_args(args) - else: - args = parser.parse_args() - tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) - cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) - - return 0 - - -class BarcodePlatform(GenericPlatform): - """Command Line Interface for extracting and attaching barcodes with specified positions - generalizing TenXV2 attach barcodes - - Sample, cell and/or molecule barcodes can be extracted and attached to an unmapped bam when the - corresponding barcode's start position and and length are provided. 
The sample barcode is extracted - from the index i7 fastq file and the cell and molecule barcode are extracted from the r1 fastq file - - This class defines several methods that are created as CLI tools when sctools is installed - (see setup.py) - - Attributes - ---------- - cell_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the cell barcode and the tags to - assign the sequence and quality of the cell barcode - molecule_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the molecule barcode and the tags - to assign the sequence and quality of the molecule barcode - sample_barcode : fastq.EmbeddedBarcode - A data class that defines the start and end position of the sample barcode and the tags - to assign the sequence and quality of the sample barcode - - Methods - ------- - attach_barcodes() - Attach barcodes from the forward (r1) and optionally index (i1) fastq files to the reverse - (r2) bam file - - """ - - cell_barcode = None - molecule_barcode = None - sample_barcode = None - - @classmethod - def _validate_barcode_args(cls, args): - """Validates that the barcode start position is greater than 0 - - Parameters - ---------- - args : object - arguments list, The default value of None, when passed to `parser.parse_args` - causes the parser to read `sys.argv` - - Returns - ------- - args : object - return arguments list if valid - - """ - # check that if a barcode start position is provided, its length is also (and vice versa) - cls._validate_barcode_length_and_position( - args.cell_barcode_start_pos, args.cell_barcode_length - ) - cls._validate_barcode_length_and_position( - args.molecule_barcode_start_pos, args.molecule_barcode_length - ) - cls._validate_barcode_length_and_position( - args.sample_barcode_start_pos, args.sample_barcode_length - ) - - # check that an index fastq is provided sample barcode length and position are given - if args.i1 is None and 
args.sample_barcode_length: - raise argparse.ArgumentError( - "An i7 index fastq file must be given to attach a sample barcode" - ) - - # check that cell and molecule barcodes don't overlap - if args.cell_barcode_length and args.molecule_barcode_length: - cls._validate_barcode_input( - args.molecule_barcode_start_pos, - args.cell_barcode_start_pos + args.cell_barcode_length, - ) - - return args - - @classmethod - def _validate_barcode_length_and_position( - cls, barcode_start_position, barcode_length - ): - """Checks that either that both barcode length and position are given or that neither are given as arguments - - Parameters - ---------- - barcode_start_position : int - the user defined start position (base pairs) of the barcode - - barcode_length : int - the user defined length (base pairs) of the barcode - - Returns - ------- - given_value : int - return given value if valid - - """ - barcode_start_pos_exists = bool(barcode_start_position) or ( - barcode_start_position == 0 - ) - barcode_length_exists = bool(barcode_length) - # (XOR boolean logic) - if barcode_start_pos_exists != barcode_length_exists: - raise argparse.ArgumentError( - "Invalid position/length, both position and length must be provided by the user together" - ) - - @classmethod - def _validate_barcode_input(cls, given_value, min_value): - """Validates that the barcode input is greater than a min value - - Parameters - ---------- - given_value : int - the given value that must be greater than the min_value, - (barcode length or barcode starting position) - - min_value : int - the min value that the given_value must be greater than - - Returns - ------- - given_value : int - return given value if valid - - """ - if given_value < min_value: - raise argparse.ArgumentTypeError("Invalid barcode length/position") - return given_value - - @classmethod - def _validate_barcode_start_pos(cls, given_value): - """Validates that the barcode start position is greater than 0 - - Parameters - ---------- - 
given_value : Union[int, str] - the given start position of the barcode to validate - - Returns - ------- - given_value : int - returns the start position if it is valid - - """ - return cls._validate_barcode_input(int(given_value), 0) - - @classmethod - def _validate_barcode_length(cls, given_value): - """Validates that the barcode length is greater than 1 - - Parameters - ---------- - given_value : Union[int, str] - the given length of the barcode to validate - - Returns - ------- - given_value : int - returns the length if it is valid - - """ - return cls._validate_barcode_input(int(given_value), 1) - - @classmethod - def _tag_bamfile( - cls, - input_bamfile_name: str, - output_bamfile_name: str, - tag_generators: Iterable[fastq.EmbeddedBarcodeGenerator], - ) -> None: - """Adds tags from fastq file(s) to a bam file. - - Attaches tags extracted from fastq files by `tag_generators`, attaches them to records from - `input_bamfile_name`, and writes the result to `output_bamfile_name` - - Parameters - ---------- - input_bamfile_name : str - input bam - output_bamfile_name : str - output bam - tag_generators : Iterable[fastq.EmbeddedBarcodeGenerator] - Iterable of generators that yield barcodes from fastq files - - """ - bam_tagger = bam.Tagger(input_bamfile_name) - bam_tagger.tag(output_bamfile_name, tag_generators) - - @classmethod - def _make_tag_generators( - cls, r1, i1=None, whitelist=None - ) -> List[fastq.EmbeddedBarcodeGenerator]: - """Create tag generators from fastq files. - - Tag generators are iterators that run over fastq records, they extract and yield all of the - barcodes embedded in each fastq record. This means extracting the cell, umi, and/or the sample barcode. 
- - Parameters - ---------- - r1 : str - forward fastq file, where possibly the cell and/or molecule barcode is found - i1 : str, optional - index fastq file, where the sample barcode is found - whitelist : str, optional - A file that contains a list of acceptable cell barcodes - - Returns - ------- - tag_generators : List[EmbeddedBarcodeGenerator] - EmbeddedBarcodeGenerators containing barcodes from the given fastq - - """ - tag_generators = [] - barcode_args = {"fastq_files": r1} - - if i1: - sample_barcode_args = dict(barcode_args) - sample_barcode_args["embedded_barcodes"] = [cls.sample_barcode] - tag_generators.append(fastq.EmbeddedBarcodeGenerator(**sample_barcode_args)) - - if whitelist: - barcode_args["whitelist"] = whitelist - if cls.cell_barcode: - barcode_args["embedded_cell_barcode"] = cls.cell_barcode - if cls.molecule_barcode: - barcode_args["other_embedded_barcodes"] = [cls.molecule_barcode] - tag_generators.append( - fastq.BarcodeGeneratorWithCorrectedCellBarcodes(**barcode_args) - ) - - else: - # for all the barcodes that have a length and starting position specified - barcode_args["embedded_barcodes"] = [ - barcode - for barcode in [cls.cell_barcode, cls.molecule_barcode] - if barcode - ] - tag_generators.append(fastq.EmbeddedBarcodeGenerator(**barcode_args)) - - return tag_generators - - @classmethod - def attach_barcodes(cls, args=None): - """Command line entrypoint for attaching barcodes to a bamfile. 
- - Parameters - ---------- - args : Iterable[str], optional - arguments list, The default value of None, when passed to `parser.parse_args` - causes the parser to read `sys.argv` - - Returns - ------- - return_call : 0 - return call if the program completes successfully - - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--r1", - required=True, - help="read 1 fastq file, where the cell and molecule barcode is found", - ) - parser.add_argument( - "--u2", - required=True, - help="unaligned bam, can be converted from fastq read 2" - "using picard FastqToSam", - ) - parser.add_argument( - "-o", "--output-bamfile", required=True, help="filename for tagged bam" - ) - parser.add_argument( - "-w", - "--whitelist", - default=None, - help="optional cell barcode whitelist. If provided, corrected barcodes " - "will also be output when barcodes are observed within 1ED of a " - "whitelisted barcode", - ) - parser.add_argument( - "--i1", - default=None, - help="(optional) i7 index fastq file, where the sample barcode is found", - ) - parser.add_argument( - "--sample-barcode-start-position", - dest="sample_barcode_start_pos", - default=None, - help="the user defined start position (base pairs) of the sample barcode", - type=cls._validate_barcode_start_pos, - ) - parser.add_argument( - "--sample-barcode-length", - dest="sample_barcode_length", - default=None, - help="the user defined length (base pairs) of the sample barcode", - type=cls._validate_barcode_length, - ) - parser.add_argument( - "--cell-barcode-start-position", - dest="cell_barcode_start_pos", - default=None, - help="the user defined start position, in base pairs, of the cell barcode", - type=cls._validate_barcode_start_pos, - ) - parser.add_argument( - "--cell-barcode-length", - dest="cell_barcode_length", - default=None, - help="the user defined length, in base pairs, of the cell barcode", - type=cls._validate_barcode_length, - ) - parser.add_argument( - "--molecule-barcode-start-position", - 
dest="molecule_barcode_start_pos", - default=None, - help="the user defined start position, in base pairs, of the molecule barcode " - "(must be not overlap cell barcode if cell barcode is provided)", - type=cls._validate_barcode_start_pos, - ) - parser.add_argument( - "--molecule-barcode-length", - dest="molecule_barcode_length", - default=None, - help="the user defined length, in base pairs, of the molecule barcode", - type=cls._validate_barcode_length, - ) - - # parse and validate the args - if args: - args = parser.parse_args(args) - else: - args = parser.parse_args() - cls._validate_barcode_args(args) - - # if the length and there for the start pos have been given as args - # get the appropriate barcodes - if args.cell_barcode_length: - cls.cell_barcode = fastq.EmbeddedBarcode( - start=args.cell_barcode_start_pos, - end=args.cell_barcode_start_pos + args.cell_barcode_length, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - if args.molecule_barcode_length: - cls.molecule_barcode = fastq.EmbeddedBarcode( - start=args.molecule_barcode_start_pos, - end=args.molecule_barcode_start_pos + args.molecule_barcode_length, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - if args.sample_barcode_length: - cls.sample_barcode = fastq.EmbeddedBarcode( - start=args.sample_barcode_start_pos, - end=args.sample_barcode_start_pos + args.sample_barcode_length, - quality_tag=consts.QUALITY_SAMPLE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_SAMPLE_BARCODE_TAG_KEY, - ) - - # make the tags and attach the barcodes - tag_generators = cls._make_tag_generators(args.r1, args.i1, args.whitelist) - cls._tag_bamfile(args.u2, args.output_bamfile, tag_generators) - - return 0 diff --git a/tools/scripts/sctools/src/sctools/reader.py b/tools/scripts/sctools/src/sctools/reader.py deleted file mode 100644 index bc26f1cf..00000000 --- a/tools/scripts/sctools/src/sctools/reader.py +++ 
/dev/null @@ -1,204 +0,0 @@ -""" -Sequence File Iterators -======================= - -.. currentmodule:: sctools - -This module defines a general iterator and some helper functions for iterating over files -that contain sequencing data - -Methods -------- -infer_open(file_: str, mode: str) - helper function that determines the compression type of a file without relying on its extension -zip_readers(*readers, indices=None) - helper function that iterates over one or more readers, optionally extracting only the records - that correspond to indices - -Classes -------- -Reader Basic reader that loops over one or more input files. - -See Also --------- -sctools.gtf.Reader -sctools.fastq.Reader - -""" - -import os -import gzip -import bz2 -from copy import copy -from functools import partial -from typing import Callable, Iterable, Generator, Set, List - - -def infer_open(file_: str, mode: str) -> Callable: - """Helper function to infer the correct compression type of an input file - - Identifies files that are .gz or .bz2 compressed without requiring file extensions - - Parameters - ---------- - file_ : str - the file to open - mode : {'r', 'rb'} - the mode to open the file in. 
'r' returns strings, 'rb' returns bytes - - Returns - ------- - open_function : Callable - the correct open function for the file's compression with mode pre-set through functools - partial - - """ - with open(file_, "rb") as f: - data: bytes = f.read(3) - - # gz and bzip treat 'r' = bytes, 'rt' = string - if data[:2] == b"\x1f\x8b": # gzip magic number - inferred_openhook: Callable = gzip.open - inferred_mode: str = "rt" if mode == "r" else mode - - elif data == b"BZh": # bz2 magic number - inferred_openhook: Callable = bz2.open - inferred_mode: str = "rt" if mode == "r" else mode - - else: - inferred_openhook: Callable = open - inferred_mode: str = mode - - return partial(inferred_openhook, mode=inferred_mode) - - -class Reader: - """Basic reader object that seamlessly loops over multiple input files. - - Is subclassed to create readers for specific file types (e.g. fastq, gtf, etc.) - - Parameters - ---------- - files : Union[str, List], optional - The file(s) to read. If '-', read sys.stdin (default = '-') - mode : {'r', 'rb'}, optional - The open mode for files. If 'r', yield string data, if 'rb', yield bytes data - (default = 'r'). - header_comment_char : str, optional - If not None, skip lines beginning with this character (default = None). 
- - """ - - def __init__(self, files="-", mode="r", header_comment_char=None): - if isinstance(files, str): - self._files = [files] - elif isinstance(files, Iterable): # test items of iterable - files = list(files) - if all(isinstance(f, str) for f in files): - self._files = files - else: - raise TypeError("All passed files must be type str") - else: - raise TypeError("Files must be a string filename or a list of such names.") - - # set open mode: - if mode not in {"r", "rb"}: - raise ValueError("Mode must be one of 'r', 'rb'") - self._mode = mode - - if isinstance(header_comment_char, str) and mode == "rb": - self._header_comment_char = header_comment_char.encode() - else: - self._header_comment_char = header_comment_char - - @property - def filenames(self) -> List[str]: - return self._files - - def __len__(self): - """Return the length of the Reader object. - - Notes - ----- - This function requires reading the complete file, and should typically not be - used with sys.stdin, as it will consume the input. - - """ - return sum(1 for _ in self) - - def __iter__(self): - for file_ in self._files: - - f = infer_open(file_, self._mode)(file_) - - # iterate over the file, dropping header lines if requested - try: - file_iterator = iter(f) - if self._header_comment_char is not None: - first_record = next(file_iterator) - while first_record.startswith(self._header_comment_char): - first_record = next(file_iterator) - - yield first_record # avoid loss of first non-comment line - - for record in file_iterator: # now, run to exhaustion - yield record - finally: # clean up - f.close() - - @property - def size(self) -> int: - """return the collective size of all files being read in bytes""" - return sum(os.stat(f).st_size for f in self._files) - - def select_record_indices(self, indices: Set) -> Generator: - """Iterate over provided indices only, skipping other records. 
- - Parameters - ---------- - indices : Set[int] - indices to include in the output - - Yields - ------ - record, str - records from file corresponding to indices - - """ - indices = copy( - indices - ) # passed indices is a reference, need own copy to modify - for idx, record in enumerate(self): - if idx in indices: - yield record - indices.remove(idx) - - # stopping condition - if not indices: - break - - -def zip_readers(*readers, indices=None) -> Generator: - """Zip together multiple reader objects, yielding records simultaneously. - - If indices is passed, only return lines in file that correspond to indices - - Parameters - ---------- - *readers : List[Reader] - Reader objects to simultaneously iterate over - indices : Set[int], optional - indices to include in the output - - Yields - ------ - records : Tuple[str] - one record per reader passed - - """ - if indices: - iterators = zip(*(r.select_record_indices(indices) for r in readers)) - else: - iterators = zip(*readers) - for record_tuple in iterators: - yield record_tuple diff --git a/tools/scripts/sctools/src/sctools/stats.py b/tools/scripts/sctools/src/sctools/stats.py deleted file mode 100644 index a303f5fd..00000000 --- a/tools/scripts/sctools/src/sctools/stats.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Statistics Functions for Sequence Data Analysis -=============================================== - -.. 
currentmodule:: sctools - -This module implements statistical modules for sequence analysis - -Methods -------- -base4_entropy(x: np.array, axis: int=1) - calculate the entropy of a 4 x sequence length base frequency matrix - -Classes -------- -OnlineGaussianSuficientStatistic Empirical (online) calculation of mean and variance - -""" - -from typing import Tuple -import numpy as np - - -def base4_entropy(x, axis=1): - """Calculate entropy in base four of a data matrix x - - Useful for measuring DNA entropy (with 4 nucleotides) as the output is restricted to [0, 1] - - Parameters - ---------- - x : np.ndarray - array of dimension one or more containing numeric types - axis : int, optional - axis to calculate entropy across. Values in this axis are treated as observation frequencies - - Returns - ------- - entropy : np.ndarray - array of input dimension - 1 containin entropy values bounded in [0, 1] - - """ - - # convert to probabilities - if axis == 1: - x = np.divide(x, np.sum(x, axis=axis)[:, None]) - else: - x = np.divide(x, np.sum(x, axis=axis)) - - with np.errstate(divide="ignore"): - r = np.log(x) / np.log(4) - - # convention: 0 * log(0) = 0, != -INF. 
- r[np.isinf(r)] = 0 - - return np.abs(-1 * np.sum(x * r, axis=axis)) - - -class OnlineGaussianSufficientStatistic: - """ - Implementation of Welford's online mean and variance algorithm - - Methods - ------- - update(new_value: float) - incorporate new_value into the online estimate of mean and variance - mean() - return the mean value - calculate_variance() - calculate and return the variance - mean_and_variance() - return both mean and variance - - """ - - __slots__ = ["_count", "_mean", "_mean_squared_error"] - - def __init__(self): - self._mean_squared_error: float = 0.0 - self._mean: float = 0.0 - self._count: int = 0 - - def update(self, new_value: float) -> None: - self._count += 1 - delta = new_value - self._mean - self._mean += delta / self._count - delta2 = new_value - self._mean - self._mean_squared_error += delta * delta2 - - @property - def mean(self) -> float: - """return the mean value""" - return self._mean - - def calculate_variance(self): - """calculate and return the variance""" - if self._count < 2: - return float("nan") - else: - return self._mean_squared_error / (self._count - 1) - - def mean_and_variance(self) -> Tuple[float, float]: - """calculate and return the mean and variance""" - return self.mean, self.calculate_variance() diff --git a/tools/scripts/sctools/src/sctools/test/__init__.py b/tools/scripts/sctools/src/sctools/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tools/scripts/sctools/src/sctools/test/characterize-cell-testing-data.ipynb b/tools/scripts/sctools/src/sctools/test/characterize-cell-testing-data.ipynb deleted file mode 100644 index 37fc8747..00000000 --- a/tools/scripts/sctools/src/sctools/test/characterize-cell-testing-data.ipynb +++ /dev/null @@ -1,1057 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load Testing Data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "run_control": { - "frozen": false, - 
"read_only": false - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import pysam\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:24: DeprecationWarning: tostring() is deprecated. Use tobytes() instead.\n" - ] - } - ], - "source": [ - "def parse_record(record):\n", - " \"\"\"line parser to build dataframe, supports missing tags in test data\"\"\"\n", - " data = {\n", - " 'qname': record.query_name,\n", - " 'flag': record.flag,\n", - " 'reference': record.reference_id,\n", - " 'position': record.pos,\n", - " 'mapq': record.query_alignment_qualities,\n", - " 'cigar': record.cigarstring,\n", - " 'rnext': record.rnext, \n", - " 'pnext': record.pnext,\n", - " 'tlen': record.tlen, \n", - " 'sequence': record.seq,\n", - " 'quality': record.qual,\n", - " }\n", - " for name, tag in record.get_tags():\n", - " data[name] = tag\n", - " return pd.Series(data)\n", - "\n", - "input_sam_file = 'data/small-cell-sorted.bam'\n", - "with pysam.AlignmentFile(input_sam_file, 'rb') as f:\n", - " records = []\n", - " for record in f:\n", - " records.append(parse_record(record))\n", - "\n", - "data = pd.concat(records, axis=1).T\n", - "\n", - "results_scalar = {} # will hold the calculations we make\n", - "\n", - "# add a strand field\n", - "data['strand'] = [f & 16 for f in data['flag']]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build Expectations for Testing Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Reads" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "656\n" - ] - } - ], - "source": [ - "results_scalar['n_reads'] = len(data)\n", - "print(results_scalar['n_reads'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Genes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "11\n" - ] - } - ], - "source": [ - "results_scalar['n_genes'] = len(data.groupby(['GE']))\n", - "print(results_scalar['n_genes'])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.9827586206896552\n" - ] - } - ], - "source": [ - "mean_n_genes = data.groupby(['CB']).apply(lambda x: len(set(x['GE']))).mean()\n", - "print(mean_n_genes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Gene table should have 8 entries plus a header for a total of 9 lines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Molecules\n", - "\n", - "Molecules are defined as a unique triplet of CB, UB, and GE" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "249\n" - ] - } - ], - "source": [ - "results_scalar['n_molecules'] = len(data.groupby(['CB', 'UB', 'GE']))\n", - "print(results_scalar['n_molecules'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Fragments\n", - "\n", - "Fragments are defined as molecules are (CB, UB, GE) but must additionally have a unique position" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "run_control": { - "frozen": false, - 
"read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "499\n" - ] - } - ], - "source": [ - "results_scalar['n_fragments'] = len(data.groupby(['CB', 'UB', 'GE', 'position']))\n", - "print(results_scalar['n_fragments'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Most Abundant Gene\n", - "\n", - "Based on the above, at least one of the genes has to be observed more than once. Which is it? " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MTATP6P1 300\n" - ] - } - ], - "source": [ - "results_scalar['most_abundant'] = data.groupby(['GE']).size().idxmax()\n", - "results_scalar['most_abundant_gene_n_observations'] = data.groupby(['GE']).size().max()\n", - "print(results_scalar['most_abundant'], results_scalar['most_abundant_gene_n_observations'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cell with most reads" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "94" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.groupby(['CB']).apply(lambda x: len(x)).max()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## perfect molecule barcodes" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['perfect_molecule_barcodes'] = 0\n", - "for c, r in zip(data['UB'], data['UR']):\n", - " if c == r:\n", - " results_scalar['perfect_molecule_barcodes'] += 1" - ] - }, - { - "cell_type": "code", - "execution_count": 11, 
- "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "655" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_scalar['perfect_molecule_barcodes']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the alignment metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_exonic'] = sum(data['XF'] == 'CODING')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_intronic'] = sum(data['XF'] == 'INTRONIC')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_utr'] = sum(data['XF'] == 'UTR')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_uniquely'] = sum(data['NH'] == 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['duplicate_reads'] = sum((data['flag'] & 1024).astype(bool))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['spliced_reads'] = sum(1 for v in data['cigar'] if 'N' in v)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "run_control": { 
- "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicate_reads': 107,\n", - " 'most_abundant': 'MTATP6P1',\n", - " 'most_abundant_gene_n_observations': 300,\n", - " 'n_fragments': 499,\n", - " 'n_genes': 11,\n", - " 'n_molecules': 249,\n", - " 'n_reads': 656,\n", - " 'perfect_molecule_barcodes': 655,\n", - " 'reads_mapped_exonic': 609,\n", - " 'reads_mapped_intronic': 28,\n", - " 'reads_mapped_uniquely': 656,\n", - " 'reads_mapped_utr': 19,\n", - " 'spliced_reads': 2}" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_scalar" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the higher-order metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "calc_func_fraction_from_acii = lambda x: sum(1 for c in x if ord(c) > 63) / len(x)\n", - "calc_func_fraction = lambda x: sum(1 for c in x if c > 30) / len(x)\n", - "calc_func_mean = lambda x: np.mean([c for c in x])\n", - "\n", - "data['num_UY_qual_fraction'] = data['UY'].apply(calc_func_fraction_from_acii)\n", - "\n", - "data['num_base_qual_fraction'] = data['mapq'].apply(calc_func_fraction)\n", - "data['num_base_qual_mean'] = data['mapq'].apply(calc_func_mean)\n", - "\n", - "grouped_by_cell = data.groupby(['CB'])" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_series = {}" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# vector values\n", - "# I changed these to retain the index to make merging into a dataframe easier, and guarantee same order. 
\n", - "results_series['molecule_barcode_fraction_bases_above_30_mean'] = grouped_by_cell.mean()['num_UY_qual_fraction']\n", - "results_series['molecule_barcode_fraction_bases_above_30_variance'] = grouped_by_cell.var()['num_UY_qual_fraction']\n", - "\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_mean'] = grouped_by_cell.mean()['num_base_qual_fraction']\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_variance'] = grouped_by_cell.var()['num_base_qual_fraction']\n", - "results_series['genomic_read_quality_mean'] = grouped_by_cell.mean()['num_base_qual_mean']\n", - "results_series['genomic_read_quality_variance'] = grouped_by_cell.var()['num_base_qual_mean']\n", - "\n", - "reads_per_cell = data.groupby(['CB']).size()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AS 96\n", - "CB AAACCTGAGAAACCTA\n", - "CR AAACCTGAGAAACCTA\n", - "CY AAFFFJJJJJJJJJJJ\n", - "GE NaN\n", - "GS NaN\n", - "HI 1\n", - "MD 98\n", - "NH 1\n", - "NM 0\n", - "RG A\n", - "SR GTAATTGC\n", - "SY AAAFFJ\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# compare two numpy arrays that are slightly different\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0meps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m11\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1e-8\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mallclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompare_me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcompare_me\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0meps\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mValueError\u001b[0m: operands 
could not be broadcast together with shapes (58,) (11,) " - ] - } - ], - "source": [ - "# compare two numpy arrays that are slightly different\n", - "eps = np.random.rand(11) * 1e-8\n", - "np.allclose(compare_me, compare_me + eps)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# it is actually discriminative, though\n", - "np.allclose(compare_me, np.arange(11))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Look at the metrics output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "cell_metrics = pd.read_csv('data/cell_metrics.csv', index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "cell_metrics['n_genes']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "!cat data/cell_metrics.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "hide_input": false, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "nav_menu": {}, - "toc": { - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 6, - "toc_cell": 
false, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tools/scripts/sctools/src/sctools/test/characterize-gene-testing-data.ipynb b/tools/scripts/sctools/src/sctools/test/characterize-gene-testing-data.ipynb deleted file mode 100644 index a6a31002..00000000 --- a/tools/scripts/sctools/src/sctools/test/characterize-gene-testing-data.ipynb +++ /dev/null @@ -1,1159 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load Testing Data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import pysam\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:24: DeprecationWarning: tostring() is deprecated. 
Use tobytes() instead.\n" - ] - } - ], - "source": [ - "def parse_record(record):\n", - " \"\"\"line parser to build dataframe, supports missing tags in test data\"\"\"\n", - " data = {\n", - " 'qname': record.query_name,\n", - " 'flag': record.flag,\n", - " 'reference': record.reference_id,\n", - " 'position': record.pos,\n", - " 'mapq': record.query_alignment_qualities,\n", - " 'cigar': record.cigarstring,\n", - " 'rnext': record.rnext, \n", - " 'pnext': record.pnext,\n", - " 'tlen': record.tlen, \n", - " 'sequence': record.seq,\n", - " 'quality': record.qual,\n", - " }\n", - " for name, tag in record.get_tags():\n", - " data[name] = tag\n", - " return pd.Series(data)\n", - "\n", - "input_sam_file = 'data/small-gene-sorted.bam'\n", - "with pysam.AlignmentFile(input_sam_file, 'rb') as f:\n", - " records = []\n", - " for record in f:\n", - " records.append(parse_record(record))\n", - "\n", - "data = pd.concat(records, axis=1).T\n", - "\n", - "results_scalar = {} # will hold the calculations we make" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build Expectations for Testing Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Reads" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "300\n" - ] - } - ], - "source": [ - "results_scalar['n_reads'] = len(data)\n", - "print(results_scalar['n_reads'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Genes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8\n" - ] - } - ], - "source": [ - "results_scalar['n_genes'] = len(data.groupby(['GE']))\n", - 
"print(results_scalar['n_genes'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Gene table should have 8 entries plus a header for a total of 9 lines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Molecules\n", - "\n", - "Molecules are defined as a unique triplet of CB, UB, and GE" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "88\n" - ] - } - ], - "source": [ - "results_scalar['n_molecules'] = len(data.groupby(['CB', 'UB', 'GE']))\n", - "print(results_scalar['n_molecules'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Number of Fragments\n", - "\n", - "Fragments are defined as molecules are (CB, UB, GE) but must additionally have a unique position" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "217\n" - ] - } - ], - "source": [ - "results_scalar['n_fragments'] = len(data.groupby(['CB', 'UB', 'GE', 'position']))\n", - "print(results_scalar['n_fragments'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Most Abundant Gene\n", - "\n", - "Based on the above, at least one of the genes has to be observed more than once. Which is it? 
" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "AL627309.7 245\n" - ] - } - ], - "source": [ - "results_scalar['most_abundant'] = data.groupby(['GE']).size().idxmax()\n", - "results_scalar['most_abundant_gene_n_observations'] = data.groupby(['GE']).size().max()\n", - "print(results_scalar['most_abundant'], results_scalar['most_abundant_gene_n_observations'])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['perfect_molecule_barcodes'] = 0\n", - "for c, r in zip(data['UB'], data['UR']):\n", - " if c == r:\n", - " results_scalar['perfect_molecule_barcodes'] += 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the alignment metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'most_abundant': 'AL627309.7',\n", - " 'most_abundant_gene_n_observations': 245,\n", - " 'n_fragments': 217,\n", - " 'n_genes': 8,\n", - " 'n_molecules': 88,\n", - " 'n_reads': 300,\n", - " 'perfect_molecule_barcodes': 300}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_scalar" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_exonic'] = sum(data['XF'] == 'CODING')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - 
"results_scalar['reads_mapped_intronic'] = sum(data['XF'] == 'INTRONIC')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_utr'] = sum(data['XF'] == 'UTR')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['reads_mapped_uniquely'] = sum(data['NH'] == 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['duplicate_reads'] = sum((data['flag'] & 1024).astype(bool))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "results_scalar['spliced_reads'] = sum(1 for v in data['cigar'] if 'N' in v)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the higher-order metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "calc_func_fraction_from_acii = lambda x: sum(1 for c in x if ord(c) > 63) / len(x)\n", - "calc_func_fraction = lambda x: sum(1 for c in x if c > 30) / len(x)\n", - "calc_func_mean = lambda x: np.mean([c for c in x])\n", - "\n", - "data['num_UY_qual_fraction'] = data['UY'].apply(calc_func_fraction_from_acii)\n", - "\n", - "data['num_base_qual_fraction'] = data['mapq'].apply(calc_func_fraction)\n", - "data['num_base_qual_mean'] = data['mapq'].apply(calc_func_mean)\n", - "\n", - "grouped_by_gene = data.groupby(['GE'])" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "run_control": { - "frozen": false, - 
"read_only": false - } - }, - "outputs": [], - "source": [ - "results_series = {}" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# vector values\n", - "# I changed these to retain the index to make merging into a dataframe easier, and guarantee same order. \n", - "results_series['molecule_barcode_fraction_bases_above_30_mean'] = grouped_by_gene.mean()['num_UY_qual_fraction']\n", - "results_series['molecule_barcode_fraction_bases_above_30_variance'] = grouped_by_gene.var()['num_UY_qual_fraction']\n", - "\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_mean'] = grouped_by_gene.mean()['num_base_qual_fraction']\n", - "results_series['genomic_reads_fraction_bases_quality_above_30_variance'] = grouped_by_gene.var()['num_base_qual_fraction']\n", - "results_series['genomic_read_quality_mean'] = grouped_by_gene.mean()['num_base_qual_mean']\n", - "results_series['genomic_read_quality_variance'] = grouped_by_gene.var()['num_base_qual_mean']\n", - "\n", - "reads_per_gene = data.groupby(['GE']).size()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "molecules_per_gene = grouped_by_gene.apply(lambda x: len(x.groupby(['UB', 'CB']).size()))\n", - "fragments_per_gene = grouped_by_gene.apply(lambda x: len(x.groupby(['UB', 'CB', 'position']).size()))\n", - "reads_per_molecule = reads_per_gene / molecules_per_gene\n", - "reads_per_fragment = reads_per_gene / fragments_per_gene\n", - "fragments_per_molecule = fragments_per_gene / molecules_per_gene\n", - "results_series['reads_per_molecule'] = reads_per_molecule\n", - "results_series['reads_per_fragment'] = reads_per_fragment\n", - "results_series['fragments_per_molecule'] = fragments_per_molecule\n", - "\n", - "# scalar values\n", - 
"results_scalar['fragments_with_single_read_evidence'] = np.sum(data.groupby(['CB', 'UB', 'GE', 'position']).size() == 1)\n", - "results_scalar['molecules_with_single_read_evidence'] = np.sum(data.groupby(['CB', 'UB', 'GE']).size() == 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "fragments_per_molecule np.array([1.0000, 1.0000, 1.0000, 1.8750, 2.9831, 1.2500, 1.0000, 1.3077])\n", - "genomic_read_quality_mean np.array([36.2143, 24.8469, 25.4792, 35.3664, 34.0956, 33.0364, 20.7423, 27.3078])\n", - "genomic_read_quality_variance np.array([nan, nan, nan, 18.4553, 21.6745, 33.6572, nan, 53.5457])\n", - "genomic_reads_fraction_bases_quality_above_30_mean np.array([0.8878, 0.3980, 0.4271, 0.8148, 0.7681, 0.7216, 0.1546, 0.5089])\n", - "genomic_reads_fraction_bases_quality_above_30_variance np.array([nan, nan, nan, 0.0282, 0.0346, 0.0537, nan, 0.0849])\n", - "molecule_barcode_fraction_bases_above_30_mean np.array([1.0000, 1.0000, 0.8000, 0.9885, 0.9833, 0.9857, 0.7000, 0.9444])\n", - "molecule_barcode_fraction_bases_above_30_variance np.array([nan, nan, nan, 0.0011, 0.0051, 0.0014, nan, 0.0120])\n", - "reads_per_fragment np.array([1.0000, 1.0000, 1.0000, 1.7333, 1.3920, 1.4000, 1.0000, 1.0588])\n", - "reads_per_molecule np.array([1.0000, 1.0000, 1.0000, 3.2500, 4.1525, 1.7500, 1.0000, 1.3846])\n" - ] - } - ], - "source": [ - "# write out the array information for the testing file\n", - "for k, vals in pd.DataFrame(results_series).iteritems():\n", - " print(k, 'np.array([' + ', '.join('{:.4f}'.format(i) for i in vals.values) + '])')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Write Results to File for Automated Testing" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - 
}, - "outputs": [], - "source": [ - "pd.Series(results_scalar).to_csv('%s_testing_knowledge_scalar.csv' % input_sam_file.replace('.bam', ''))\n", - "pd.DataFrame(results_series).to_csv('%s_testing_knowledge_series.csv' % input_sam_file.replace('.bam', ''))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# do a comparison of the whole 2d dataframe at once\n", - "np.allclose(\n", - " pd.DataFrame(results_series).fillna(0).values, # fill nans with zero, call values to get the numpy array the dataframe is based on\n", - " pd.read_csv('data/small-gene-sorted_testing_knowledge_series.csv', index_col=0, header=0).fillna(0).values\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# to get most_abundant alone: " - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "test_read_scalar = pd.read_csv('data/small-gene-sorted_testing_knowledge_scalar.csv', index_col=0, header=None, squeeze=True)\n", - "\n", - "# extract this, we're going to drop it from the array to do some conversion to numeric\n", - "most_abundant = test_read_scalar['most_abundant'] \n", - "\n", - "# drop most abundant, convert to float, fill any NaN values with 0, and call .values to get the numpy array pandas objects are based on.\n", - "for_comparison = test_read_scalar.drop('most_abundant').astype(float).fillna(0).values\n", - "\n", - "\n", - "# note, 
have to drop the string value and convert to float before this works. \n", - "np.allclose(\n", - " pd.Series(results_scalar).drop('most_abundant').fillna(0).values, # do the same thing as above to the one in memory\n", - " for_comparison\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "GE\n", - "ACAP3 36.214286\n", - "AGRN 24.846939\n", - "AL627309.1 25.479167\n", - "AL627309.5 35.366414\n", - "AL627309.7 34.095625\n", - "AL645608.2 33.036443\n", - "AL645608.3 20.742268\n", - "AL645608.4 27.307758\n", - "Name: genomic_read_quality_mean, dtype: float64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# get a metric from a dataframe: \n", - "df = pd.DataFrame(results_series)\n", - "df['genomic_read_quality_mean']" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "# get a numpy array from the dataframe\n", - "compare_me = df['genomic_read_quality_mean'].values" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# compare two numpy arrays that are slightly different\n", - "eps = np.random.rand(8) * 1e-8\n", - "np.allclose(compare_me, compare_me + eps)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 
it is actually discriminative, though\n", - "np.allclose(compare_me, np.arange(8))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Look at the metrics output" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - } - }, - "outputs": [], - "source": [ - "gene_metrics = pd.read_csv('data/gene_metrics.csv', index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - }, - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - "
n_readsnoise_readsperfect_molecule_barcodesreads_mapped_exonicreads_mapped_intronicreads_mapped_utrreads_mapped_uniquelyreads_mapped_multipleduplicate_readsspliced_reads...genomic_read_quality_variancen_moleculesn_fragmentsreads_per_moleculereads_per_fragmentfragments_per_moleculefragments_with_single_read_evidencemolecules_with_single_read_evidencenumber_cells_detected_multiplenumber_cells_expressing
ACAP31011001001...NaN111.0000001.0000001.0000001101
AGRN1011001001...NaN111.0000001.0000001.0000001101
AL627309.11011001001...NaN111.0000001.0000001.0000001101
AL627309.52602626002601126...18.4552938153.2500001.7333331.8750007268
AL627309.72450245245002450760...21.674500591764.1525421.3920452.983051124223857
AL645608.27077007020...33.657186451.7500001.4000001.2500004224
AL645608.31011001000...NaN111.0000001.0000001.0000001101
AL645608.418018180018010...53.54574013171.3846151.0588241.3076921612113
\n", - "

8 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " n_reads noise_reads perfect_molecule_barcodes \\\n", - "ACAP3 1 0 1 \n", - "AGRN 1 0 1 \n", - "AL627309.1 1 0 1 \n", - "AL627309.5 26 0 26 \n", - "AL627309.7 245 0 245 \n", - "AL645608.2 7 0 7 \n", - "AL645608.3 1 0 1 \n", - "AL645608.4 18 0 18 \n", - "\n", - " reads_mapped_exonic reads_mapped_intronic reads_mapped_utr \\\n", - "ACAP3 1 0 0 \n", - "AGRN 1 0 0 \n", - "AL627309.1 1 0 0 \n", - "AL627309.5 26 0 0 \n", - "AL627309.7 245 0 0 \n", - "AL645608.2 7 0 0 \n", - "AL645608.3 1 0 0 \n", - "AL645608.4 18 0 0 \n", - "\n", - " reads_mapped_uniquely reads_mapped_multiple duplicate_reads \\\n", - "ACAP3 1 0 0 \n", - "AGRN 1 0 0 \n", - "AL627309.1 1 0 0 \n", - "AL627309.5 26 0 11 \n", - "AL627309.7 245 0 76 \n", - "AL645608.2 7 0 2 \n", - "AL645608.3 1 0 0 \n", - "AL645608.4 18 0 1 \n", - "\n", - " spliced_reads ... \\\n", - "ACAP3 1 ... \n", - "AGRN 1 ... \n", - "AL627309.1 1 ... \n", - "AL627309.5 26 ... \n", - "AL627309.7 0 ... \n", - "AL645608.2 0 ... \n", - "AL645608.3 0 ... \n", - "AL645608.4 0 ... 
\n", - "\n", - " genomic_read_quality_variance n_molecules n_fragments \\\n", - "ACAP3 NaN 1 1 \n", - "AGRN NaN 1 1 \n", - "AL627309.1 NaN 1 1 \n", - "AL627309.5 18.455293 8 15 \n", - "AL627309.7 21.674500 59 176 \n", - "AL645608.2 33.657186 4 5 \n", - "AL645608.3 NaN 1 1 \n", - "AL645608.4 53.545740 13 17 \n", - "\n", - " reads_per_molecule reads_per_fragment fragments_per_molecule \\\n", - "ACAP3 1.000000 1.000000 1.000000 \n", - "AGRN 1.000000 1.000000 1.000000 \n", - "AL627309.1 1.000000 1.000000 1.000000 \n", - "AL627309.5 3.250000 1.733333 1.875000 \n", - "AL627309.7 4.152542 1.392045 2.983051 \n", - "AL645608.2 1.750000 1.400000 1.250000 \n", - "AL645608.3 1.000000 1.000000 1.000000 \n", - "AL645608.4 1.384615 1.058824 1.307692 \n", - "\n", - " fragments_with_single_read_evidence \\\n", - "ACAP3 1 \n", - "AGRN 1 \n", - "AL627309.1 1 \n", - "AL627309.5 7 \n", - "AL627309.7 124 \n", - "AL645608.2 4 \n", - "AL645608.3 1 \n", - "AL645608.4 16 \n", - "\n", - " molecules_with_single_read_evidence \\\n", - "ACAP3 1 \n", - "AGRN 1 \n", - "AL627309.1 1 \n", - "AL627309.5 2 \n", - "AL627309.7 22 \n", - "AL645608.2 2 \n", - "AL645608.3 1 \n", - "AL645608.4 12 \n", - "\n", - " number_cells_detected_multiple number_cells_expressing \n", - "ACAP3 0 1 \n", - "AGRN 0 1 \n", - "AL627309.1 0 1 \n", - "AL627309.5 6 8 \n", - "AL627309.7 38 57 \n", - "AL645608.2 2 4 \n", - "AL645608.3 0 1 \n", - "AL645608.4 1 13 \n", - "\n", - "[8 rows x 26 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gene_metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "run_control": { - "frozen": false, - "read_only": false - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
",n_reads,noise_reads,perfect_molecule_barcodes,reads_mapped_exonic,reads_mapped_intronic,reads_mapped_utr,reads_mapped_uniquely,reads_mapped_multiple,duplicate_reads,spliced_reads,antisense_reads,molecule_barcode_fraction_bases_above_30_mean,molecule_barcode_fraction_bases_above_30_variance,genomic_reads_fraction_bases_quality_above_30_mean,genomic_reads_fraction_bases_quality_above_30_variance,genomic_read_quality_mean,genomic_read_quality_variance,n_molecules,n_fragments,reads_per_molecule,reads_per_fragment,fragments_per_molecule,fragments_with_single_read_evidence,molecules_with_single_read_evidence,number_cells_detected_multiple,number_cells_expressing\n", - "ACAP3,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.8877551020408163,nan,36.214285714285715,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AGRN,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.3979591836734694,nan,24.846938775510203,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AL627309.1,1,0,1,1,0,0,1,0,0,1,0,0.8,nan,0.4270833333333333,nan,25.479166666666668,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AL627309.5,26,0,26,26,0,0,26,0,11,26,0,0.9884615384615385,0.0010615384615384619,0.8148357472599155,0.02818637889146239,35.36641405113152,18.45529287710208,8,15,3.25,1.7333333333333334,1.875,7,2,6,8\n", - "AL627309.7,245,0,245,245,0,0,245,0,76,0,0,0.9832653061224491,0.005087654734024759,0.7681442526176698,0.03459077695708153,34.09562493869249,21.67450015630017,59,176,4.1525423728813555,1.3920454545454546,2.983050847457627,124,22,38,57\n", - "AL645608.2,7,0,7,7,0,0,7,0,2,0,0,0.9857142857142857,0.00142857142857143,0.7215743440233235,0.05371769699133296,33.03644314868805,33.65718648975626,4,5,1.75,1.4,1.25,4,2,2,4\n", - "AL645608.3,1,0,1,1,0,0,1,0,0,0,0,0.7,nan,0.15463917525773196,nan,20.742268041237114,nan,1,1,1.0,1.0,1.0,1,1,0,1\n", - "AL645608.4,18,0,18,18,0,0,18,0,1,0,0,0.9444444444444444,0.012026143790849672,0.5089380971044231,0.08488064356706926,27.307757608823714,53.545739760471115,13,17,1.3846153846153846,1.0588235294117647,1.3076923076923077,16,12,1,13\n" - ] - } 
- ], - "source": [ - "!cat data/gene_metrics.csv" - ] - } - ], - "metadata": { - "hide_input": false, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "nav_menu": {}, - "toc": { - "navigate_menu": true, - "number_sections": true, - "sideBar": true, - "threshold": 6, - "toc_cell": false, - "toc_section_display": "block", - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tools/scripts/sctools/src/sctools/test/data/1k-august-2016.txt b/tools/scripts/sctools/src/sctools/test/data/1k-august-2016.txt deleted file mode 100644 index 54b0b83b..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/1k-august-2016.txt +++ /dev/null @@ -1,1001 +0,0 @@ -AAACCTGAGAAACCAT -AAACCTGAGAAACCGC -AAACCTGAGAAACCTA -AAACCTGAGAAACGAG -AAACCTGAGAAACGCC -AAACCTGAGAAAGTGG -AAACCTGAGAACAACT -AAACCTGAGAACAATC -AAACCTGAGAACTCGG -AAACCTGAGAACTGTA -AAACCTGAGAAGAAGC -AAACCTGAGAAGATTC -AAACCTGAGAAGCCCA -AAACCTGAGAAGGACA -AAACCTGAGAAGGCCT -AAACCTGAGAAGGGTA -AAACCTGAGAAGGTGA -AAACCTGAGAAGGTTT -AAACCTGAGAATAGGG -AAACCTGAGAATCTCC -AAACCTGAGAATGTGT -AAACCTGAGAATGTTG -AAACCTGAGAATTCCC -AAACCTGAGAATTGTG -AAACCTGAGACAAAGG -AAACCTGAGACAAGCC -AAACCTGAGACAATAC -AAACCTGAGACACGAC -AAACCTGAGACACTAA -AAACCTGAGACAGACC -AAACCTGAGACAGAGA -AAACCTGAGACAGGCT -AAACCTGAGACATAAC -AAACCTGAGACCACGA -AAACCTGAGACCCACC -AAACCTGAGACCGGAT -AAACCTGAGACCTAGG -AAACCTGAGACCTTTG -AAACCTGAGACGACGT -AAACCTGAGACGCAAC -AAACCTGAGACGCACA -AAACCTGAGACGCTTT -AAACCTGAGACTAAGT -AAACCTGAGACTACAA -AAACCTGAGACTAGAT -AAACCTGAGACTAGGC -AAACCTGAGACTCGGA -AAACCTGAGACTGGGT -AAACCTGAGACTGTAA -AAACCTGAGACTTGAA -AAACCTGAGACTTTCG -AAACCTGAGAGAACAG -AAACCTGAGAGACGAA -AAACCTGAGAGACTAT -AAACCTGAGAGACTTA 
-AAACCTGAGAGAGCTC -AAACCTGAGAGATGAG -AAACCTGAGAGCAATT -AAACCTGAGAGCCCAA -AAACCTGAGAGCCTAG -AAACCTGAGAGCTATA -AAACCTGAGAGCTGCA -AAACCTGAGAGCTGGT -AAACCTGAGAGCTTCT -AAACCTGAGAGGACGG -AAACCTGAGAGGGATA -AAACCTGAGAGGGCTT -AAACCTGAGAGGTACC -AAACCTGAGAGGTAGA -AAACCTGAGAGGTTAT -AAACCTGAGAGGTTGC -AAACCTGAGAGTAAGG -AAACCTGAGAGTAATC -AAACCTGAGAGTACAT -AAACCTGAGAGTACCG -AAACCTGAGAGTCGGT -AAACCTGAGAGTCTGG -AAACCTGAGAGTGACC -AAACCTGAGAGTGAGA -AAACCTGAGAGTTGGC -AAACCTGAGATACACA -AAACCTGAGATAGCAT -AAACCTGAGATAGGAG -AAACCTGAGATAGTCA -AAACCTGAGATATACG -AAACCTGAGATATGCA -AAACCTGAGATATGGT -AAACCTGAGATCACGG -AAACCTGAGATCCCAT -AAACCTGAGATCCCGC -AAACCTGAGATCCGAG -AAACCTGAGATCCTGT -AAACCTGAGATCGATA -AAACCTGAGATCGGGT -AAACCTGAGATCTGAA -AAACCTGAGATCTGCT -AAACCTGAGATGAGAG -AAACCTGAGATGCCAG -AAACCTGAGATGCCTT -AAACCTGAGATGCGAC -AAACCTGAGATGGCGT -AAACCTGAGATGGGTC -AAACCTGAGATGTAAC -AAACCTGAGATGTCGG -AAACCTGAGATGTGGC -AAACCTGAGATGTGTA -AAACCTGAGATGTTAG -AAACCTGAGATTACCC -AAACCTGAGCAAATCA -AAACCTGAGCAACGGT -AAACCTGAGCAATATG -AAACCTGAGCAATCTC -AAACCTGAGCACACAG -AAACCTGAGCACAGGT -AAACCTGAGCACCGCT -AAACCTGAGCACCGTC -AAACCTGAGCACGCCT -AAACCTGAGCAGACTG -AAACCTGAGCAGATCG -AAACCTGAGCAGCCTC -AAACCTGAGCAGCGTA -AAACCTGAGCAGGCTA -AAACCTGAGCAGGTCA -AAACCTGAGCATCATC -AAACCTGAGCATGGCA -AAACCTGAGCCAACAG -AAACCTGAGCCACCTG -AAACCTGAGCCACGCT -AAACCTGAGCCACGTC -AAACCTGAGCCACTAT -AAACCTGAGCCAGAAC -AAACCTGAGCCAGGAT -AAACCTGAGCCAGTAG -AAACCTGAGCCAGTTT -AAACCTGAGCCATCGC -AAACCTGAGCCCAACC -AAACCTGAGCCCAATT -AAACCTGAGCCCAGCT -AAACCTGAGCCCGAAA -AAACCTGAGCCCTAAT -AAACCTGAGCCGATTT -AAACCTGAGCCGCCTA -AAACCTGAGCCGGTAA -AAACCTGAGCCGTCGT -AAACCTGAGCCTATGT -AAACCTGAGCCTCGTG -AAACCTGAGCCTTGAT -AAACCTGAGCGAAGGG -AAACCTGAGCGACGTA -AAACCTGAGCGAGAAA -AAACCTGAGCGATAGC -AAACCTGAGCGATATA -AAACCTGAGCGATCCC -AAACCTGAGCGATGAC -AAACCTGAGCGATTCT -AAACCTGAGCGCCTCA -AAACCTGAGCGCCTTG -AAACCTGAGCGCTCCA -AAACCTGAGCGCTTAT -AAACCTGAGCGGATCA -AAACCTGAGCGGCTTC -AAACCTGAGCGTAATA -AAACCTGAGCGTAGTG -AAACCTGAGCGTCAAG -AAACCTGAGCGTCTAT -AAACCTGAGCGTGAAC 
-AAACCTGAGCGTGAGT -AAACCTGAGCGTGTCC -AAACCTGAGCGTTCCG -AAACCTGAGCGTTGCC -AAACCTGAGCGTTTAC -AAACCTGAGCTAAACA -AAACCTGAGCTAACAA -AAACCTGAGCTAACTC -AAACCTGAGCTAAGAT -AAACCTGAGCTACCGC -AAACCTGAGCTACCTA -AAACCTGAGCTAGCCC -AAACCTGAGCTAGGCA -AAACCTGAGCTAGTCT -AAACCTGAGCTAGTGG -AAACCTGAGCTAGTTC -AAACCTGAGCTATGCT -AAACCTGAGCTCAACT -AAACCTGAGCTCCCAG -AAACCTGAGCTCCTCT -AAACCTGAGCTCCTTC -AAACCTGAGCTCTCGG -AAACCTGAGCTGAAAT -AAACCTGAGCTGAACG -AAACCTGAGCTGATAA -AAACCTGAGCTGCAAG -AAACCTGAGCTGCCCA -AAACCTGAGCTGCGAA -AAACCTGAGCTGGAAC -AAACCTGAGCTGTCTA -AAACCTGAGCTGTTCA -AAACCTGAGCTTATCG -AAACCTGAGCTTCGCG -AAACCTGAGCTTTGGT -AAACCTGAGGAACTGC -AAACCTGAGGAATCGC -AAACCTGAGGAATGGA -AAACCTGAGGAATTAC -AAACCTGAGGACACCA -AAACCTGAGGACAGAA -AAACCTGAGGACAGCT -AAACCTGAGGACATTA -AAACCTGAGGACCACA -AAACCTGAGGACGAAA -AAACCTGAGGACTGGT -AAACCTGAGGAGCGAG -AAACCTGAGGAGCGTT -AAACCTGAGGAGTACC -AAACCTGAGGAGTAGA -AAACCTGAGGAGTCTG -AAACCTGAGGAGTTGC -AAACCTGAGGAGTTTA -AAACCTGAGGATATAC -AAACCTGAGGATCGCA -AAACCTGAGGATGCGT -AAACCTGAGGATGGAA -AAACCTGAGGATGGTC -AAACCTGAGGATGTAT -AAACCTGAGGATTCGG -AAACCTGAGGCAAAGA -AAACCTGAGGCAATTA -AAACCTGAGGCACATG -AAACCTGAGGCAGGTT -AAACCTGAGGCAGTCA -AAACCTGAGGCATGGT -AAACCTGAGGCATGTG -AAACCTGAGGCATTGG -AAACCTGAGGCCATAG -AAACCTGAGGCCCGTT -AAACCTGAGGCCCTCA -AAACCTGAGGCCCTTG -AAACCTGAGGCCGAAT -AAACCTGAGGCGACAT -AAACCTGAGGCGATAC -AAACCTGAGGCGCTCT -AAACCTGAGGCGTACA -AAACCTGAGGCTACGA -AAACCTGAGGCTAGAC -AAACCTGAGGCTAGCA -AAACCTGAGGCTAGGT -AAACCTGAGGCTATCT -AAACCTGAGGCTCAGA -AAACCTGAGGCTCATT -AAACCTGAGGCTCTTA -AAACCTGAGGGAAACA -AAACCTGAGGGAACGG -AAACCTGAGGGAGTAA -AAACCTGAGGGATACC -AAACCTGAGGGATCTG -AAACCTGAGGGATGGG -AAACCTGAGGGCACTA -AAACCTGAGGGCATGT -AAACCTGAGGGCTCTC -AAACCTGAGGGCTTCC -AAACCTGAGGGCTTGA -AAACCTGAGGGTATCG -AAACCTGAGGGTCGAT -AAACCTGAGGGTCTCC -AAACCTGAGGGTGTGT -AAACCTGAGGGTGTTG -AAACCTGAGGGTTCCC -AAACCTGAGGGTTTCT -AAACCTGAGGTAAACT -AAACCTGAGGTACTCT -AAACCTGAGGTAGCCA -AAACCTGAGGTAGCTG -AAACCTGAGGTCATCT -AAACCTGAGGTCGGAT -AAACCTGAGGTGACCA -AAACCTGAGGTGATAT -AAACCTGAGGTGATTA 
-AAACCTGAGGTGCAAC -AAACCTGAGGTGCACA -AAACCTGAGGTGCTAG -AAACCTGAGGTGCTTT -AAACCTGAGGTGGGTT -AAACCTGAGGTGTGGT -AAACCTGAGGTGTTAA -AAACCTGAGGTTACCT -AAACCTGAGGTTCCTA -AAACCTGAGTAACCCT -AAACCTGAGTAAGTAC -AAACCTGAGTAATCCC -AAACCTGAGTACACCT -AAACCTGAGTACATGA -AAACCTGAGTACCGGA -AAACCTGAGTACGACG -AAACCTGAGTACGATA -AAACCTGAGTACGCCC -AAACCTGAGTACGCGA -AAACCTGAGTACGTAA -AAACCTGAGTACGTTC -AAACCTGAGTACTTGC -AAACCTGAGTAGATGT -AAACCTGAGTAGCCGA -AAACCTGAGTAGCGGT -AAACCTGAGTAGGCCA -AAACCTGAGTAGGTGC -AAACCTGAGTAGTGCG -AAACCTGAGTATCGAA -AAACCTGAGTATCTCG -AAACCTGAGTATGACA -AAACCTGAGTATTGGA -AAACCTGAGTCAAGCG -AAACCTGAGTCAAGGC -AAACCTGAGTCAATAG -AAACCTGAGTCACGCC -AAACCTGAGTCATCCA -AAACCTGAGTCATGCT -AAACCTGAGTCCAGGA -AAACCTGAGTCCATAC -AAACCTGAGTCCCACG -AAACCTGAGTCCGGTC -AAACCTGAGTCCGTAT -AAACCTGAGTCCTCCT -AAACCTGAGTCGAGTG -AAACCTGAGTCGATAA -AAACCTGAGTCGCCGT -AAACCTGAGTCGTACT -AAACCTGAGTCGTTTG -AAACCTGAGTCTCAAC -AAACCTGAGTCTCCTC -AAACCTGAGTCTCGGC -AAACCTGAGTCTTGCA -AAACCTGAGTGAACAT -AAACCTGAGTGAACGC -AAACCTGAGTGAAGAG -AAACCTGAGTGAAGTT -AAACCTGAGTGAATTG -AAACCTGAGTGACATA -AAACCTGAGTGACTCT -AAACCTGAGTGATCGG -AAACCTGAGTGCAAGC -AAACCTGAGTGCCAGA -AAACCTGAGTGCCATT -AAACCTGAGTGCGATG -AAACCTGAGTGCGTGA -AAACCTGAGTGCTGCC -AAACCTGAGTGGACGT -AAACCTGAGTGGAGAA -AAACCTGAGTGGAGTC -AAACCTGAGTGGCACA -AAACCTGAGTGGGATC -AAACCTGAGTGGGCTA -AAACCTGAGTGGGTTG -AAACCTGAGTGGTAAT -AAACCTGAGTGGTAGC -AAACCTGAGTGGTCCC -AAACCTGAGTGTACCT -AAACCTGAGTGTACGG -AAACCTGAGTGTACTC -AAACCTGAGTGTCCAT -AAACCTGAGTGTCCCG -AAACCTGAGTGTCTCA -AAACCTGAGTGTGAAT -AAACCTGAGTGTGGCA -AAACCTGAGTGTTAGA -AAACCTGAGTGTTGAA -AAACCTGAGTGTTTGC -AAACCTGAGTTAACGA -AAACCTGAGTTAAGTG -AAACCTGAGTTACCCA -AAACCTGAGTTACGGG -AAACCTGAGTTAGCGG -AAACCTGAGTTAGGTA -AAACCTGAGTTATCGC -AAACCTGAGTTCCACA -AAACCTGAGTTCGATC -AAACCTGAGTTCGCAT -AAACCTGAGTTCGCGC -AAACCTGAGTTGAGAT -AAACCTGAGTTGAGTA -AAACCTGAGTTGCAGG -AAACCTGAGTTGTAGA -AAACCTGAGTTGTCGT -AAACCTGAGTTTAGGA -AAACCTGAGTTTCCTT -AAACCTGAGTTTGCGT -AAACCTGCAAACAACA -AAACCTGCAAACCCAT -AAACCTGCAAACCTAC -AAACCTGCAAACGCGA 
-AAACCTGCAAACGTGG -AAACCTGCAAACTGCT -AAACCTGCAAACTGTC -AAACCTGCAAAGAATC -AAACCTGCAAAGCAAT -AAACCTGCAAAGCGGT -AAACCTGCAAAGGAAG -AAACCTGCAAAGGCGT -AAACCTGCAAAGGTGC -AAACCTGCAAAGTCAA -AAACCTGCAAAGTGCG -AAACCTGCAAATACAG -AAACCTGCAAATCCGT -AAACCTGCAAATTGCC -AAACCTGCAACAACCT -AAACCTGCAACACCCG -AAACCTGCAACACCTA -AAACCTGCAACACGCC -AAACCTGCAACCGCCA -AAACCTGCAACGATCT -AAACCTGCAACGATGG -AAACCTGCAACGCACC -AAACCTGCAACTGCGC -AAACCTGCAACTGCTA -AAACCTGCAACTGGCC -AAACCTGCAACTTGAC -AAACCTGCAAGAAAGG -AAACCTGCAAGAAGAG -AAACCTGCAAGACACG -AAACCTGCAAGACGTG -AAACCTGCAAGAGGCT -AAACCTGCAAGAGTCG -AAACCTGCAAGCCATT -AAACCTGCAAGCCCAC -AAACCTGCAAGCCGCT -AAACCTGCAAGCCGTC -AAACCTGCAAGCCTAT -AAACCTGCAAGCGAGT -AAACCTGCAAGCGATG -AAACCTGCAAGCGCTC -AAACCTGCAAGCGTAG -AAACCTGCAAGCTGAG -AAACCTGCAAGCTGGA -AAACCTGCAAGCTGTT -AAACCTGCAAGGACAC -AAACCTGCAAGGACTG -AAACCTGCAAGGCTCC -AAACCTGCAAGGGTCA -AAACCTGCAAGGTGTG -AAACCTGCAAGGTTCT -AAACCTGCAAGGTTTC -AAACCTGCAAGTAATG -AAACCTGCAAGTACCT -AAACCTGCAAGTAGTA -AAACCTGCAAGTCATC -AAACCTGCAAGTCTAC -AAACCTGCAAGTCTGT -AAACCTGCAAGTTAAG -AAACCTGCAAGTTCTG -AAACCTGCAAGTTGTC -AAACCTGCAATAACGA -AAACCTGCAATAAGCA -AAACCTGCAATACGCT -AAACCTGCAATAGAGT -AAACCTGCAATAGCAA -AAACCTGCAATAGCGG -AAACCTGCAATCACAC -AAACCTGCAATCAGAA -AAACCTGCAATCCAAC -AAACCTGCAATCCGAT -AAACCTGCAATCGAAA -AAACCTGCAATCGGTT -AAACCTGCAATCTACG -AAACCTGCAATCTGCA -AAACCTGCAATGAAAC -AAACCTGCAATGAATG -AAACCTGCAATGACCT -AAACCTGCAATGCCAT -AAACCTGCAATGGAAT -AAACCTGCAATGGACG -AAACCTGCAATGGAGC -AAACCTGCAATGGATA -AAACCTGCAATGGTCT -AAACCTGCAATGTAAG -AAACCTGCAATGTTGC -AAACCTGCAATTCCTT -AAACCTGCAATTGCTG -AAACCTGCACAACGCC -AAACCTGCACAACGTT -AAACCTGCACAACTGT -AAACCTGCACAAGACG -AAACCTGCACAAGCCC -AAACCTGCACAAGTAA -AAACCTGCACACAGAG -AAACCTGCACACATGT -AAACCTGCACACCGAC -AAACCTGCACACCGCA -AAACCTGCACACGCTG -AAACCTGCACACTGCG -AAACCTGCACAGACAG -AAACCTGCACAGACTT -AAACCTGCACAGAGGT -AAACCTGCACAGATTC -AAACCTGCACAGCCCA -AAACCTGCACAGCGTC -AAACCTGCACAGGAGT -AAACCTGCACAGGCCT -AAACCTGCACAGGTTT -AAACCTGCACAGTCGC -AAACCTGCACATAACC -AAACCTGCACATCCAA 
-AAACCTGCACATCCGG -AAACCTGCACATCTTT -AAACCTGCACATGACT -AAACCTGCACATGGGA -AAACCTGCACATGTGT -AAACCTGCACATTAGC -AAACCTGCACATTCGA -AAACCTGCACATTTCT -AAACCTGCACCAACCG -AAACCTGCACCACCAG -AAACCTGCACCACGTG -AAACCTGCACCAGATT -AAACCTGCACCAGCAC -AAACCTGCACCAGGCT -AAACCTGCACCAGGTC -AAACCTGCACCAGTTA -AAACCTGCACCATCCT -AAACCTGCACCATGTA -AAACCTGCACCCAGTG -AAACCTGCACCCATGG -AAACCTGCACCCATTC -AAACCTGCACCCTATC -AAACCTGCACCGAAAG -AAACCTGCACCGAATT -AAACCTGCACCGATAT -AAACCTGCACCGCTAG -AAACCTGCACCGGAAA -AAACCTGCACCGTTGG -AAACCTGCACCTATCC -AAACCTGCACCTCGGA -AAACCTGCACCTCGTT -AAACCTGCACCTGGTG -AAACCTGCACCTTGTC -AAACCTGCACGAAACG -AAACCTGCACGAAAGC -AAACCTGCACGAAATA -AAACCTGCACGAAGCA -AAACCTGCACGACGAA -AAACCTGCACGACTCG -AAACCTGCACGAGAGT -AAACCTGCACGAGGTA -AAACCTGCACGCATCG -AAACCTGCACGCCAGT -AAACCTGCACGCGAAA -AAACCTGCACGCTTTC -AAACCTGCACGGACAA -AAACCTGCACGGATAG -AAACCTGCACGGCCAT -AAACCTGCACGGCGTT -AAACCTGCACGGCTAC -AAACCTGCACGGTAAG -AAACCTGCACGGTAGA -AAACCTGCACGGTGTC -AAACCTGCACGGTTTA -AAACCTGCACGTAAGG -AAACCTGCACGTCAGC -AAACCTGCACGTCTCT -AAACCTGCACGTGAGA -AAACCTGCACGTTGGC -AAACCTGCACTAAGTC -AAACCTGCACTACAGT -AAACCTGCACTAGTAC -AAACCTGCACTATCTT -AAACCTGCACTCAGGC -AAACCTGCACTCGACG -AAACCTGCACTCTGTC -AAACCTGCACTGAAGG -AAACCTGCACTGCCAG -AAACCTGCACTGTCGG -AAACCTGCACTGTGTA -AAACCTGCACTGTTAG -AAACCTGCACTTAACG -AAACCTGCACTTAAGC -AAACCTGCACTTACGA -AAACCTGCACTTCGAA -AAACCTGCACTTCTGC -AAACCTGCACTTGGAT -AAACCTGCAGAAGCAC -AAACCTGCAGACAAAT -AAACCTGCAGACAAGC -AAACCTGCAGACACTT -AAACCTGCAGACAGGT -AAACCTGCAGACGCAA -AAACCTGCAGACGCCT -AAACCTGCAGACGCTC -AAACCTGCAGACGTAG -AAACCTGCAGACTCGC -AAACCTGCAGAGCCAA -AAACCTGCAGAGTGTG -AAACCTGCAGATAATG -AAACCTGCAGATCCAT -AAACCTGCAGATCGGA -AAACCTGCAGATCTGT -AAACCTGCAGATGAGC -AAACCTGCAGATGGCA -AAACCTGCAGATGGGT -AAACCTGCAGATTGCT -AAACCTGCAGCAGTTT -AAACCTGCAGCATACT -AAACCTGCAGCATGAG -AAACCTGCAGCCAATT -AAACCTGCAGCCACCA -AAACCTGCAGCCAGAA -AAACCTGCAGCCTATA -AAACCTGCAGCCTGTG -AAACCTGCAGCCTTGG -AAACCTGCAGCCTTTC -AAACCTGCAGCGAACA -AAACCTGCAGCGATCC -AAACCTGCAGCGTAAG -AAACCTGCAGCGTCCA 
-AAACCTGCAGCGTTCG -AAACCTGCAGCTATTG -AAACCTGCAGCTCCGA -AAACCTGCAGCTCGAC -AAACCTGCAGCTCGCA -AAACCTGCAGCTGCAC -AAACCTGCAGCTGCTG -AAACCTGCAGCTGGCT -AAACCTGCAGCTGTAT -AAACCTGCAGCTGTGC -AAACCTGCAGCTGTTA -AAACCTGCAGCTTAAC -AAACCTGCAGCTTCGG -AAACCTGCAGGAACGT -AAACCTGCAGGAATCG -AAACCTGCAGGAATGC -AAACCTGCAGGACCCT -AAACCTGCAGGACGTA -AAACCTGCAGGATCGA -AAACCTGCAGGATTGG -AAACCTGCAGGCAGTA -AAACCTGCAGGCGATA -AAACCTGCAGGCTCAC -AAACCTGCAGGCTGAA -AAACCTGCAGGGAGAG -AAACCTGCAGGGATTG -AAACCTGCAGGGCATA -AAACCTGCAGGGTACA -AAACCTGCAGGGTATG -AAACCTGCAGGGTTAG -AAACCTGCAGGTCCAC -AAACCTGCAGGTCGTC -AAACCTGCAGGTCTCG -AAACCTGCAGGTGCCT -AAACCTGCAGGTGGAT -AAACCTGCAGGTTTCA -AAACCTGCAGTAACGG -AAACCTGCAGTAAGAT -AAACCTGCAGTAAGCG -AAACCTGCAGTACACT -AAACCTGCAGTAGAGC -AAACCTGCAGTATAAG -AAACCTGCAGTATCTG -AAACCTGCAGTATGCT -AAACCTGCAGTCACTA -AAACCTGCAGTCAGAG -AAACCTGCAGTCAGCC -AAACCTGCAGTCCTTC -AAACCTGCAGTCGATT -AAACCTGCAGTCGTGC -AAACCTGCAGTCTTCC -AAACCTGCAGTGACAG -AAACCTGCAGTGAGTG -AAACCTGCAGTGGAGT -AAACCTGCAGTGGGAT -AAACCTGCAGTTAACC -AAACCTGCAGTTCATG -AAACCTGCAGTTCCCT -AAACCTGCAGTTTACG -AAACCTGCATAAAGGT -AAACCTGCATAACCTG -AAACCTGCATAAGACA -AAACCTGCATACAGCT -AAACCTGCATACCATG -AAACCTGCATACGCCG -AAACCTGCATACGCTA -AAACCTGCATACTACG -AAACCTGCATACTCTT -AAACCTGCATAGAAAC -AAACCTGCATAGACTC -AAACCTGCATAGGATA -AAACCTGCATAGTAAG -AAACCTGCATATACCG -AAACCTGCATATACGC -AAACCTGCATATGAGA -AAACCTGCATATGCTG -AAACCTGCATATGGTC -AAACCTGCATCACAAC -AAACCTGCATCACCCT -AAACCTGCATCACGAT -AAACCTGCATCACGTA -AAACCTGCATCAGTAC -AAACCTGCATCAGTCA -AAACCTGCATCATCCC -AAACCTGCATCCAACA -AAACCTGCATCCCACT -AAACCTGCATCCCATC -AAACCTGCATCCGCGA -AAACCTGCATCCGGGT -AAACCTGCATCCGTGG -AAACCTGCATCCTAGA -AAACCTGCATCCTTGC -AAACCTGCATCGACGC -AAACCTGCATCGATGT -AAACCTGCATCGATTG -AAACCTGCATCGGAAG -AAACCTGCATCGGACC -AAACCTGCATCGGGTC -AAACCTGCATCGGTTA -AAACCTGCATCGTCGG -AAACCTGCATCTACGA -AAACCTGCATCTATGG -AAACCTGCATCTCCCA -AAACCTGCATCTCGCT -AAACCTGCATCTGGTA -AAACCTGCATGAACCT -AAACCTGCATGAAGTA -AAACCTGCATGACATC -AAACCTGCATGACGGA -AAACCTGCATGAGCGA -AAACCTGCATGATCCA 
-AAACCTGCATGCAACT -AAACCTGCATGCAATC -AAACCTGCATGCATGT -AAACCTGCATGCCACG -AAACCTGCATGCCCGA -AAACCTGCATGCCTAA -AAACCTGCATGCCTTC -AAACCTGCATGCGCAC -AAACCTGCATGCTAGT -AAACCTGCATGCTGGC -AAACCTGCATGGAATA -AAACCTGCATGGATGG -AAACCTGCATGGGAAC -AAACCTGCATGGGACA -AAACCTGCATGGTAGG -AAACCTGCATGGTCAT -AAACCTGCATGGTCTA -AAACCTGCATGGTTGT -AAACCTGCATGTAAGA -AAACCTGCATGTAGTC -AAACCTGCATGTCCTC -AAACCTGCATGTCGAT -AAACCTGCATGTCTCC -AAACCTGCATGTTCCC -AAACCTGCATGTTGAC -AAACCTGCATTAACCG -AAACCTGCATTACCTT -AAACCTGCATTACGAC -AAACCTGCATTAGCCA -AAACCTGCATTAGGCT -AAACCTGCATTATCTC -AAACCTGCATTCACTT -AAACCTGCATTCCTCG -AAACCTGCATTCCTGC -AAACCTGCATTCGACA -AAACCTGCATTCTCAT -AAACCTGCATTCTTAC -AAACCTGCATTGAGCT -AAACCTGCATTGCGGC -AAACCTGCATTGGCGC -AAACCTGCATTGGGCC -AAACCTGCATTGGTAC -AAACCTGCATTGTGCA -AAACCTGCATTTCACT -AAACCTGCATTTCAGG -AAACCTGCATTTGCCC -AAACCTGCATTTGCTT -AAACCTGGTAAACACA -AAACCTGGTAAACCTC -AAACCTGGTAAACGCG -AAACCTGGTAAAGGAG -AAACCTGGTAAAGTCA -AAACCTGGTAAATACG -AAACCTGGTAAATGAC -AAACCTGGTAAATGTG -AAACCTGGTAACGACG -AAACCTGGTAACGCGA -AAACCTGGTAACGTTC -AAACCTGGTAAGAGAG -AAACCTGGTAAGAGGA -AAACCTGGTAAGCACG -AAACCTGGTAAGGATT -AAACCTGGTAAGGGAA -AAACCTGGTAAGGGCT -AAACCTGGTAAGTAGT -AAACCTGGTAAGTGGC -AAACCTGGTAAGTGTA -AAACCTGGTAAGTTCC -AAACCTGGTAATAGCA -AAACCTGGTAATCACC -AAACCTGGTAATCGTC -AAACCTGGTAATTGGA -AAACCTGGTACAAGTA -AAACCTGGTACACCGC -AAACCTGGTACAGACG -AAACCTGGTACAGCAG -AAACCTGGTACAGTGG -AAACCTGGTACAGTTC -AAACCTGGTACATCCA -AAACCTGGTACATGTC -AAACCTGGTACCAGTT -AAACCTGGTACCATCA -AAACCTGGTACCCAAT -AAACCTGGTACCGAGA -AAACCTGGTACCGCTG -AAACCTGGTACCGGCT -AAACCTGGTACCGTAT -AAACCTGGTACCGTTA -AAACCTGGTACCTACA -AAACCTGGTACGAAAT -AAACCTGGTACGACCC -AAACCTGGTACGCACC -AAACCTGGTACGCTGC -AAACCTGGTACTCAAC -AAACCTGGTACTCGCG -AAACCTGGTACTCTCC -AAACCTGGTACTTAGC -AAACCTGGTACTTCTT -AAACCTGGTACTTGAC -AAACCTGGTAGAAAGG -AAACCTGGTAGAAGGA -AAACCTGGTAGAGCTG -AAACCTGGTAGAGGAA -AAACCTGGTAGAGTGC -AAACCTGGTAGATTAG -AAACCTGGTAGCAAAT -AAACCTGGTAGCACGA -AAACCTGGTAGCCTAT -AAACCTGGTAGCCTCG -AAACCTGGTAGCGATG -AAACCTGGTAGCGCAA 
-AAACCTGGTAGCGCTC -AAACCTGGTAGCGTAG -AAACCTGGTAGCGTCC -AAACCTGGTAGCGTGA -AAACCTGGTAGCTAAA -AAACCTGGTAGCTCCG -AAACCTGGTAGCTGCC -AAACCTGGTAGCTTGT -AAACCTGGTAGGACAC -AAACCTGGTAGGAGTC -AAACCTGGTAGGCATG -AAACCTGGTAGGCTGA -AAACCTGGTAGGGACT -AAACCTGGTAGGGTAC -AAACCTGGTAGTACCT -AAACCTGGTAGTAGTA -AAACCTGGTAGTGAAT -AAACCTGGTATAAACG -AAACCTGGTATAATGG -AAACCTGGTATAGGGC -AAACCTGGTATAGGTA -AAACCTGGTATAGTAG -AAACCTGGTATATCCG -AAACCTGGTATATGAG -AAACCTGGTATATGGA -AAACCTGGTATCACCA -AAACCTGGTATCAGTC -AAACCTGGTATCGCAT -AAACCTGGTATCTGCA -AAACCTGGTATGAAAC -AAACCTGGTATGAATG -AAACCTGGTATGCTTG -AAACCTGGTATGGTTC -AAACCTGGTATTACCG -AAACCTGGTATTAGCC -AAACCTGGTATTCGTG -AAACCTGGTATTCTCT -AAACCTGGTCAAACTC -AAACCTGGTCAAAGAT -AAACCTGGTCAAAGCG -AAACCTGGTCAACATC -AAACCTGGTCAACTGT -AAACCTGGTCAAGCGA -AAACCTGGTCAATACC -AAACCTGGTCAATGTC -AAACCTGGTCACAAGG -AAACCTGGTCACACGC -AAACCTGGTCACCCAG -AAACCTGGTCACCTAA -AAACCTGGTCACTGGC -AAACCTGGTCACTTCC -AAACCTGGTCAGAAGC -AAACCTGGTCAGAATA -AAACCTGGTCAGAGGT -AAACCTGGTCAGATAA -AAACCTGGTCAGCTAT -AAACCTGGTCAGGACA -AAACCTGGTCAGTGGA -AAACCTGGTCATACTG -AAACCTGGTCATATCG -AAACCTGGTCATATGC -AAACCTGGTCATCCCT -AAACCTGGTCATCGGC -AAACCTGGTCATGCAT -AAACCTGGTCATGCCG -AAACCTGGTCATTAGC -AAACCTGGTCCAACTA -AAACCTGGTCCAAGTT -AAACCTGGTCCAGTAT -AAACCTGGTCCAGTGC -AAACCTGGTCCAGTTA -AAACCTGGTCCATCCT -AAACCTGGTCCATGAT -AAACCTGGTCCCGACA -AAACCTGGTCCCTACT -AAACCTGGTCCCTTGT -AAACCTGGTCCGAACC -AAACCTGGTCCGAAGA -AAACCTGGTCCGAATT -AAACCTGGTCCGACGT -AAACCTGGTCCGAGTC -AAACCTGGTCCGCTGA -AAACCTGGTCCGTCAG -AAACCTGGTCCGTGAC -AAACCTGGTCCGTTAA -AAACCTGGTCCTAGCG -AAACCTGGTCCTCCAT -AAACCTGGTCCTCTTG -AAACCTGGTCCTGCTT -AAACCTGGTCGAAAGC -AAACCTGGTCGAACAG -AAACCTGGTCGAATCT -AAACCTGGTCGACTAT -AAACCTGGTCGACTGC -AAACCTGGTCGAGATG -AAACCTGGTCGAGTTT -AAACCTGGTCGATTGT -AAACCTGGTCGCATAT -AAACCTGGTCGCATCG -AAACCTGGTCGCCATG -AAACCTGGTCGCGAAA -AAACCTGGTCGCGGTT -AAACCTGGTCGCGTGT -AAACCTGGTCGCTTCT -AAACCTGGTCGCTTTC -AAACCTGGTCGGATCC -AAACCTGGTCGGCACT -AAACCTGGTCGGCATC -AAACCTGGTCGGCTCA -AAACCTGGTCGGGTCT -AAACCTGGTCGTCTTC 
-AAACCTGGTCGTGGCT -AAACCTGGTCGTTGTA -AAACCTGGTCTAAACC -AAACCTGGTCTAAAGA -AAACCTGGTCTAACGT -AAACCTGGTCTACCTC -AAACCTGGTCTAGAGG -AAACCTGGTCTAGCCG -AAACCTGGTCTAGCGC -AAACCTGGTCTAGGTT -AAACCTGGTCTAGTCA -AAACCTGGTCTAGTGT -AAACCTGGTCTCAACA -AAACCTGGTCTCACCT -AAACCTGGTCTCATCC -AAACCTGGTCTCCACT -AAACCTGGTCTCCATC -AAACCTGGTCTCCCTA -AAACCTGGTCTCGTTC -AAACCTGGTCTCTCGT -AAACCTGGTCTCTCTG -AAACCTGGTCTCTTAT -AAACCTGGTCTCTTTA -AAACCTGGTCTGATCA -AAACCTGGTCTGATTG -AAACCTGGTCTGCAAT -AAACCTGGTCTGCCAG -AAACCTGGTCTGCGGT -AAACCTGGTCTGGAGA -AAACCTGGTCTGGTCG -AAACCTGGTCTTCAAG -AAACCTGGTCTTCGTC -AAACCTGGTCTTCTCG -AAACCTGGTCTTGATG -AAACCTGGTCTTGCGG -AAACCTGGTCTTGTCC -AAACCTGGTCTTTCAT -AAACCTGGTGAAAGAG -AAACCTGGTGAAATCA -AAACCTGGTGAACCTT -AAACCTGGTGAAGGCT -AAACCTGGTGACAAAT -AAACCTGGTGACCAAG -AAACCTGGTGACGCCT -AAACCTGGTGACGGTA -AAACCTGGTGACTACT -AAACCTGGTGACTCAT -AAACCTGGTGAGCGAT -AAACCTGGTGAGGCTA -AAACCTGGTGAGGGAG -AAACCTGGTGAGGGTT -AAACCTGGTGAGTATA -AAACCTGGTGAGTGAC -AAACCTGGTGATAAAC -AAACCTGGTGATAAGT -AAACCTGGTGATGATA -AAACCTGGTGATGCCC -NAGGTGCCAGACACTT diff --git a/tools/scripts/sctools/src/sctools/test/data/cell-gene-umi-queryname-sorted.bam b/tools/scripts/sctools/src/sctools/test/data/cell-gene-umi-queryname-sorted.bam deleted file mode 100644 index f14155a7..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/cell-gene-umi-queryname-sorted.bam and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/cell-sorted-missing-cb.bam b/tools/scripts/sctools/src/sctools/test/data/cell-sorted-missing-cb.bam deleted file mode 100644 index 88d2b057..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/cell-sorted-missing-cb.bam and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/cell-sorted.bam b/tools/scripts/sctools/src/sctools/test/data/cell-sorted.bam deleted file mode 100644 index b76e76c3..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/cell-sorted.bam and /dev/null differ diff --git 
a/tools/scripts/sctools/src/sctools/test/data/cell_metrics_missing_cb.csv.gz b/tools/scripts/sctools/src/sctools/test/data/cell_metrics_missing_cb.csv.gz deleted file mode 100644 index 20a433db..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/cell_metrics_missing_cb.csv.gz and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/chr1.30k_records.gtf.gz b/tools/scripts/sctools/src/sctools/test/data/chr1.30k_records.gtf.gz deleted file mode 100644 index 36e6f0fa..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/chr1.30k_records.gtf.gz and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/expected_picard_group.csv b/tools/scripts/sctools/src/sctools/test/data/group_metrics/expected_picard_group.csv deleted file mode 100644 index 6b97c599..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/expected_picard_group.csv +++ /dev/null @@ -1,3 +0,0 @@ -,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_AL
IGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.SECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_B
ASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE -Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,Inser
tSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics -test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 diff --git 
a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2.csv b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2.csv deleted file mode 100644 index 17418654..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2.csv +++ /dev/null @@ -1,3 +0,0 @@ -,Aligned 0 time,Aligned 1 time,Aligned >1 times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total unpaired reads -Class,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G,HISAT2G -test,478,240,106,4414,652,412,1,95.64%,5479,824 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log deleted file mode 100644 index 982a1b65..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_paired_end_qc.log +++ /dev/null @@ -1,11 +0,0 @@ -HISAT2 summary stats: - Total pairs: 5479 - Aligned concordantly or discordantly 0 time: 412 (7.52%) - Aligned concordantly 1 time: 4414 (80.56%) - Aligned concordantly >1 times: 652 (11.90%) - Aligned discordantly 1 time: 1 (0.02%) - Total unpaired reads: 824 - Aligned 0 time: 478 (58.01%) - Aligned 1 time: 240 (29.13%) - Aligned >1 times: 106 (12.86%) - Overall alignment rate: 95.64% diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_trans.csv b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_trans.csv deleted file mode 100644 index e484efef..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_trans.csv +++ /dev/null @@ -1,3 +0,0 @@ -,Aligned 0 time,Aligned 1 time,Aligned >1 times,Aligned concordantly 1 time,Aligned concordantly >1 times,Aligned concordantly or discordantly 0 time,Aligned discordantly 1 time,Overall alignment rate,Total pairs,Total 
unpaired reads -Class,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T,HISAT2T -test,7270,0,0,360,1484,3635,0,33.66%,5479,7270 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log deleted file mode 100644 index 099ace2d..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_hisat2_transcriptome_rsem.log +++ /dev/null @@ -1,11 +0,0 @@ -HISAT2 summary stats: - Total pairs: 5479 - Aligned concordantly or discordantly 0 time: 3635 (66.34%) - Aligned concordantly 1 time: 360 (6.57%) - Aligned concordantly >1 times: 1484 (27.09%) - Aligned discordantly 1 time: 0 (0.00%) - Total unpaired reads: 7270 - Aligned 0 time: 7270 (100.00%) - Aligned 1 time: 0 (0.00%) - Aligned >1 times: 0 (0.00%) - Overall alignment rate: 33.66% diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_picard_group.csv b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_picard_group.csv deleted file mode 100644 index 6b97c599..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_picard_group.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-,BAD_CYCLES.FIRST_OF_PAIR,BAD_CYCLES.PAIR,BAD_CYCLES.SECOND_OF_PAIR,MEAN_READ_LENGTH.FIRST_OF_PAIR,MEAN_READ_LENGTH.PAIR,MEAN_READ_LENGTH.SECOND_OF_PAIR,PCT_ADAPTER.FIRST_OF_PAIR,PCT_ADAPTER.PAIR,PCT_ADAPTER.SECOND_OF_PAIR,PCT_CHIMERAS.FIRST_OF_PAIR,PCT_CHIMERAS.PAIR,PCT_CHIMERAS.SECOND_OF_PAIR,PCT_PF_READS.FIRST_OF_PAIR,PCT_PF_READS.PAIR,PCT_PF_READS.SECOND_OF_PAIR,PCT_PF_READS_ALIGNED.FIRST_OF_PAIR,PCT_PF_READS_ALIGNED.PAIR,PCT_PF_READS_ALIGNED.SECOND_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PCT_PF_READS_IMPROPER_PAIRS.PAIR,PCT_PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,PCT_READS_ALIGNED_IN_PAIRS.PAIR,PCT_READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,PF_ALIGNED_BASES.FIRST_OF_PAIR,PF_ALIGNED_BASES.PAIR,PF_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_BASES.PAIR,PF_HQ_ALIGNED_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.FIRST_OF_PAIR,PF_HQ_ALIGNED_Q20_BASES.PAIR,PF_HQ_ALIGNED_Q20_BASES.SECOND_OF_PAIR,PF_HQ_ALIGNED_READS.FIRST_OF_PAIR,PF_HQ_ALIGNED_READS.PAIR,PF_HQ_ALIGNED_READS.SECOND_OF_PAIR,PF_HQ_ERROR_RATE.FIRST_OF_PAIR,PF_HQ_ERROR_RATE.PAIR,PF_HQ_ERROR_RATE.SECOND_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.FIRST_OF_PAIR,PF_HQ_MEDIAN_MISMATCHES.PAIR,PF_HQ_MEDIAN_MISMATCHES.SECOND_OF_PAIR,PF_INDEL_RATE.FIRST_OF_PAIR,PF_INDEL_RATE.PAIR,PF_INDEL_RATE.SECOND_OF_PAIR,PF_MISMATCH_RATE.FIRST_OF_PAIR,PF_MISMATCH_RATE.PAIR,PF_MISMATCH_RATE.SECOND_OF_PAIR,PF_NOISE_READS.FIRST_OF_PAIR,PF_NOISE_READS.PAIR,PF_NOISE_READS.SECOND_OF_PAIR,PF_READS.FIRST_OF_PAIR,PF_READS.PAIR,PF_READS.SECOND_OF_PAIR,PF_READS_ALIGNED.FIRST_OF_PAIR,PF_READS_ALIGNED.PAIR,PF_READS_ALIGNED.SECOND_OF_PAIR,PF_READS_IMPROPER_PAIRS.FIRST_OF_PAIR,PF_READS_IMPROPER_PAIRS.PAIR,PF_READS_IMPROPER_PAIRS.SECOND_OF_PAIR,READS_ALIGNED_IN_PAIRS.FIRST_OF_PAIR,READS_ALIGNED_IN_PAIRS.PAIR,READS_ALIGNED_IN_PAIRS.SECOND_OF_PAIR,STRAND_BALANCE.FIRST_OF_PAIR,STRAND_BALANCE.PAIR,STRAND_BALANCE.SECOND_OF_PAIR,TOTAL_READS.FIRST_OF_PAIR,TOTAL_READS.PAIR,TOTAL_READS.S
ECOND_OF_PAIR,MAX_INSERT_SIZE,MEAN_INSERT_SIZE,MEDIAN_ABSOLUTE_DEVIATION,MEDIAN_INSERT_SIZE,MIN_INSERT_SIZE,PAIR_ORIENTATION,READ_PAIRS,STANDARD_DEVIATION,WIDTH_OF_10_PERCENT,WIDTH_OF_20_PERCENT,WIDTH_OF_30_PERCENT,WIDTH_OF_40_PERCENT,WIDTH_OF_50_PERCENT,WIDTH_OF_60_PERCENT,WIDTH_OF_70_PERCENT,WIDTH_OF_80_PERCENT,WIDTH_OF_90_PERCENT,WIDTH_OF_99_PERCENT,ESTIMATED_LIBRARY_SIZE,PERCENT_DUPLICATION,READ_PAIRS_EXAMINED,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,SECONDARY_OR_SUPPLEMENTARY_RDS,UNMAPPED_READS,UNPAIRED_READS_EXAMINED,UNPAIRED_READ_DUPLICATES,CODING_BASES,CORRECT_STRAND_READS,IGNORED_READS,INCORRECT_STRAND_READS,INTERGENIC_BASES,INTRONIC_BASES,MEDIAN_3PRIME_BIAS,MEDIAN_5PRIME_BIAS,MEDIAN_5PRIME_TO_3PRIME_BIAS,MEDIAN_CV_COVERAGE,NUM_R1_TRANSCRIPT_STRAND_READS,NUM_R2_TRANSCRIPT_STRAND_READS,NUM_UNEXPLAINED_READS,PCT_CODING_BASES,PCT_CORRECT_STRAND_READS,PCT_INTERGENIC_BASES,PCT_INTRONIC_BASES,PCT_MRNA_BASES,PCT_R1_TRANSCRIPT_STRAND_READS,PCT_R2_TRANSCRIPT_STRAND_READS,PCT_RIBOSOMAL_BASES,PCT_USABLE_BASES,PCT_UTR_BASES,PF_ALIGNED_BASES,PF_BASES,RIBOSOMAL_BASES,UTR_BASES,ACCUMULATION_LEVEL,ALIGNED_READS,AT_DROPOUT,GC_DROPOUT,GC_NC_0_19,GC_NC_20_39,GC_NC_40_59,GC_NC_60_79,GC_NC_80_100,READS_USED,TOTAL_CLUSTERS,WINDOW_SIZE 
-Class,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,AlignmentSummaryMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,InsertSizeMetrics,DuplicationMe
trics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,DuplicationMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,RnaSeqMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics,GcBiasSummaryMetrics -test,0.0,0.0,0.0,25.0,25.0,25.0,0.0,0.0,0.0,0.006141,0.006153,0.006165,1.0,1.0,1.0,0.959299,0.956379,0.953459,0.036149,0.033206,0.030245,0.966514,0.969466,0.972435,131124.0,261405.0,130281.0,116063.0,231550.0,115487.0,115095.0,229110.0,114015.0,4650.0,9279.0,4629.0,0.000922,0.000885,0.000849,0.0,0.0,0.0,6.9e-05,5.4e-05,3.8e-05,0.0009,0.000876,0.000852,0.0,0.0,0.0,5479.0,10958.0,5479.0,5256.0,10480.0,5224.0,190.0,348.0,158.0,5080.0,10160.0,5080.0,0.494292,0.501527,0.508806,5479.0,10958.0,5479.0,2725787,207.219528,63,191,33,FR,5067,106.256303,25,49,73,99,127,157,195,267,641,87835,612743.0,0.007156,5080.0,21.0,0.0,4393.0,478.0,320.0,33.0,56934.0,0.0,0.0,0.0,65569.0,101238.0,0.705663,0.680576,0.496023,0.939679,719.0,795.0,60.0,0.2178,0.0,0.250833,0.387284,0.361883,0.474901,0.525099,0.0,0.345311,0.144083,261405.0,273950.0,0.0,37664.0,All Reads,14873,10.733266,1.82225,0.112713,0.817807,1.086361,2.181453,0.143318,ALL,7701,100 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt deleted file mode 100644 index a1828311..00000000 --- 
a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.alignment_summary_metrics.txt +++ /dev/null @@ -1,12 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.AlignmentSummaryMetrics -CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP -FIRST_OF_PAIR 5479 5479 1 0 5256 0.959299 131124 4650 116063 115095 0 0.0009 0.000922 0.000069 25 5080 0.966514 190 0.036149 0 0.494292 0.006141 0 -SECOND_OF_PAIR 5479 5479 1 0 5224 
0.953459 130281 4629 115487 114015 0 0.000852 0.000849 0.000038 25 5080 0.972435 158 0.030245 0 0.508806 0.006165 0 -PAIR 10958 10958 1 0 10480 0.956379 261405 9279 231550 229110 0 0.000876 0.000885 0.000054 25 10160 0.969466 348 0.033206 0 0.501527 0.006153 0 - - diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt deleted file mode 100644 index c4f38f09..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.duplicate_metrics.txt +++ /dev/null @@ -1,8 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam] OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.MarkDuplicated.bam METRICS_FILE=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:51:46 UTC 2018 - -## METRICS CLASS picard.sam.DuplicationMetrics -LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED 
SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE -d20fb2dd-3d98-4516-a648-dee5e1917bd7 320 5080 4393 478 33 21 0 0.007156 612743 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt deleted file mode 100644 index c0050359..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.error_summary_metrics.txt +++ /dev/null @@ -1,15 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.artifacts.ErrorSummaryMetrics -REF_BASE ALT_BASE SUBSTITUTION REF_COUNT ALT_COUNT SUBSTITUTION_RATE -A C A>C 231512 16 0.000069 -A G A>G 231512 156 0.000673 -A T 
A>T 231512 16 0.000069 -C A C>A 173880 16 0.000092 -C G C>G 173880 14 0.000081 -C T C>T 173880 82 0.000471 - - diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt deleted file mode 100644 index 934a84ba..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.gc_bias.summary_metrics.txt +++ /dev/null @@ -1,10 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.GcBiasSummaryMetrics -ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP -All Reads ALL 100 7701 14873 10.733266 1.82225 0.112713 0.817807 1.086361 2.181453 
0.143318 - - diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt deleted file mode 100644 index 160eb300..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.insert_size_metrics.txt +++ /dev/null @@ -1,8 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam ASSUME_SORTED=true OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:52:21 UTC 2018 - -## METRICS CLASS picard.analysis.InsertSizeMetrics -MEDIAN_INSERT_SIZE MEDIAN_ABSOLUTE_DEVIATION MIN_INSERT_SIZE MAX_INSERT_SIZE MEAN_INSERT_SIZE STANDARD_DEVIATION READ_PAIRS PAIR_ORIENTATION WIDTH_OF_10_PERCENT WIDTH_OF_20_PERCENT WIDTH_OF_30_PERCENT WIDTH_OF_40_PERCENT WIDTH_OF_50_PERCENT WIDTH_OF_60_PERCENT WIDTH_OF_70_PERCENT WIDTH_OF_80_PERCENT WIDTH_OF_90_PERCENT WIDTH_OF_99_PERCENT SAMPLE LIBRARY READ_GROUP -191 63 33 
2725787 207.219528 106.256303 5067 FR 25 49 73 99 127 157 195 267 641 87835 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.rna_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.rna_metrics.txt deleted file mode 100644 index f7a52c62..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_qc.rna_metrics.txt +++ /dev/null @@ -1,8 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectRnaSeqMetrics REF_FLAT=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell_root/hca-dcp-mint-test-data/reference/GRCh38_Gencode/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell_root/broad-dsde-mint-test-cromwell-execution/caas-cromwell-executions/AdapterSmartSeq2SingleCell/059f3f7f-844a-44e7-addb-3a3b9e534559/call-analysis/ss2.SmartSeq2SingleCell/aeb598b8-b8e3-4e04-8ba7-b124f4203d04/call-HISAT2PairedEnd/d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.bam OUTPUT=d20fb2dd-3d98-4516-a648-dee5e1917bd7_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Thu Aug 30 20:51:55 UTC 2018 - -## METRICS CLASS picard.analysis.RnaSeqMetrics -PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES 
PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP -273950 261405 0 56934 37664 101238 65569 0 0 0 719 795 60 0.474901 0.525099 0 0.2178 0.144083 0.387284 0.250833 0.361883 0.345311 0 0.939679 0.680576 0.705663 0.496023 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_rsem.cnt b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_rsem.cnt deleted file mode 100644 index 3ee8b723..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_rsem.cnt +++ /dev/null @@ -1,15 +0,0 @@ -3635 1844 0 5479 -1652 192 1484 -6599 3 -0 3635 -1 360 -2 327 -3 416 -4 243 -5 185 -6 85 -7 76 -8 53 -9 16 -10 83 -Inf 0 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_rsem.csv b/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_rsem.csv deleted file mode 100644 index fc0afb09..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics/test_rsem.csv +++ /dev/null @@ -1,3 +0,0 @@ -,alignable reads,filtered reads,multiple mapped,strand,total alignments,total reads,unalignable reads,uncertain reads,unique aligned -Class,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM,RSEM -test,1844,0,192,3,6599,5479,3635,1484,1652 diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt deleted file mode 100644 index 1559f3e7..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt +++ /dev/null @@ -1,10 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics 
INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:18 UTC 2019 - -## METRICS CLASS picard.analysis.AlignmentSummaryMetrics -CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP -UNPAIRED 1086652 1086652 1 0 770963 0.709485 38213614 697232 34613985 34073804 0 0.002624 0.002357 0.000149 50 0 0 0 0 0 0.501303 0 0.000027 - - diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt deleted file mode 100644 index 661fa797..00000000 --- 
a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt +++ /dev/null @@ -1,10 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# MarkDuplicates INPUT=[/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectDuplicationMetrics/inputs/-1585165421/SRR6258488_qc.bam] OUTPUT=SRR6258488_qc.MarkDuplicated.bam METRICS_FILE=SRR6258488_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:17 UTC 2019 - -## METRICS CLASS picard.sam.DuplicationMetrics -LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE -SRR6258488 770963 0 473100 315689 345396 0 0 0.448006 - - diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt deleted file mode 100644 index 26669a77..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt +++ /dev/null @@ -1,10 +0,0 @@ -## 
htsjdk.samtools.metrics.StringHeader -# CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:18 UTC 2019 - -## METRICS CLASS picard.analysis.GcBiasSummaryMetrics -ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP -All Reads ALL 100 1559752 1244063 13.760859 1.1878 0.219754 0.753171 1.281724 0.883386 0.021428 - - diff --git a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt b/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt deleted file mode 100644 index 43831064..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt +++ /dev/null @@ -1,113 +0,0 @@ -## htsjdk.samtools.metrics.StringHeader -# CollectRnaSeqMetrics 
REF_FLAT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=SRR6258488_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-1585165421/SRR6258488_qc.bam OUTPUT=SRR6258488_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false -## htsjdk.samtools.metrics.StringHeader -# Started on: Tue May 14 15:45:18 UTC 2019 - -## METRICS CLASS picard.analysis.RnaSeqMetrics -PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP -54332600 38213614 0 371628 1152265 18630585 18059136 0 0 0 12352 12891 538 0.489324 0.510676 0 0.009725 0.030153 0.487538 0.472584 0.039878 0.028047 0 2.183917 0 0 0 - -## HISTOGRAM java.lang.Integer -normalized_position All_Reads.normalized_coverage -0 1.252653 -1 1.146108 -2 1.065068 -3 1.122433 -4 1.234516 -5 1.247113 -6 1.2191 -7 1.08917 -8 1.101883 -9 1.130302 -10 
1.082888 -11 1.146879 -12 1.173149 -13 1.084206 -14 1.035169 -15 1.169359 -16 1.278125 -17 1.298059 -18 1.418038 -19 1.468055 -20 1.306559 -21 1.210198 -22 0.953958 -23 0.806139 -24 0.815513 -25 0.887045 -26 0.763414 -27 0.737914 -28 0.702678 -29 0.689913 -30 0.633512 -31 0.665368 -32 0.682949 -33 0.848599 -34 0.941722 -35 1.082228 -36 1.113449 -37 1.049003 -38 0.97788 -39 0.989931 -40 0.92986 -41 0.874432 -42 0.87788 -43 0.868871 -44 0.92942 -45 1.015775 -46 1.070114 -47 1.023889 -48 1.023103 -49 0.988576 -50 0.931694 -51 0.794716 -52 0.765784 -53 0.721218 -54 0.723223 -55 0.711507 -56 0.704034 -57 0.694139 -58 0.741844 -59 0.831505 -60 0.806244 -61 0.869419 -62 0.987354 -63 0.954176 -64 0.925553 -65 0.951851 -66 0.906269 -67 0.85666 -68 0.985052 -69 0.947861 -70 0.98528 -71 0.873541 -72 0.87925 -73 0.956294 -74 1.137028 -75 1.206313 -76 1.148145 -77 1.159051 -78 1.207689 -79 1.170334 -80 1.199969 -81 1.391121 -82 1.243649 -83 1.235795 -84 1.227105 -85 1.278662 -86 1.298065 -87 1.201038 -88 1.2361 -89 1.098932 -90 1.042881 -91 1.037875 -92 0.95545 -93 0.969215 -94 1.059149 -95 0.857316 -96 0.792585 -97 0.817511 -98 0.880909 -99 0.786114 -100 0.548663 - diff --git a/tools/scripts/sctools/src/sctools/test/data/small-cell-sorted.bam b/tools/scripts/sctools/src/sctools/test/data/small-cell-sorted.bam deleted file mode 100644 index a44c1ff9..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/small-cell-sorted.bam and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/small-gene-sorted.bam b/tools/scripts/sctools/src/sctools/test/data/small-gene-sorted.bam deleted file mode 100644 index 9773d658..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/small-gene-sorted.bam and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/test.bam b/tools/scripts/sctools/src/sctools/test/data/test.bam deleted file mode 100644 index 75db9802..00000000 Binary files 
a/tools/scripts/sctools/src/sctools/test/data/test.bam and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/test.gtf b/tools/scripts/sctools/src/sctools/test/data/test.gtf deleted file mode 100644 index 79561f37..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/test.gtf +++ /dev/null @@ -1,109 +0,0 @@ -# truncated chromosome 19 genome used for testing util package ONLY -# created Aug 22, 2017 by Ambrose J Carr -chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8"; -chr19 HAVANA transcript 60951 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA exon 70928 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 1; exon_id "ENSE00003781173.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA exon 66346 66499 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA exon 60951 61894 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 3; exon_id "ENSE00003783010.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2"; -chr19 HAVANA transcript 62113 66524 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633719.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-009"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475086.2"; -chr19 HAVANA exon 62113 66524 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633719.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-009"; exon_number 1; exon_id "ENSE00003783013.1"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475086.2"; -chr19 HAVANA transcript 63821 70951 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA exon 70928 70951 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; exon_number 1; exon_id "ENSE00003782721.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA exon 66346 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA exon 63821 64213 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633703.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-010"; exon_number 3; exon_id "ENSE00003781018.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471181.1"; -chr19 HAVANA transcript 65051 66382 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000634023.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-011"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471182.4"; -chr19 HAVANA exon 66346 66382 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000634023.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-011"; exon_number 1; exon_id "ENSE00003778074.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471182.4"; -chr19 HAVANA exon 65051 65226 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000634023.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-011"; exon_number 2; exon_id "ENSE00003782150.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471182.4"; -chr19 HAVANA transcript 65822 66420 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632496.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-005"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475088.2"; -chr19 HAVANA exon 66346 66420 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632496.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-005"; exon_number 1; exon_id "ENSE00003780450.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475088.2"; -chr19 HAVANA exon 65822 66133 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632496.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-005"; exon_number 2; exon_id "ENSE00003782888.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475088.2"; -chr19 HAVANA transcript 65822 70945 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA exon 70928 70945 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; exon_number 1; exon_id "ENSE00003776564.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA exon 66346 66499 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA exon 65822 66047 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632089.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-003"; exon_number 3; exon_id "ENSE00003779454.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471183.4"; -chr19 HAVANA transcript 65822 70963 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631796.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-004"; level 2; transcript_support_level "2"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475089.2"; -chr19 HAVANA exon 70928 70963 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631796.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-004"; exon_number 1; exon_id "ENSE00003775509.1"; level 2; transcript_support_level "2"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475089.2"; -chr19 HAVANA exon 65822 66499 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000631796.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-004"; exon_number 2; exon_id "ENSE00003783427.1"; level 2; transcript_support_level "2"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475089.2"; -chr19 HAVANA transcript 66320 66492 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633742.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WASH5P-001"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000451423.2"; -chr19 HAVANA exon 66320 66492 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000633742.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WASH5P-001"; exon_number 1; exon_id "ENSE00003779144.1"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000451423.2"; -chr19 HAVANA transcript 66378 71566 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631994.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-006"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475090.1"; -chr19 HAVANA exon 71141 71566 . - . 
gene_id "ENSG00000282458.1"; transcript_id "ENST00000631994.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-006"; exon_number 1; exon_id "ENSE00003776913.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475090.1"; -chr19 HAVANA exon 66378 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000631994.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-006"; exon_number 2; exon_id "ENSE00003775972.1"; level 2; transcript_support_level "5"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000475090.1"; -chr19 HAVANA transcript 70652 71626 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632292.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-012"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471218.1"; -chr19 HAVANA exon 70652 71626 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632292.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "WASH5P-012"; exon_number 1; exon_id "ENSE00003783546.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471218.1"; -chr19 HAVANA gene 68403 69178 . + . gene_id "ENSG00000282542.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "AC008993.2"; level 2; havana_gene "OTTHUMG00000180450.4"; -chr19 HAVANA transcript 68403 69178 . + . 
gene_id "ENSG00000282542.1"; transcript_id "ENST00000632280.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "AC008993.2"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "AC008993.2-001"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180450.4"; havana_transcript "OTTHUMT00000451405.4"; -chr19 HAVANA exon 68403 69178 . + . gene_id "ENSG00000282542.1"; transcript_id "ENST00000632280.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "AC008993.2"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "AC008993.2-001"; exon_number 1; exon_id "ENSE00003776314.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180450.4"; havana_transcript "OTTHUMT00000451405.4"; -chr19 HAVANA gene 69167 69972 . + . gene_id "ENSG00000282798.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "LLNLR-222A1.1"; level 2; havana_gene "OTTHUMG00000190399.1"; -chr19 HAVANA transcript 69167 69972 . + . gene_id "ENSG00000282798.1"; transcript_id "ENST00000631744.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "LLNLR-222A1.1"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "LLNLR-222A1.1-001"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190399.1"; havana_transcript "OTTHUMT00000484821.1"; -chr19 HAVANA exon 69167 69972 . + . gene_id "ENSG00000282798.1"; transcript_id "ENST00000631744.1"; gene_type "TEC"; gene_status "KNOWN"; gene_name "LLNLR-222A1.1"; transcript_type "TEC"; transcript_status "KNOWN"; transcript_name "LLNLR-222A1.1-001"; exon_number 1; exon_id "ENSE00003780024.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190399.1"; havana_transcript "OTTHUMT00000484821.1"; -chr19 HAVANA gene 71778 72718 . + . gene_id "ENSG00000282807.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; level 2; tag "ncRNA_host"; havana_gene "OTTHUMG00000180451.3"; -chr19 HAVANA transcript 71778 72718 . + . 
gene_id "ENSG00000282807.1"; transcript_id "ENST00000633603.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC008993.3-001"; level 2; tag "basic"; transcript_support_level "5"; havana_gene "OTTHUMG00000180451.3"; havana_transcript "OTTHUMT00000451407.2"; -chr19 HAVANA exon 71778 72274 . + . gene_id "ENSG00000282807.1"; transcript_id "ENST00000633603.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC008993.3-001"; exon_number 1; exon_id "ENSE00003776113.1"; level 2; tag "basic"; transcript_support_level "5"; havana_gene "OTTHUMG00000180451.3"; havana_transcript "OTTHUMT00000451407.2"; -chr19 HAVANA exon 72585 72718 . + . gene_id "ENSG00000282807.1"; transcript_id "ENST00000633603.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC008993.3"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC008993.3-001"; exon_number 2; exon_id "ENSE00003783209.1"; level 2; tag "basic"; transcript_support_level "5"; havana_gene "OTTHUMG00000180451.3"; havana_transcript "OTTHUMT00000451407.2"; -chr19 ENSEMBL gene 71973 72110 . + . gene_id "ENSG00000275604.1"; gene_type "miRNA"; gene_status "KNOWN"; gene_name "MIR1302-11"; level 3; -chr19 ENSEMBL transcript 71973 72110 . + . gene_id "ENSG00000275604.1"; transcript_id "ENST00000408051.1"; gene_type "miRNA"; gene_status "KNOWN"; gene_name "MIR1302-11"; transcript_type "miRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-201"; level 3; tag "basic"; transcript_support_level "NA"; -chr19 ENSEMBL exon 71973 72110 . + . 
gene_id "ENSG00000275604.1"; transcript_id "ENST00000408051.1"; gene_type "miRNA"; gene_status "KNOWN"; gene_name "MIR1302-11"; transcript_type "miRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-201"; exon_number 1; exon_id "ENSE00001808054.1"; level 3; tag "basic"; transcript_support_level "NA"; -chr19 HAVANA gene 76163 77686 . - . gene_id "ENSG00000282591.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; level 2; havana_gene "OTTHUMG00000180467.4"; -chr19 HAVANA transcript 76163 77686 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA exon 77330 77686 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; exon_number 1; exon_id "ENSE00003778121.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA exon 76886 77090 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; exon_number 2; exon_id "ENSE00003783139.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA exon 76163 76783 . - . 
gene_id "ENSG00000282591.1"; transcript_id "ENST00000631376.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-001"; exon_number 3; exon_id "ENSE00003778696.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451424.4"; -chr19 HAVANA transcript 76220 77659 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000632948.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-002"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451425.3"; -chr19 HAVANA exon 77330 77659 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000632948.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-002"; exon_number 1; exon_id "ENSE00003779597.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451425.3"; -chr19 HAVANA exon 76220 77090 . - . gene_id "ENSG00000282591.1"; transcript_id "ENST00000632948.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138F"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138F-002"; exon_number 2; exon_id "ENSE00003782175.1"; level 2; transcript_support_level "3"; havana_gene "OTTHUMG00000180467.4"; havana_transcript "OTTHUMT00000451425.3"; -chr19 HAVANA gene 94062 94974 . + . gene_id "ENSG00000282137.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G3P"; level 2; havana_gene "OTTHUMG00000180452.2"; -chr19 HAVANA transcript 94062 94974 . + . 
gene_id "ENSG00000282137.1"; transcript_id "ENST00000633500.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G3P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G3P-001"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180452.2"; havana_transcript "OTTHUMT00000451408.2"; -chr19 HAVANA exon 94062 94974 . + . gene_id "ENSG00000282137.1"; transcript_id "ENST00000633500.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G3P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G3P-001"; exon_number 1; exon_id "ENSE00003781724.1"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180452.2"; havana_transcript "OTTHUMT00000451408.2"; -chr19 HAVANA gene 104535 105471 . + . gene_id "ENSG00000267310.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G1P"; level 2; havana_gene "OTTHUMG00000180453.1"; -chr19 HAVANA transcript 104535 105471 . + . gene_id "ENSG00000267310.1"; transcript_id "ENST00000588632.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G1P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G1P-001"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180453.1"; havana_transcript "OTTHUMT00000451409.1"; -chr19 HAVANA exon 104535 105471 . + . gene_id "ENSG00000267310.1"; transcript_id "ENST00000588632.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4G1P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4G1P-001"; exon_number 1; exon_id "ENSE00002952394.1"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180453.1"; havana_transcript "OTTHUMT00000451409.1"; -chr19 HAVANA gene 107461 111696 . + . 
gene_id "ENSG00000176695.6"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; level 2; havana_gene "OTTHUMG00000180454.2"; -chr19 HAVANA transcript 107461 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA exon 107461 107555 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 1; exon_id "ENSE00002825729.2"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA exon 110625 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA CDS 110679 111593 . 
+ 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA start_codon 110679 110681 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA stop_codon 111594 111596 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA UTR 107461 107555 . + . 
gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 1; exon_id "ENSE00002825729.2"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA UTR 110625 110678 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA UTR 111594 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000585993.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-001"; exon_number 2; exon_id "ENSE00002973945.1"; level 2; protein_id "ENSP00000467301.1"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000451410.2"; -chr19 HAVANA transcript 110613 111417 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000618231.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "OR4F17-002"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000475091.1"; -chr19 HAVANA exon 110613 111417 . + . 
gene_id "ENSG00000176695.6"; transcript_id "ENST00000618231.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "retained_intron"; transcript_status "KNOWN"; transcript_name "OR4F17-002"; exon_number 1; exon_id "ENSE00003719758.1"; level 2; transcript_support_level "NA"; havana_gene "OTTHUMG00000180454.2"; havana_transcript "OTTHUMT00000475091.1"; -chr19 ENSEMBL transcript 110643 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL exon 110643 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL CDS 110679 111593 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL start_codon 110679 110681 . 
+ 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL stop_codon 111594 111596 . + 0 gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL UTR 110643 110678 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 ENSEMBL UTR 111594 111696 . + . gene_id "ENSG00000176695.6"; transcript_id "ENST00000318050.4"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F17"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F17-201"; exon_number 1; exon_id "ENSE00002309998.2"; level 3; protein_id "ENSP00000315047.3"; tag "basic"; transcript_support_level "NA"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS32854.1"; havana_gene "OTTHUMG00000180454.2"; -chr19 HAVANA gene 145485 145812 . + . 
gene_id "ENSG00000267792.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "WBP1LP11"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000180455.1"; -chr19 HAVANA transcript 145485 145812 . + . gene_id "ENSG00000267792.1"; transcript_id "ENST00000586141.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "WBP1LP11"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WBP1LP11-001"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180455.1"; havana_transcript "OTTHUMT00000451411.1"; -chr19 HAVANA exon 145485 145812 . + . gene_id "ENSG00000267792.1"; transcript_id "ENST00000586141.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "WBP1LP11"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "WBP1LP11-001"; exon_number 1; exon_id "ENSE00002835239.1"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180455.1"; havana_transcript "OTTHUMT00000451411.1"; -chr19 HAVANA gene 156279 157215 . - . gene_id "ENSG00000266971.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4F8P"; level 2; havana_gene "OTTHUMG00000180456.1"; -chr19 HAVANA transcript 156279 157215 . - . gene_id "ENSG00000266971.1"; transcript_id "ENST00000589943.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4F8P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4F8P-001"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180456.1"; havana_transcript "OTTHUMT00000451412.1"; -chr19 HAVANA exon 156279 157215 . - . 
gene_id "ENSG00000266971.1"; transcript_id "ENST00000589943.1"; gene_type "unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "OR4F8P"; transcript_type "unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "OR4F8P-001"; exon_number 1; exon_id "ENSE00002966057.1"; level 2; ont "PGO:0000005"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180456.1"; havana_transcript "OTTHUMT00000451412.1"; -chr19 HAVANA gene 176896 177913 . + . gene_id "ENSG00000282535.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC092192.1"; level 2; havana_gene "OTTHUMG00000180458.2"; -chr19 HAVANA transcript 176896 177913 . + . gene_id "ENSG00000282535.1"; transcript_id "ENST00000633154.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC092192.1"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC092192.1-001"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180458.2"; havana_transcript "OTTHUMT00000451414.2"; -chr19 HAVANA exon 176896 177913 . + . gene_id "ENSG00000282535.1"; transcript_id "ENST00000633154.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "AC092192.1"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "AC092192.1-001"; exon_number 1; exon_id "ENSE00003777312.1"; level 2; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180458.2"; havana_transcript "OTTHUMT00000451414.2"; -chr19 HAVANA gene 186373 195696 . - . gene_id "ENSG00000281379.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; level 2; havana_gene "OTTHUMG00000180460.8"; -chr19 HAVANA transcript 186373 191429 . - . 
gene_id "ENSG00000281379.2"; transcript_id "ENST00000632397.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-003"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475096.2"; -chr19 HAVANA exon 191186 191429 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000632397.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-003"; exon_number 1; exon_id "ENSE00003777503.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475096.2"; -chr19 HAVANA exon 186373 186498 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000632397.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-003"; exon_number 2; exon_id "ENSE00003778733.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475096.2"; -chr19 HAVANA transcript 191115 191325 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000633205.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "SEPT14P19-001"; level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000451416.7"; -chr19 HAVANA exon 191115 191325 . - . 
gene_id "ENSG00000281379.2"; transcript_id "ENST00000633205.1"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_status "KNOWN"; transcript_name "SEPT14P19-001"; exon_number 1; exon_id "ENSE00003775583.1"; level 2; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000451416.7"; -chr19 HAVANA transcript 191212 195696 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000587432.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-002"; level 2; tag "basic"; transcript_support_level "2"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475095.2"; -chr19 HAVANA exon 195504 195696 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000587432.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-002"; exon_number 1; exon_id "ENSE00002880392.5"; level 2; tag "basic"; transcript_support_level "2"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475095.2"; -chr19 HAVANA exon 191212 191354 . - . gene_id "ENSG00000281379.2"; transcript_id "ENST00000587432.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_status "KNOWN"; gene_name "SEPT14P19"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SEPT14P19-002"; exon_number 2; exon_id "ENSE00002672754.6"; level 2; tag "basic"; transcript_support_level "2"; havana_gene "OTTHUMG00000180460.8"; havana_transcript "OTTHUMT00000475095.2"; -chr19 HAVANA gene 193239 195595 . + . 
gene_id "ENSG00000282059.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "CICP19"; level 1; tag "pseudo_consens"; tag "overlapping_locus"; havana_gene "OTTHUMG00000180463.7"; -chr19 HAVANA transcript 193239 195595 . + . gene_id "ENSG00000282059.1"; transcript_id "ENST00000632944.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "CICP19"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "CICP19-001"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180463.7"; havana_transcript "OTTHUMT00000451419.7"; -chr19 HAVANA exon 193239 195595 . + . gene_id "ENSG00000282059.1"; transcript_id "ENST00000632944.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "CICP19"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "CICP19-001"; exon_number 1; exon_id "ENSE00003779877.1"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000180463.7"; havana_transcript "OTTHUMT00000451419.7"; -chr19 HAVANA gene 197310 198066 . - . gene_id "ENSG00000282416.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.2"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000190442.1"; -chr19 HAVANA transcript 197310 198066 . - . gene_id "ENSG00000282416.1"; transcript_id "ENST00000632679.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.2"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.2-001"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190442.1"; havana_transcript "OTTHUMT00000484997.1"; -chr19 HAVANA exon 197310 198066 . - . 
gene_id "ENSG00000282416.1"; transcript_id "ENST00000632679.1"; gene_type "processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.2"; transcript_type "processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.2-001"; exon_number 1; exon_id "ENSE00003778188.1"; level 1; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000190442.1"; havana_transcript "OTTHUMT00000484997.1"; -chr19 HAVANA gene 197961 200775 . + . gene_id "ENSG00000282051.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; level 2; havana_gene "OTTHUMG00000182072.3"; -chr19 HAVANA transcript 197961 198396 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633895.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-001"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000484998.2"; -chr19 HAVANA exon 197961 198396 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633895.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "transcribed_processed_pseudogene"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-001"; exon_number 1; exon_id "ENSE00003783880.1"; level 2; ont "PGO:0000004"; ont "PGO:0000019"; tag "basic"; transcript_support_level "NA"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000484998.2"; -chr19 HAVANA transcript 198052 200775 . + . 
gene_id "ENSG00000282051.1"; transcript_id "ENST00000633286.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-002"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000459134.1"; -chr19 HAVANA exon 198052 198234 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633286.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-002"; exon_number 1; exon_id "ENSE00003777852.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000459134.1"; -chr19 HAVANA exon 200578 200775 . + . gene_id "ENSG00000282051.1"; transcript_id "ENST00000633286.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "LLNLF-173C4.1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "LLNLF-173C4.1-002"; exon_number 2; exon_id "ENSE00003780775.1"; level 2; tag "basic"; transcript_support_level "3"; havana_gene "OTTHUMG00000182072.3"; havana_transcript "OTTHUMT00000459134.1"; diff --git a/tools/scripts/sctools/src/sctools/test/data/test.gtf.bz2 b/tools/scripts/sctools/src/sctools/test/data/test.gtf.bz2 deleted file mode 100644 index 5800fce8..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/test.gtf.bz2 and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/test.gtf.gz b/tools/scripts/sctools/src/sctools/test/data/test.gtf.gz deleted file mode 100644 index 2b954059..00000000 Binary files a/tools/scripts/sctools/src/sctools/test/data/test.gtf.gz and /dev/null differ diff --git a/tools/scripts/sctools/src/sctools/test/data/test.sam 
b/tools/scripts/sctools/src/sctools/test/data/test.sam deleted file mode 100644 index 928bcab1..00000000 --- a/tools/scripts/sctools/src/sctools/test/data/test.sam +++ /dev/null @@ -1,805 +0,0 @@ -@HD VN:1.4 SO:coordinate -@SQ SN:1 LN:248956422 -@SQ SN:10 LN:133797422 -@SQ SN:11 LN:135086622 -@SQ SN:12 LN:133275309 -@SQ SN:13 LN:114364328 -@SQ SN:14 LN:107043718 -@SQ SN:15 LN:101991189 -@SQ SN:16 LN:90338345 -@SQ SN:17 LN:83257441 -@SQ SN:18 LN:80373285 -@SQ SN:19 LN:58617616 -@SQ SN:2 LN:242193529 -@SQ SN:20 LN:64444167 -@SQ SN:21 LN:46709983 -@SQ SN:22 LN:50818468 -@SQ SN:3 LN:198295559 -@SQ SN:4 LN:190214555 -@SQ SN:5 LN:181538259 -@SQ SN:6 LN:170805979 -@SQ SN:7 LN:159345973 -@SQ SN:8 LN:145138636 -@SQ SN:9 LN:138394717 -@SQ SN:MT LN:16569 -@SQ SN:X LN:156040895 -@SQ SN:Y LN:57227415 -@SQ SN:KI270728.1 LN:1872759 -@SQ SN:KI270727.1 LN:448248 -@SQ SN:KI270442.1 LN:392061 -@SQ SN:KI270729.1 LN:280839 -@SQ SN:GL000225.1 LN:211173 -@SQ SN:KI270743.1 LN:210658 -@SQ SN:GL000008.2 LN:209709 -@SQ SN:GL000009.2 LN:201709 -@SQ SN:KI270747.1 LN:198735 -@SQ SN:KI270722.1 LN:194050 -@SQ SN:GL000194.1 LN:191469 -@SQ SN:KI270742.1 LN:186739 -@SQ SN:GL000205.2 LN:185591 -@SQ SN:GL000195.1 LN:182896 -@SQ SN:KI270736.1 LN:181920 -@SQ SN:KI270733.1 LN:179772 -@SQ SN:GL000224.1 LN:179693 -@SQ SN:GL000219.1 LN:179198 -@SQ SN:KI270719.1 LN:176845 -@SQ SN:GL000216.2 LN:176608 -@SQ SN:KI270712.1 LN:176043 -@SQ SN:KI270706.1 LN:175055 -@SQ SN:KI270725.1 LN:172810 -@SQ SN:KI270744.1 LN:168472 -@SQ SN:KI270734.1 LN:165050 -@SQ SN:GL000213.1 LN:164239 -@SQ SN:GL000220.1 LN:161802 -@SQ SN:KI270715.1 LN:161471 -@SQ SN:GL000218.1 LN:161147 -@SQ SN:KI270749.1 LN:158759 -@SQ SN:KI270741.1 LN:157432 -@SQ SN:GL000221.1 LN:155397 -@SQ SN:KI270716.1 LN:153799 -@SQ SN:KI270731.1 LN:150754 -@SQ SN:KI270751.1 LN:150742 -@SQ SN:KI270750.1 LN:148850 -@SQ SN:KI270519.1 LN:138126 -@SQ SN:GL000214.1 LN:137718 -@SQ SN:KI270708.1 LN:127682 -@SQ SN:KI270730.1 LN:112551 -@SQ SN:KI270438.1 LN:112505 -@SQ 
SN:KI270737.1 LN:103838 -@SQ SN:KI270721.1 LN:100316 -@SQ SN:KI270738.1 LN:99375 -@SQ SN:KI270748.1 LN:93321 -@SQ SN:KI270435.1 LN:92983 -@SQ SN:GL000208.1 LN:92689 -@SQ SN:KI270538.1 LN:91309 -@SQ SN:KI270756.1 LN:79590 -@SQ SN:KI270739.1 LN:73985 -@SQ SN:KI270757.1 LN:71251 -@SQ SN:KI270709.1 LN:66860 -@SQ SN:KI270746.1 LN:66486 -@SQ SN:KI270753.1 LN:62944 -@SQ SN:KI270589.1 LN:44474 -@SQ SN:KI270726.1 LN:43739 -@SQ SN:KI270735.1 LN:42811 -@SQ SN:KI270711.1 LN:42210 -@SQ SN:KI270745.1 LN:41891 -@SQ SN:KI270714.1 LN:41717 -@SQ SN:KI270732.1 LN:41543 -@SQ SN:KI270713.1 LN:40745 -@SQ SN:KI270754.1 LN:40191 -@SQ SN:KI270710.1 LN:40176 -@SQ SN:KI270717.1 LN:40062 -@SQ SN:KI270724.1 LN:39555 -@SQ SN:KI270720.1 LN:39050 -@SQ SN:KI270723.1 LN:38115 -@SQ SN:KI270718.1 LN:38054 -@SQ SN:KI270317.1 LN:37690 -@SQ SN:KI270740.1 LN:37240 -@SQ SN:KI270755.1 LN:36723 -@SQ SN:KI270707.1 LN:32032 -@SQ SN:KI270579.1 LN:31033 -@SQ SN:KI270752.1 LN:27745 -@SQ SN:KI270512.1 LN:22689 -@SQ SN:KI270322.1 LN:21476 -@SQ SN:GL000226.1 LN:15008 -@SQ SN:KI270311.1 LN:12399 -@SQ SN:KI270366.1 LN:8320 -@SQ SN:KI270511.1 LN:8127 -@SQ SN:KI270448.1 LN:7992 -@SQ SN:KI270521.1 LN:7642 -@SQ SN:KI270581.1 LN:7046 -@SQ SN:KI270582.1 LN:6504 -@SQ SN:KI270515.1 LN:6361 -@SQ SN:KI270588.1 LN:6158 -@SQ SN:KI270591.1 LN:5796 -@SQ SN:KI270522.1 LN:5674 -@SQ SN:KI270507.1 LN:5353 -@SQ SN:KI270590.1 LN:4685 -@SQ SN:KI270584.1 LN:4513 -@SQ SN:KI270320.1 LN:4416 -@SQ SN:KI270382.1 LN:4215 -@SQ SN:KI270468.1 LN:4055 -@SQ SN:KI270467.1 LN:3920 -@SQ SN:KI270362.1 LN:3530 -@SQ SN:KI270517.1 LN:3253 -@SQ SN:KI270593.1 LN:3041 -@SQ SN:KI270528.1 LN:2983 -@SQ SN:KI270587.1 LN:2969 -@SQ SN:KI270364.1 LN:2855 -@SQ SN:KI270371.1 LN:2805 -@SQ SN:KI270333.1 LN:2699 -@SQ SN:KI270374.1 LN:2656 -@SQ SN:KI270411.1 LN:2646 -@SQ SN:KI270414.1 LN:2489 -@SQ SN:KI270510.1 LN:2415 -@SQ SN:KI270390.1 LN:2387 -@SQ SN:KI270375.1 LN:2378 -@SQ SN:KI270420.1 LN:2321 -@SQ SN:KI270509.1 LN:2318 -@SQ SN:KI270315.1 LN:2276 -@SQ SN:KI270302.1 
LN:2274 -@SQ SN:KI270518.1 LN:2186 -@SQ SN:KI270530.1 LN:2168 -@SQ SN:KI270304.1 LN:2165 -@SQ SN:KI270418.1 LN:2145 -@SQ SN:KI270424.1 LN:2140 -@SQ SN:KI270417.1 LN:2043 -@SQ SN:KI270508.1 LN:1951 -@SQ SN:KI270303.1 LN:1942 -@SQ SN:KI270381.1 LN:1930 -@SQ SN:KI270529.1 LN:1899 -@SQ SN:KI270425.1 LN:1884 -@SQ SN:KI270396.1 LN:1880 -@SQ SN:KI270363.1 LN:1803 -@SQ SN:KI270386.1 LN:1788 -@SQ SN:KI270465.1 LN:1774 -@SQ SN:KI270383.1 LN:1750 -@SQ SN:KI270384.1 LN:1658 -@SQ SN:KI270330.1 LN:1652 -@SQ SN:KI270372.1 LN:1650 -@SQ SN:KI270548.1 LN:1599 -@SQ SN:KI270580.1 LN:1553 -@SQ SN:KI270387.1 LN:1537 -@SQ SN:KI270391.1 LN:1484 -@SQ SN:KI270305.1 LN:1472 -@SQ SN:KI270373.1 LN:1451 -@SQ SN:KI270422.1 LN:1445 -@SQ SN:KI270316.1 LN:1444 -@SQ SN:KI270340.1 LN:1428 -@SQ SN:KI270338.1 LN:1428 -@SQ SN:KI270583.1 LN:1400 -@SQ SN:KI270334.1 LN:1368 -@SQ SN:KI270429.1 LN:1361 -@SQ SN:KI270393.1 LN:1308 -@SQ SN:KI270516.1 LN:1300 -@SQ SN:KI270389.1 LN:1298 -@SQ SN:KI270466.1 LN:1233 -@SQ SN:KI270388.1 LN:1216 -@SQ SN:KI270544.1 LN:1202 -@SQ SN:KI270310.1 LN:1201 -@SQ SN:KI270412.1 LN:1179 -@SQ SN:KI270395.1 LN:1143 -@SQ SN:KI270376.1 LN:1136 -@SQ SN:KI270337.1 LN:1121 -@SQ SN:KI270335.1 LN:1048 -@SQ SN:KI270378.1 LN:1048 -@SQ SN:KI270379.1 LN:1045 -@SQ SN:KI270329.1 LN:1040 -@SQ SN:KI270419.1 LN:1029 -@SQ SN:KI270336.1 LN:1026 -@SQ SN:KI270312.1 LN:998 -@SQ SN:KI270539.1 LN:993 -@SQ SN:KI270385.1 LN:990 -@SQ SN:KI270423.1 LN:981 -@SQ SN:KI270392.1 LN:971 -@SQ SN:KI270394.1 LN:970 -@PG ID:STAR PN:STAR VN:STAR_2.5.0a CL:STAR --runMode alignReads --runThreadN 23 --genomeDir hg38_long_polya/ --readFilesIn test_long_polya/test_long_polya_merged.fastq --limitOutSJcollapsed 2000000 --outFileNamePrefix test_long_polya/alignments/ --outSAMprimaryFlag AllBestScore --outFilterType BySJout --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 --alignIntronMax 1000000 --alignSJDBoverhangMin 8 -@PG ID:STAR-3D48ABDC PN:STAR VN:STAR_2.5.0a CL:STAR --runMode alignReads 
--runThreadN 23 --genomeDir hg38_long_polya/ --readFilesIn test_long_polya/test_long_polya_merged.fastq --limitOutSJcollapsed 2000000 --outFileNamePrefix test_long_polya/alignments/ --outSAMprimaryFlag AllBestScore --outFilterType BySJout --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 --alignIntronMax 1000000 --alignSJDBoverhangMin 8 -@CO user command line: STAR --alignIntronMax 1000000 --alignIntronMin 20 --genomeDir hg38_long_polya/ --limitOutSJcollapsed 2000000 --outFilterMultimapNmax 1 --readFilesIn test_long_polya/test_long_polya_merged.fastq --outFilterMismatchNoverLmax 0.04 --runThreadN 23 --alignSJDBoverhangMin 8 --runMode alignReads --outFileNamePrefix test_long_polya/alignments/ --outFilterType BySJout --outSAMprimaryFlag AllBestScore -@CO user command line: STAR --alignIntronMax 1000000 --alignIntronMin 20 --genomeDir hg38_long_polya/ --limitOutSJcollapsed 2000000 --outFilterMultimapNmax 1 --readFilesIn test_long_polya/test_long_polya_merged.fastq --outFilterMismatchNoverLmax 0.04 --runThreadN 23 --alignSJDBoverhangMin 8 --runMode alignReads --outFileNamePrefix test_long_polya/alignments/ --outFilterType BySJout --outSAMprimaryFlag AllBestScore -:AGGTTCCATTCTACACGCT:ACGTACAT:TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGGTTTTTTT;HISEQ:222:C7HL8ANXX:8:1110:8959:43102 16 19 281075 255 28S71M2S * 0 0 NGCCCCCGGTCCCCTCTTTTCCTCCCCCCCCCATATACATTACATTTTACAAAACAGCAACTATCTGATCTCTCGGTCCCTTCCTTAACCCCATAAAAAAG ############################BB/ 0 - - -def test_chromosome_19_comes_before_21(indices): - """chromosome 19 comes before 21 in the test file, this should be replicated in the output""" - assert max(indices[0]) < min(indices[1]) - - -# TAGGER TESTED IN INTEGRATION TESTS ONLY (see test_entrypoints.py) - -# TEST SPLIT - - -@pytest.fixture(scope="module", params=[data_dir + "test.sam", data_dir + "test.bam"]) -def bamfile(request): - return request.param - - -def 
test_split_bam_raises_value_error_when_passed_bam_without_barcodes(bamfile,): - split_size = ( - 0.02 # our test data is very small, 0.01mb = ~10kb, which should yield 5 files. - ) - with pytest.raises(RuntimeError): - bam.split( - [bamfile], - "test_output", - [consts.CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - ) - - -@pytest.fixture -def tagged_bam(): - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test_r2.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - ] - platform.TenXV2.attach_barcodes(args) - return "test_tagged_bam.bam" - - -def test_split_on_tagged_bam(tagged_bam): - split_size = 0.005 # our test data is very small, this value should yield 3 files - outputs = bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - ) - assert len(outputs) == 3 - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_split_with_large_chunk_size_generates_one_file(tagged_bam): - split_size = 1024 # our test data is very small, this value should yield 1 file - outputs = bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - ) - assert len(outputs) == 1 - - # the file should be full size - with pysam.AlignmentFile(outputs[0], "rb", check_sq=False) as f: - assert len([x for x in f]) == 100 - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_split_with_raise_missing_true_raises_warning_without_cr_barcode_passed( - tagged_bam, -): - split_size = 1024 # our test data is very small, this value should yield 1 file - with pytest.raises(RuntimeError): - bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY], - 
approx_mb_per_split=split_size, - raise_missing=True, - ) - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_split_succeeds_with_raise_missing_false_and_no_cr_barcode_passed(tagged_bam,): - split_size = 1024 # our test data is very small, this value should yield 1 file - outputs = bam.split( - [tagged_bam], - "test_output", - [consts.CELL_BARCODE_TAG_KEY], - approx_mb_per_split=split_size, - raise_missing=False, - ) - - assert len(outputs) == 1 - - # the file should be full size - with pysam.AlignmentFile(outputs[0], "rb", check_sq=False) as f: - assert ( - len([x for x in f]) == 1 - ) # only one of our barcodes is whitelisted or within 1 base - - # cleanup - os.remove(tagged_bam) # clean up - for f in glob.glob("test_output_*"): - os.remove(f) - - -def test_get_barcodes_from_bam(tagged_bam): - outputs = bam.get_barcodes_from_bam( - tagged_bam, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - raise_missing=True, - ) - assert len(outputs) == 99 - - -def test_get_barcodes_from_bam_with_raise_missing_true_raises_warning_without_cr_barcode_passed( - tagged_bam, -): - with pytest.raises(RuntimeError): - bam.get_barcodes_from_bam( - tagged_bam, [consts.CELL_BARCODE_TAG_KEY], raise_missing=True - ) - - -def test_write_barcodes_to_bins(tagged_bam): - barcodes = bam.get_barcodes_from_bam( - tagged_bam, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - raise_missing=True, - ) - - test_barcodes_to_bins = {} - for barcode in barcodes: - test_barcodes_to_bins[barcode] = 0 - - filenames = bam.write_barcodes_to_bins( - tagged_bam, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - test_barcodes_to_bins, - raise_missing=False, - ) - - assert len(filenames) == 1 - - # cleanup - for f in filenames: - shutil.rmtree(os.path.dirname(f)) - - -def test_get_barcode_for_alignment(tagged_bam): - with pysam.AlignmentFile(tagged_bam, "rb", check_sq=False) as 
input_alignments: - for alignment in input_alignments: - barcode = bam.get_barcode_for_alignment( - alignment, - [consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY], - raise_missing=False, - ) - assert barcode == "NTAAGAGTCTGCAAGT" - break - - -def test_get_barcode_for_alignment_raises_error_for_missing_tag(tagged_bam): - with pysam.AlignmentFile(tagged_bam, "rb", check_sq=False) as input_alignments: - for alignment in input_alignments: - with pytest.raises(RuntimeError): - bam.get_barcode_for_alignment(alignment, TAG_KEYS, raise_missing=True) - - -# TEST SORTING - - -def test_tag_sortable_records_compare_correctly(): - records = make_records_from_values(TAG_KEYS, SORTED_VALUES) - num_records = len(SORTED_VALUES) - for i in range(num_records): - for j in range(num_records): - if i < j: - assert records[i] < records[j] - elif i == j: - assert records[i] == records[j] - else: - assert records[i] > records[j] - - -def test_tag_sortable_records_raises_error_on_different_tag_lists(): - r1 = bam.TagSortableRecord(["FOO", "BAR"], ["A", "A"], "A") - r2 = bam.TagSortableRecord(["BAR", "BAZ"], ["A", "A"], "A") - with pytest.raises(ValueError): - r1 == r2 - - -def test_tag_sortable_records_str(): - record = bam.TagSortableRecord(TAG_KEYS, SORTED_VALUES[0][0], SORTED_VALUES[0][1]) - s = record.__str__() - assert "TagSortableRecord" in s - assert "['FOO', 'BAR', 'BAZ']" in s - - -def test_verify_sort_on_unsorted_records_raises_error(): - records = make_records_from_values(TAG_KEYS, UNSORTED_VALUES) - with pytest.raises(bam.SortError): - bam.verify_sort(records, TAG_KEYS) - - -def test_verify_sort_raises_no_error_on_sorted_records(): - records = make_records_from_values(TAG_KEYS, SORTED_VALUES) - bam.verify_sort(records, TAG_KEYS) - - -def test_sort_by_tags_and_queryname_sorts_correctly_from_file(): - tag_keys = ["UB", "CB", "GE"] - with pysam.AlignmentFile(data_dir + "unsorted.bam", "rb") as f: - records = f.fetch(until_eof=True) - sorted_records = 
bam.sort_by_tags_and_queryname(records, tag_keys) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(r, tag_keys) for r in sorted_records - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - -def test_sort_by_tags_and_queryname_sorts_correctly_from_file_no_tag_keys(): - tag_keys = [] - with pysam.AlignmentFile(data_dir + "unsorted.bam", "rb") as f: - records = f.fetch(until_eof=True) - sorted_records = bam.sort_by_tags_and_queryname(records, tag_keys) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(r, tag_keys) for r in sorted_records - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - -def test_tag_sortable_records_sort_correctly(): - tag_keys = TAG_KEYS - records = make_records_from_values(tag_keys, deepcopy(UNSORTED_VALUES)) - sorted_records = sorted(records) - bam.verify_sort(sorted_records, tag_keys) - - -def test_tag_sortable_records_sort_correctly_when_already_sorted(): - # This is to a bit paranoid, but just make sure sorted stays correct if already sorted - tag_keys = TAG_KEYS - records = make_records_from_values(tag_keys, deepcopy(SORTED_VALUES)) - sorted_records = sorted(records) - bam.verify_sort(sorted_records, tag_keys) - - -def test_sort_by_tags_and_queryname_sorts_correctly_no_tag_keys(): - tag_keys = [] - records = make_records_from_values(tag_keys, deepcopy(UNSORTED_VALUES)) - sorted_records = sorted(records) - bam.verify_sort(sorted_records, tag_keys) - - -def test_tag_sortable_record_missing_tag_value_is_empty_string(): - tags = ["_NOT_REAL_TAG_"] - with pysam.AlignmentFile(data_dir + "unsorted.bam", "rb") as f: - records = f.fetch(until_eof=True) - first_record = next(iter(records)) - sortable_record = bam.TagSortableRecord.from_aligned_segment(first_record, tags) - assert sortable_record.tag_values[0] == "" - - -def test_tag_sortable_record_lt_is_false_for_equal_records(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = 
bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert not r1 < r2 - - -def test_tag_sortable_record_lt_is_true_for_smaller_query_name(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="B" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_is_true_for_smaller_tag(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "B"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "B", "A"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["B", "A", "A"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_is_true_for_smaller_tag_regardless_of_query_name(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="B" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "B"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_empty_query_name_is_smaller(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_lt_empty_tag_is_smaller(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", ""], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - 
tag_keys=TAG_KEYS, tag_values=["A", "", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 < r2 - - -def test_tag_sortable_record_eq_is_true_for_identical_records(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - assert r1 == r2 - - -def test_tag_sortable_record_eq_is_false_when_any_difference_exists(): - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="B" - ) - assert not r1 == r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "B"], query_name="A" - ) - assert not r1 == r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "B", "A"], query_name="A" - ) - assert not r1 == r2 - r1 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["A", "A", "A"], query_name="A" - ) - r2 = bam.TagSortableRecord( - tag_keys=TAG_KEYS, tag_values=["B", "A", "A"], query_name="A" - ) - assert not r1 == r2 - - -def make_records_from_values(tag_keys, tags_and_query_name): - records = [] - for i in range(len(tags_and_query_name)): - r = bam.TagSortableRecord( - tag_keys=tag_keys, - tag_values=tags_and_query_name[i][0], - query_name=tags_and_query_name[i][1], - ) - records.append(r) - return records diff --git a/tools/scripts/sctools/src/sctools/test/test_barcode.py 
b/tools/scripts/sctools/src/sctools/test/test_barcode.py deleted file mode 100644 index f9fa39a5..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_barcode.py +++ /dev/null @@ -1,161 +0,0 @@ -import os - -import numpy as np -import pysam -import pytest - -from .. import barcode, encodings, platform, consts - -data_dir = os.path.split(__file__)[0] + "/data/" - - -# TEST BARCODES - - -@pytest.fixture -def barcode_set(): - return barcode.Barcodes.from_whitelist( - data_dir + "1k-august-2016.txt", barcode_length=16 - ) - - -@pytest.fixture(scope="module", params=["r", "rb"]) -def short_barcode_set_from_iterable(request): - with open(data_dir + "1k-august-2016.txt", request.param) as f: - barcodes = [l.strip() for l in f.readlines()[:50]] - if isinstance(barcodes[0], bytes): - return barcode.Barcodes.from_iterable_bytes(barcodes, barcode_length=16) - else: - return barcode.Barcodes.from_iterable_strings(barcodes, barcode_length=16) - - -@pytest.fixture(scope="module") -def short_barcode_set_from_encoded(): - return barcode.Barcodes.from_iterable_encoded( - [0, 1, 2, 3, 4, 5, 6, 7], barcode_length=2 - ) - - -def test_iterable_produces_correct_barcodes(short_barcode_set_from_encoded): - tbe = encodings.TwoBit(2) - decoded = [tbe.decode(b) for b in short_barcode_set_from_encoded] - print(decoded) - assert decoded == [b"AA", b"AC", b"AT", b"AG", b"CA", b"CC", b"CT", b"CG"] - - -def test_reads_barcodes_from_file(barcode_set): - assert len(barcode_set) == 1001 # number of barcodes in file. 
- - -def test_base_frequency_sums_are_all_equal_to_barcode_set_length(barcode_set): - bf = barcode_set.base_frequency() - assert isinstance(bf, np.ndarray) - assert np.array_equal(bf.sum(axis=1), np.ones(16) * len(barcode_set)) - - -def test_barcode_diversity_is_in_range(barcode_set): - bd = barcode_set.effective_diversity() - assert np.all(bd >= 0) - assert np.all(bd <= 1) - - -def test_summarize_hamming_distances_gives_reasonable_results( - short_barcode_set_from_iterable, -): - - hamming_summary = short_barcode_set_from_iterable.summarize_hamming_distances() - - # we know 10x barcodes have at least this much distance - assert hamming_summary["minimum"] >= 2 - # no barcode can have more hamming distance than length - assert all(v <= 16 for v in hamming_summary.values()) - - -# TEST HashErrorsToCorrectBarcodes - - -@pytest.fixture(scope="module") -def trivial_whitelist(): - barcode_iterable = ["A" * 8] - error_mapping = barcode.ErrorsToCorrectBarcodesMap._prepare_single_base_error_hash_table( - barcode_iterable - ) - return barcode.ErrorsToCorrectBarcodesMap(error_mapping) - - -@pytest.fixture(scope="module") -def truncated_whitelist_from_10x(): - # note that this whitelist contains 1 non-10x barcode to ensure the presence of a matching - # target in the test data. 
- error_mapping = barcode.ErrorsToCorrectBarcodesMap.single_hamming_errors_from_whitelist( - data_dir + "1k-august-2016.txt" - ) - return error_mapping - - -def test_incorrect_input_raises_errors(trivial_whitelist): - with pytest.raises(TypeError): - barcode.ErrorsToCorrectBarcodesMap("not_a_mapping") - with pytest.raises(TypeError): - barcode.ErrorsToCorrectBarcodesMap({"not_a_mapping"}) - with pytest.raises(TypeError): - barcode.ErrorsToCorrectBarcodesMap(["not_a_mapping", "sldkf"]) - assert isinstance(trivial_whitelist, barcode.ErrorsToCorrectBarcodesMap) - - -def test_correct_barcode_finds_and_corrects_1_base_errors(trivial_whitelist): - assert trivial_whitelist.get_corrected_barcode("TAAAAAAA") == "AAAAAAAA" - assert trivial_whitelist.get_corrected_barcode("AAAACAAA") == "AAAAAAAA" - assert trivial_whitelist.get_corrected_barcode("AAAGAAAA") == "AAAAAAAA" - assert trivial_whitelist.get_corrected_barcode("AAAAAAAA") == "AAAAAAAA" - - -def test_correct_barcode_raises_keyerror_when_barcode_not_correct_length( - trivial_whitelist, -): - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAA") - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAAAAAAAA") - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAAAAAAAAA") - - -def test_correct_barcode_raises_keyerror_when_barcode_has_more_than_one_error( - trivial_whitelist, -): - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("AAAAAATT") - with pytest.raises(KeyError): - trivial_whitelist.get_corrected_barcode("TTAAAAAA") - - -@pytest.fixture(scope="module") -def tagged_bamfile(): - outbam = data_dir + "bam_with_tags_test.bam" - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - outbam, - ] - platform.TenXV2.attach_barcodes(args) - return outbam - - -def test_correct_bam_produces_cb_tags(tagged_bamfile, 
truncated_whitelist_from_10x): - outbam = data_dir + "bam_with_cb_tags.bam" - truncated_whitelist_from_10x.correct_bam(tagged_bamfile, outbam) - success = False - with pysam.AlignmentFile(outbam, "rb") as f: - for record in f: - try: - success = record.get_tag(consts.CELL_BARCODE_TAG_KEY) - except KeyError: - continue - assert success - os.remove(outbam) diff --git a/tools/scripts/sctools/src/sctools/test/test_count.py b/tools/scripts/sctools/src/sctools/test/test_count.py deleted file mode 100644 index 81ec1514..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_count.py +++ /dev/null @@ -1,1348 +0,0 @@ -""" -Testing for Count Matrix Construction -===================================== - -The test generates (1) a random count matrix, and (2) corresponding alignment records, and writes them to disk -(a BAM file, count matrix, row and column indices). The alignment records are expected to produce the same count -matrix according to the counting algorithm implemented in `sctools:bam.from_sorted_tagged_bam`. Gene names are -fetched from an annotations GTF file that is a subset of GENCODE annotations (see `_test_annotation_file` below). - -Notes ------ - -- The agreement between the synthetic count matrix and the synthetic BAM file is contingent on the - agreement between the counting algorithm implemented in `sctools:bam.from_sorted_tagged_bam` and - the test data generator (see SyntheticTaggedBAMGenerator below). Therefore, future changes in the - counting algorithm must be accompanied by a corresponding change in the test data generation class. - Otherwise, the tests will fail. - -- We have adopt a minimal test suite design strategy, in the sense that the synthetic test data is only complete - to the degree that is required by `sctools:bam.from_sorted_tagged_bam`. 
As such, the synthetic BAM file lacks - the following features: - - * flag, - * query_sequence, - * query_quality, - * CIGAR string, - * cell barcode quality tag, - * molecule barcode quality tag, - * raw cell and molecule barcodes, - - At the time of writing, the counting algorithm **only** relies on the BAM tags. - -- SyntheticTaggedBAMGenerator generates four types of alignment records: - - * necessary alignments -- these records contain one unique cell/molecule/gene tag for each cell/gene count - unit, according to the randomly generated count matrix. Necessary alignments are also sufficient - in the sense that they are expected to reproduce the count matrix in the absence of any other alignment - record. - - * redundant alignments -- these records are expected to be ignored by the counting algorithm and have three - subtypes: - - - duplicate alignments -- these are randomly picked from necessary alignments, though, they are given a new - query name (to mimic PCR and optical duplicates). - - - incomplete alignments -- these records miss at least one necessary tag, e.g. cell barcode, molecule - barcode, or gene name. - - - multi-gene alignments -- these records have the same tags and query_name, though, at least two such - records per query_name exist that point to different genes. 
-""" - -import operator -import os -import tempfile -from typing import Callable, Optional, List, Set, Tuple, Dict, Generator - -import numpy as np -import scipy.sparse as sp -import pysam -import pytest - -from sctools import gtf, bam, consts -from sctools.count import CountMatrix - -# set the input and output directories -_test_data_dir = os.path.join(os.path.split(__file__)[0], "data") -_test_annotation_file = os.path.join(_test_data_dir, "chr1.30k_records.gtf.gz") - -# constants -_test_num_cells = 50 -_test_max_genes = 20 -_test_gene_expression_rate = 5.0 -_test_num_duplicates = 20 -_test_num_missing_some_tags = 20 -_test_num_multiple_gene_alignments = 20 -_test_max_gene_hits_per_multiple_gene_alignments = 5 - -_test_num_only_exons = 10 -_test_num_only_introns = 10 -_test_both_exons_introns = 10 - - -@pytest.fixture(scope="module") -def gene_name_to_index() -> Dict[str, int]: - return gtf.extract_gene_names(_test_annotation_file) - - -class AlignmentRecordTags: - """Represents the bundle of cell barcode, molecule barcode, and gene name.""" - - def __init__( - self, - cell_barcode: Optional[str], - molecule_barcode: Optional[str], - gene_name: Optional[str], - alignment_location: Optional[str] = "EXONIC", - ) -> None: - self.cell_barcode = cell_barcode - self.molecule_barcode = molecule_barcode - self.gene_name = gene_name - self.alignment_location = alignment_location - - def __hash__(self): - return hash((self.cell_barcode, self.molecule_barcode, self.gene_name)) - - def __repr__(self): - return ( - f"{consts.CELL_BARCODE_TAG_KEY}: {self.cell_barcode}, " - f"{consts.MOLECULE_BARCODE_TAG_KEY}: {self.molecule_barcode}, " - f"{consts.GENE_NAME_TAG_KEY}: {self.gene_name}", - f"{consts.ALIGNMENT_LOCATION_TAG_KEY}: {self.alignment_location}", - ) - - -class CellMoleculeGeneQueryNameSortOrder(bam.AlignmentSortOrder): - """Hierarchical alignment record sort order (cell barcode >= molecule barcode >= gene name >= query name).""" - - def __init__( - self, - 
cell_barcode_tag_key: str = consts.CELL_BARCODE_TAG_KEY, - molecule_barcode_tag_key: str = consts.MOLECULE_BARCODE_TAG_KEY, - gene_name_tag_key: str = consts.GENE_NAME_TAG_KEY, - ) -> None: - assert cell_barcode_tag_key, "Cell barcode tag key can not be None" - assert molecule_barcode_tag_key, "Molecule barcode tag key can not be None" - assert gene_name_tag_key, "Gene name tag key can not be None" - self.cell_barcode_tag_key = cell_barcode_tag_key - self.molecule_barcode_tag_key = molecule_barcode_tag_key - self.gene_name_tag_key = gene_name_tag_key - - def _get_sort_key( - self, alignment: pysam.AlignedSegment - ) -> Tuple[str, str, str, str]: - return ( - bam.get_tag_or_default(alignment, self.cell_barcode_tag_key, default="N"), - bam.get_tag_or_default( - alignment, self.molecule_barcode_tag_key, default="N" - ), - bam.get_tag_or_default(alignment, self.gene_name_tag_key, default="N"), - alignment.query_name, - ) - - @property - def key_generator( - self, - ) -> Callable[[pysam.AlignedSegment], Tuple[str, str, str, str]]: - return self._get_sort_key - - def __repr__(self) -> str: - return "hierarchical__cell_molecule_gene_query_name" - - -class SyntheticTaggedBAMGenerator: - """This class generates a synthetic count matrix and an accompanying synthetic tagged BAM file as - described in the preamble documentation block. 
- - Parameters - ---------- - num_cells : int - number of real cells - max-genes : int - maximum number of genes to use to generate synthetic counts - gene_name_to_index : dict - a map from gene name to their count matrix index - gene_expression_rate : float - poisson rate at which each gene is expressed - rng_seed : int - random number generator seed - - Methods - ------- - generate_synthetic_bam_and_counts_matrix - generates synthetic test data and writes the output to disk - - See Also - -------- - count.from_sorted_tagged_bam - """ - - OUTPUT_PREFIX = "synthetic_" - SYNTHETIC_SEQUENCE_NAME = "SYNTHETIC_SEQUENCE" - SYNTHETIC_SEQUENCE_LENGTH = 100 - NECESSARY_QUERY_NAME_PREFIX = "NECESSARY_QUERY_" - DUPLICATE_QUERY_NAME_PREFIX = "DUPLICATE_QUERY_" - INCOMPLETE_QUERY_NAME_PREFIX = "INCOMPLETE_QUERY_" - MULTI_GENE_QUERY_NAME_PREFIX = "MULTI_GENE_QUERY_" - - bam_output_filename = OUTPUT_PREFIX + "records.bam" - count_matrix_output_filename = OUTPUT_PREFIX + "count_matrix.npy" - row_index_output_filename = OUTPUT_PREFIX + "_row_index.npy" - col_index_output_filename = OUTPUT_PREFIX + "_col_index.npy" - - def __init__( - self, - num_cells: int, - max_genes: int, - gene_name_to_index: Dict[str, int], - gene_expression_rate: float, - rng_seed: int = 777, - ) -> None: - self.num_cells = num_cells - self.gene_expression_rate = gene_expression_rate - - # initialize the random number generator - self.rng: np.random.RandomState = np.random.RandomState(seed=rng_seed) - - # generate gene names - self.all_gene_names = [ - k for k, v in sorted(gene_name_to_index.items(), key=operator.itemgetter(1)) - ] - self.num_genes = len(self.all_gene_names) - - self.max_genes = max_genes - assert ( - max_genes <= self.num_genes - ), f"Max genes ({self.max_genes}) must be <= to all annotated genes ({self.num_genes})" - self.to_be_used_gene_indices: List[int] = self.rng.choice( - np.arange(0, self.num_genes, dtype=np.int), - size=self.max_genes, - replace=False, - ).tolist() - 
self.to_be_used_gene_names = [ - self.all_gene_names[j] for j in self.to_be_used_gene_indices - ] - - def generate_synthetic_bam_and_counts_matrix( - self, - output_path: str, - num_duplicates: int, - num_missing_some_tags: int, - num_multiple_gene_alignments: int, - max_gene_hits_per_multiple_gene_alignments: int, - alignment_sort_order: bam.AlignmentSortOrder = CellMoleculeGeneQueryNameSortOrder(), - ): - """Generates synthetic count matrix and BAM file and writes them to disk. - - Parameters - ---------- - output_path : str - output path - num_duplicates : int - number of duplicate records - num_missing_some_tags : int - number of records that miss at least one crucial tag - num_multiple_gene_alignments : int - number of records that have at least two different gene tags - max_gene_hits_per_multiple_gene_alignments : int - maximum number of unique gene names to use for multiple-gene records - alignment_sort_order : bam.AlignmentSortOrder - sort order of BAM alignment records; if 'None', random sort order is implied - - Returns - ------- - None - """ - assert 2 <= max_gene_hits_per_multiple_gene_alignments <= self.max_genes, ( - f"The parameter `max_gene_hits_per_multiple_gene_alignments` must >= 2 and < maximum annotated " - f"genes ({self.max_genes})" - ) - assert num_duplicates >= 0, "Number of duplicate queries must be non-negative" - assert ( - num_missing_some_tags >= 0 - ), "Number of queries with missing tags must be non-negative" - assert ( - num_multiple_gene_alignments >= 0 - ), "Number of queries with multiple gene alignments must be non-negative" - - # generate synthetic count matrix and corresponding simulated records - synthetic_data_bundle = self._generate_synthetic_counts_and_alignment_tags( - num_duplicates, - num_missing_some_tags, - num_multiple_gene_alignments, - max_gene_hits_per_multiple_gene_alignments, - ) - records = list( - SyntheticTaggedBAMGenerator._get_bam_records_generator( - synthetic_data_bundle - ) - ) - - if not 
alignment_sort_order: # random - # shuffle records - self.rng.shuffle(records) - - else: - records = sorted(records, key=alignment_sort_order.key_generator) - - # write BAM file - with pysam.AlignmentFile( - os.path.join(output_path, self.bam_output_filename), - mode="wb", - reference_names=[self.SYNTHETIC_SEQUENCE_NAME], - reference_lengths=[self.SYNTHETIC_SEQUENCE_LENGTH], - ) as bo: - for record in records: - bo.write(record) - - # write count matrix, row index, and col index - np.save( - os.path.join(output_path, self.count_matrix_output_filename), - synthetic_data_bundle.count_matrix, - ) - np.save( - os.path.join(output_path, self.row_index_output_filename), - synthetic_data_bundle.row_index, - ) - np.save( - os.path.join(output_path, self.col_index_output_filename), - synthetic_data_bundle.col_index, - ) - - def _generate_synthetic_counts_and_alignment_tags( - self, - num_duplicates: int, - num_missing_some_tags: int, - num_multiple_gene_alignments: int, - max_gene_hits_per_multiple_gene_alignments: int, - ) -> "SyntheticDataBundle": - - # generate count matrix - count_matrix: np.ndarray = self._generate_random_count_matrix() - - # generate necessary alignment tags that produce count_matrix - ( - necessary_alignment_record_tags_set, - row_index, - col_index, - ) = self._generate_necessary_alignment_record_bundle(count_matrix) - necessary_alignment_record_tags_list = list(necessary_alignment_record_tags_set) - - # sanity check -- we require as many necessary alignment records as the total counts - assert len(necessary_alignment_record_tags_set) == np.sum(count_matrix), ( - "There is an inconsistency between synthetic counts and necessary tags: we require as " - "many necessary alignment tags as the total counts" - ) - - # add duplicate records - duplicate_alignment_tags_list = self._generate_duplicate_alignment_tags( - num_duplicates, necessary_alignment_record_tags_list - ) - - # add records with missing tags - incomplete_alignment_tags_list: List[ - 
AlignmentRecordTags - ] = self._generate_incomplete_alignment_tags(num_missing_some_tags) - - # add records with multiple gene alignments - multiple_alignment_tags_list: List[ - List[AlignmentRecordTags] - ] = self._generate_multiple_gene_alignment_tags( - num_multiple_gene_alignments, - max_gene_hits_per_multiple_gene_alignments, - necessary_alignment_record_tags_set, - ) - - return SyntheticDataBundle( - count_matrix, - row_index, - col_index, - necessary_alignment_record_tags_list, - duplicate_alignment_tags_list, - incomplete_alignment_tags_list, - multiple_alignment_tags_list, - ) - - def _generate_random_count_matrix(self) -> np.ndarray: - """Generates a random count matrix. - - This method selects `self.max_genes` out of all all genes (`self.num_genes`) and populates the selected genes - with Poisson counts with rate `self.gene_expression_rate`. The count matrix entries corresponding to the - rest of the genes are set to zero. - - Returns - ------- - np.ndarray - an ndarray of shape (`self.num_cells`, `self.num_genes`) - """ - non_zero_count_matrix = self.rng.poisson( - lam=self.gene_expression_rate, size=(self.num_cells, self.max_genes) - ) - count_matrix = np.zeros((self.num_cells, self.num_genes), dtype=np.int) - for i, i_gene in enumerate(self.to_be_used_gene_indices): - count_matrix[:, i_gene] = non_zero_count_matrix[:, i] - return count_matrix - - @staticmethod - def _get_bam_records_generator( - synthetic_data_bundle: "SyntheticDataBundle", rng_seed: int = 777 - ) -> Generator[pysam.AlignedSegment, None, None]: - """Returns a generator of pysam.AlignedSegment instances created from the alignment tags - provided to the initializer. - - Parameters - ---------- - synthetic_data_bundle : SyntheticDataBundle - a bundle of synthetic alignment tags - rng_seed : int - random number generator seed; it is used for generating random reference_start position. 
- - See Also - -------- - - The preamble documentation block for a description of the meaning of different alignment records - (necessary, duplicate, incomplete, etc.) - - SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags - """ - rng = np.random.RandomState(rng_seed) - - num_queries = synthetic_data_bundle.num_queries - i_query = 0 - - # necessary, duplicate, and incomplete alignments - for alignment_tags_list, query_name_prefix in zip( - [ - synthetic_data_bundle.necessary_alignment_record_tags_list, - synthetic_data_bundle.duplicate_alignment_tags_list, - synthetic_data_bundle.incomplete_alignment_tags_list, - ], - [ - SyntheticTaggedBAMGenerator.NECESSARY_QUERY_NAME_PREFIX, - SyntheticTaggedBAMGenerator.DUPLICATE_QUERY_NAME_PREFIX, - SyntheticTaggedBAMGenerator.INCOMPLETE_QUERY_NAME_PREFIX, - ], - ): - for alignment_tags in alignment_tags_list: - yield SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tags, query_name_prefix, i_query, num_queries, rng - ) - i_query += 1 - - # multi-gene alignments - for alignment_tags_list in synthetic_data_bundle.multiple_alignment_tags_list: - # multiple alignments have the same query name (by definition) - for alignment_tags in alignment_tags_list: - yield SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tags, - SyntheticTaggedBAMGenerator.MULTI_GENE_QUERY_NAME_PREFIX, - i_query, - num_queries, - rng, - ) - i_query += 1 - - @staticmethod - def _generate_aligned_segment_from_tags( - alignment_tags: AlignmentRecordTags, - query_prefix: str, - i_query: int, - num_queries: int, - rng: np.random.RandomState, - record_reference_id: Optional[int] = 0, - reference_start: Optional[int] = -1, - ) -> pysam.AlignedSegment: - """Generates pysam.AlignedSegment instances from alignment_tags. 
- - Parameters - ---------- - alignment_tags : AlignmentRecordTags - tags to attach to the instantiated pysam.AlignedSegment - query_prefix : str - prefix to use for query name - i_query : int - query index - num_queries: int - maximum number of queries (only used for pretty-printing the query index) - rng: np.random.RandomState - a random number generator - - Notes - ----- - The query_sequence and query_quality are both empty as these query features are not used for generating - the counts matrix. Likewise, the flag is currently unset. In the future, once we add a filtering - policy based on BAM record flags (such as duplicates), this method must be updated accordingly. - - Returns - ------- - pysam.AlignedSegment - an instance of pysam.AlignedSegment - - """ - tags = [] - if alignment_tags.cell_barcode: - tags.append((consts.CELL_BARCODE_TAG_KEY, alignment_tags.cell_barcode, "Z")) - if alignment_tags.molecule_barcode: - tags.append( - (consts.MOLECULE_BARCODE_TAG_KEY, alignment_tags.molecule_barcode, "Z") - ) - if alignment_tags.gene_name: - tags.append((consts.GENE_NAME_TAG_KEY, alignment_tags.gene_name, "Z")) - - if alignment_tags.alignment_location: - tags.append( - ( - consts.ALIGNMENT_LOCATION_TAG_KEY, - alignment_tags.alignment_location, - "Z", - ) - ) - - record = pysam.AlignedSegment() - record.query_name = SyntheticTaggedBAMGenerator._generate_query_name( - query_prefix, i_query, num_queries - ) - - if reference_start == -1: - record.reference_start = rng.randint( - low=0, high=SyntheticTaggedBAMGenerator.SYNTHETIC_SEQUENCE_LENGTH - ) - else: - record.reference_start = reference_start - - record.reference_id = ( - record_reference_id # note: we only use one synthetic sequence - ) - if len(tags) > 0: - record.set_tags(tags) - return record - - @staticmethod - def _generate_query_name(query_prefix: str, i_query: int, num_queries: int) -> str: - """Returns query name string from query index. 
We zero-pad the string representation of query - indices merely for pretty-printing, e.g. 0000, 0001, ..., 9999.""" - num_digits = len(str(num_queries - 1)) - return query_prefix + str(i_query).zfill(num_digits) - - def _generate_necessary_alignment_record_bundle( - self, count_matrix: np.ndarray - ) -> Tuple[Set[AlignmentRecordTags], List[str], List[str]]: - alignments: Set[AlignmentRecordTags] = set() - used_cell_barcodes: Set[str] = set() - - row_index: List[str] = [] - col_index = self.all_gene_names - - for i_cell in range(self.num_cells): - # generate a unique cell barcode - while True: - cell_barcode = self._generate_random_cell_barcode() - if cell_barcode not in used_cell_barcodes: - break - row_index.append(cell_barcode) - - for i_gene in self.to_be_used_gene_indices: - for i_molecule in range(count_matrix[i_cell, i_gene]): - # generate a unique alignment tag - unique_alignment_tag = self._generate_unique_random_alignment_tag( - alignments, - gene_name=self.all_gene_names[i_gene], - cell_barcode=cell_barcode, - ) - alignments.add(unique_alignment_tag) - - return alignments, row_index, col_index - - def _generate_unique_random_alignment_tag( - self, - existing_alignment_tags: Set[AlignmentRecordTags], - gene_name: str, - cell_barcode: Optional[str] = None, - molecule_barcode: Optional[str] = None, - ) -> AlignmentRecordTags: - assert ( - gene_name in self.to_be_used_gene_names - ), f"{gene_name} is not an allowed gene for generating synthetic data" - - while True: - alignment = AlignmentRecordTags( - cell_barcode=cell_barcode - if cell_barcode - else self._generate_random_cell_barcode(), - molecule_barcode=molecule_barcode - if molecule_barcode - else self._generate_random_molecule_barcode(), - gene_name=gene_name, - ) - if alignment not in existing_alignment_tags: - return alignment - - def _generate_duplicate_alignment_tags( - self, num_duplicates: int, necessary_alignments_list: List[AlignmentRecordTags] - ) -> List[AlignmentRecordTags]: - return 
self.rng.choice(necessary_alignments_list, size=num_duplicates).tolist() - - def _generate_incomplete_alignment_tags( - self, num_missing_some_tags: int - ) -> List[AlignmentRecordTags]: - """Generates alignments with missing crucial tags. - - Notes - ----- - This method requires each combination of missing tags to occur at least once and may therefore return lists - that are longer than `num_missing_some_tags`. - """ - incomplete_alignment_tags_list: List[AlignmentRecordTags] = list() - tag_mask_occurrences: Set[int] = set() - i_entries = 0 - while i_entries < num_missing_some_tags or len(tag_mask_occurrences) < 7: - tag_mask = self.rng.randint(low=0, high=7) - tag_mask_occurrences.add(tag_mask) - gene_name = self.rng.choice(self.to_be_used_gene_names) - alignment = self._generate_unique_random_alignment_tag(set(), gene_name) - if not tag_mask & 1: - alignment.cell_barcode = None - if not tag_mask & 2: - alignment.molecule_barcode = None - if not tag_mask & 4: - alignment.gene_name = None - incomplete_alignment_tags_list.append(alignment) - i_entries += 1 - return incomplete_alignment_tags_list - - def _generate_multiple_gene_alignment_tags( - self, - num_multiple_gene_alignments: int, - max_gene_hits_per_multiple_gene_alignments: int, - necessary_alignment_record_tags_set: Set[AlignmentRecordTags], - ) -> List[List[AlignmentRecordTags]]: - - necessary_alignment_record_tags_list = list(necessary_alignment_record_tags_set) - - multiple_gene_alignment_tags_list: List[List[AlignmentRecordTags]] = list() - for _ in range(num_multiple_gene_alignments): - random_necessary_alignment = self.rng.choice( - necessary_alignment_record_tags_list - ) - random_necessary_cell_barcode: str = random_necessary_alignment.cell_barcode - novel_molecule_barcode: str = self._generate_unique_random_alignment_tag( - necessary_alignment_record_tags_set, - gene_name=random_necessary_alignment.gene_name, - cell_barcode=random_necessary_cell_barcode, - ).molecule_barcode - num_gene_hits = 
self.rng.randint( - low=2, high=max_gene_hits_per_multiple_gene_alignments + 1 - ) - gene_name_hits = self.rng.choice( - self.to_be_used_gene_names, replace=False, size=num_gene_hits - ) - multiple_gene_alignment_tags_list.append( - [ - AlignmentRecordTags( - random_necessary_cell_barcode, novel_molecule_barcode, gene_name - ) - for gene_name in gene_name_hits - ] - ) - return multiple_gene_alignment_tags_list - - def _generate_random_cell_barcode(self, length: int = 16): - return self._generate_random_genomic_sequences(length) - - def _generate_random_molecule_barcode(self, length: int = 10): - return self._generate_random_genomic_sequences(length) - - def _generate_random_genomic_sequences(self, length: int): - return "".join(self.rng.choice(["A", "C", "T", "G"], size=length)) - - -class SyntheticDataBundle: - """A container for synthetic count matrix, row and column indices, and alignment tags. - - Parameters - ---------- - count_matrix : np.ndarray - the cell x gene synthetic count matrix - row_index : List[str] - list of cell barcodes - col_index : List[str] - list of gene names - necessary_alignment_record_tags_list : List[AlignmentRecordTags] - list of necessary alignment tags; alignment records made using these tags are expected to produce - `count_matrix` once processed by the counting algorithm. - duplicate_alignment_tags_list : List[AlignmentRecordTags] - list of duplicate alignment tags (a subset of `necessary_alignment_record_tags_list`) - incomplete_alignment_tags_list : List[AlignmentRecordTags] - list of incomplete alignment tags (miss at least one of the required tags: cell, molecule, gene) - multiple_alignment_tags_list : List[List[AlignmentRecordTags]] - list of lists of multiple alignment tags; each list element is a list of alignment tags with the - same molecular barcodes, though, with multiple gene names. 
- - See Also - -------- - SyntheticBarcodedBAMGenerator - """ - - def __init__( - self, - count_matrix: np.ndarray, - row_index: List[str], - col_index: List[str], - necessary_alignment_record_tags_list: List[AlignmentRecordTags], - duplicate_alignment_tags_list: List[AlignmentRecordTags], - incomplete_alignment_tags_list: List[AlignmentRecordTags], - multiple_alignment_tags_list: List[List[AlignmentRecordTags]], - ) -> None: - - assert count_matrix.shape == ( - len(row_index), - len(col_index), - ), "The shape of the count matrix is inconsistent with the provided row/column indices" - - self.count_matrix = count_matrix - self.row_index = row_index - self.col_index = col_index - - self.necessary_alignment_record_tags_list = necessary_alignment_record_tags_list - self.duplicate_alignment_tags_list = duplicate_alignment_tags_list - self.incomplete_alignment_tags_list = incomplete_alignment_tags_list - self.multiple_alignment_tags_list = multiple_alignment_tags_list - - self.num_queries = ( - len(necessary_alignment_record_tags_list) - + len(duplicate_alignment_tags_list) - + len(incomplete_alignment_tags_list) - + len(multiple_alignment_tags_list) - ) - - -def _get_sorted_count_matrix( - count_matrix: np.ndarray, row_index: np.ndarray, col_index: np.ndarray -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Sorted the rows and columns of `count_matrix` and the associated row/column indices. - - Parameters - ---------- - count_matrix : np.ndarray - a cell x gene count matrix - row_index : np.ndarray - row index of the count matrix (i.e. cell barcodes) - col_index : np.ndarray - column index of the count matrix (i.e. 
gene names) - - Returns - ------- - Tuple[np.ndarray, np.ndarray, np.ndarray] - row/column sorted count matrix, sorted row index, sorted column index - """ - sorted_row_indices = [ - idx for idx, _ in sorted(enumerate(row_index), key=operator.itemgetter(1)) - ] - sorted_col_indices = [ - idx for idx, _ in sorted(enumerate(col_index), key=operator.itemgetter(1)) - ] - return ( - count_matrix[sorted_row_indices, :][:, sorted_col_indices], - row_index[sorted_row_indices], - col_index[sorted_col_indices], - ) - - -@pytest.mark.parametrize( - "alignment_sort_order", - [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()], - ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"], -) -def test_count_matrix_from_bam( - alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index -): - # instantiate a test data generator - synthetic_data_generator = SyntheticTaggedBAMGenerator( - _test_num_cells, _test_max_genes, gene_name_to_index, _test_gene_expression_rate - ) - - _test_temp_dir = tempfile.TemporaryDirectory() - try: - # generate test data - synthetic_data_generator.generate_synthetic_bam_and_counts_matrix( - _test_temp_dir.name, - _test_num_duplicates, - _test_num_missing_some_tags, - _test_num_multiple_gene_alignments, - _test_max_gene_hits_per_multiple_gene_alignments, - alignment_sort_order=alignment_sort_order, - ) - - # test data paths - test_bam_path = os.path.join( - _test_temp_dir.name, SyntheticTaggedBAMGenerator.bam_output_filename - ) - test_count_matrix_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedBAMGenerator.count_matrix_output_filename, - ) - test_row_index_path = os.path.join( - _test_temp_dir.name, SyntheticTaggedBAMGenerator.row_index_output_filename - ) - test_col_index_path = os.path.join( - _test_temp_dir.name, SyntheticTaggedBAMGenerator.col_index_output_filename - ) - - # create CountMatrix from the synthetic bam - count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( - 
test_bam_path, gene_name_to_index - ) - - # load the test counts matrix - count_matrix_data_expected = np.load(test_count_matrix_path) - row_index_expected = np.load(test_row_index_path) - col_index_expected = np.load(test_col_index_path) - - finally: - _test_temp_dir.cleanup() - - count_matrix_data_from_bam = count_matrix_from_bam.matrix.todense() - row_index_from_bam = count_matrix_from_bam.row_index - col_index_from_bam = count_matrix_from_bam.col_index - - # sort expected and from_bam results by their respective row and column indices, since their sort order - # is not part of the design specs and is considered arbitrary - ( - sorted_count_matrix_data_from_bam, - sorted_row_index_from_bam, - sorted_col_index_from_bam, - ) = _get_sorted_count_matrix( - count_matrix_data_from_bam, row_index_from_bam, col_index_from_bam - ) - ( - sorted_count_matrix_data_expected, - sorted_row_index_expected, - sorted_col_index_expected, - ) = _get_sorted_count_matrix( - count_matrix_data_expected, row_index_expected, col_index_expected - ) - - # assert equality of sorted count matrices and sorted row/col indices - assert np.allclose( - sorted_count_matrix_data_from_bam, sorted_count_matrix_data_expected - ) - assert all( - [ - row_name_from_bam == row_name_expected - for row_name_from_bam, row_name_expected in zip( - sorted_row_index_from_bam, sorted_row_index_expected - ) - ] - ) - assert all( - [ - col_name_from_bam == col_name_expected - for col_name_from_bam, col_name_expected in zip( - sorted_col_index_from_bam, sorted_col_index_expected - ) - ] - ) - - -def extract_gene_non_exons( - chromosome_gene_exons: Dict[str, List[tuple]], - chromosome_gene_locations_extended: Dict[str, List[tuple]], -) -> Dict[str, Dict[str, List[tuple]]]: - - chromosome_gene_non_exons = {} - - for chromosome in chromosome_gene_exons: - chromosome_gene_non_exons[chromosome] = {} - gene_name_exon_list = {} - for gene_exons in chromosome_gene_exons[chromosome]: - gene_name_exon_list[gene_exons[1]] = 
gene_exons[0] - - gene_name_location_dict = {} - for gene_locations in chromosome_gene_locations_extended[chromosome]: - gene_name_location_dict[gene_locations[1]] = gene_locations[0] - - for gene_name in gene_name_location_dict: - non_exon_list = [] - if gene_name in gene_name_exon_list: - - start, end = gene_name_location_dict[gene_name] - coords = gene_name_exon_list[gene_name] - coords.sort(key=lambda a: a[0]) - - x = start - y = coords[0][0] - 1 - i = 0 - - n = len(coords) - while i < n: - if y <= coords[i][0]: - if x < y: - non_exon_list.append((x, y)) - x = coords[i][1] - else: - x = max(x, coords[i][1]) - - if i < n - 1: - y = min(end, coords[i + 1][0]) - i += 1 - chromosome_gene_non_exons[chromosome][gene_name] = non_exon_list.copy() - - return chromosome_gene_non_exons - - -@pytest.mark.parametrize( - "alignment_sort_order", - [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()], - ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"], -) -def _count_matrix_with_introns( - alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index -): - - chromosomes_gene_locations_extended = gtf.extract_extended_gene_names( - _test_annotation_file - ) - chromosomes_gene_exons = gtf.extract_gene_exons(_test_annotation_file) - - _test_chromosomes_gene_non_exons = extract_gene_non_exons( - chromosomes_gene_exons, chromosomes_gene_locations_extended - ) - - _test_chromosomes_gene_exons = {} - for chromosome in chromosomes_gene_exons: - _test_chromosomes_gene_exons[chromosome] = {} - for gene_exons in chromosomes_gene_exons[chromosome]: - _test_chromosomes_gene_exons[chromosome][gene_exons[1]] = gene_exons[0] - - # instantiate a test data generator - chromosome = list(_test_chromosomes_gene_exons.keys())[0] - - synthetic_data_generator = SyntheticTaggedAlignmentTypeBAMGenerator( - _test_num_cells, - _test_max_genes, - _test_chromosomes_gene_exons[chromosome], - _test_chromosomes_gene_non_exons[chromosome], - ) - - 
_test_temp_dir = tempfile.TemporaryDirectory() - try: - # generate test data - synthetic_data_generator.generate_synthetic_bam_and_counts_matrix( - _test_temp_dir.name, - gene_name_to_index, - test_index, - alignment_sort_order=alignment_sort_order, - ) - - # test data paths - test_bam_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.bam_output_filename, - ) - test_count_matrix_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename, - ) - test_row_index_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename, - ) - test_col_index_path = os.path.join( - _test_temp_dir.name, - SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename, - ) - # create CountMatrix from the synthetic bam - if test_index == consts.SINGLE_CELL_COUNT_MATRIX: - count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( - test_bam_path, gene_name_to_index - ) - if test_index == consts.SINGLE_NUCLEI_COUNT_MATRIX: - count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( - test_bam_path, - gene_name_to_index, - chromosomes_gene_locations_extended=chromosomes_gene_locations_extended, - ) - - # load the test counts matrix - _count_matrix_data_expected = sp.csr_matrix(np.load(test_count_matrix_path)) - row_index_expected = np.load(test_row_index_path) - col_index_expected = np.load(test_col_index_path) - - count_matrix_data_expected = CountMatrix( - _count_matrix_data_expected, row_index_expected, col_index_expected - ) - count_matrix_data_expected = count_matrix_data_expected.matrix.todense() - - finally: - _test_temp_dir.cleanup() - - count_matrix_data_from_bam = count_matrix_from_bam.matrix.todense() - row_index_from_bam = count_matrix_from_bam.row_index - col_index_from_bam = count_matrix_from_bam.col_index - - # sort expected and from_bam results by their respective row and column indices, since 
their sort order - # is not part of the design specs and is considered arbitrary - ( - sorted_count_matrix_data_from_bam, - sorted_row_index_from_bam, - sorted_col_index_from_bam, - ) = _get_sorted_count_matrix( - count_matrix_data_from_bam, row_index_from_bam, col_index_from_bam - ) - ( - sorted_count_matrix_data_expected, - sorted_row_index_expected, - sorted_col_index_expected, - ) = _get_sorted_count_matrix( - count_matrix_data_expected, row_index_expected, col_index_expected - ) - - assert all( - [ - row_name_from_bam == row_name_expected - for row_name_from_bam, row_name_expected in zip( - sorted_row_index_from_bam, sorted_row_index_expected - ) - ] - ) - assert all( - [ - col_name_from_bam == col_name_expected - for col_name_from_bam, col_name_expected in zip( - sorted_col_index_from_bam, sorted_col_index_expected - ) - ] - ) - - assert np.allclose( - sorted_count_matrix_data_from_bam, sorted_count_matrix_data_expected - ) - - -class SyntheticTaggedAlignmentTypeBAMGenerator: - """This class generates a synthetic count matrix and an accompanying synthetic tagged BAM file as - described in the preamble documentation block. 
- - Parameters - ---------- - num_cells : int - number of real cells - max-genes : int - maximum number of genes to use to generate synthetic counts - chromosomes_gene_exons : Dict[str, Dict[str, List[tuple]]] - keys at the first level refers to chromosome number, keys at the - second level refers to a gene and with the list of exonic regions as values - chromosomes_gene_non_exons : Dict[str, Dict[str, List[tuple]]] - keys at the first level refers to chromosome number, keys at the - second level refers to a gene and with the list of intronic regions as values - - rng_seed : int - random number generator seed - - Methods - ------- - generate_synthetic_bam_and_counts_matrix - generates synthetic test data and writes the output to disk - - See Also - -------- - count.from_sorted_tagged_bam - """ - - OUTPUT_PREFIX = "intronic_" - SYNTHETIC_SEQUENCE_LENGTH = 5 - REFERENCE_SEQUENCE_NAME = "1" - # EXONIC_SEQUENCE_NAME = "EXONIC_SEQUENCE" - SYNTHETIC_SEQUENCE_LENGTH = 100 - - bam_output_filename = OUTPUT_PREFIX + "records.bam" - count_matrix_output_filename = OUTPUT_PREFIX + "count_matrix.npy" - row_index_output_filename = OUTPUT_PREFIX + "_row_index.npy" - col_index_output_filename = OUTPUT_PREFIX + "_col_index.npy" - - def __init__( - self, - num_cells: int, - max_genes: int, - chromosomes_gene_exons: Dict[str, Dict[str, List[tuple]]], - chromosomes_gene_non_exons: Dict[str, List[tuple]], - rng_seed: int = 777, - ) -> None: - self.num_cells = num_cells - - self.chromosomes_gene_exons = chromosomes_gene_exons - self.chromosomes_gene_non_exons = chromosomes_gene_non_exons - - # initialize the random number generator - self.rng: np.random.RandomState = np.random.RandomState(seed=rng_seed) - - # generate gene names - self.all_gene_names = list(self.chromosomes_gene_exons.keys())[:max_genes] - self.num_genes = len(self.all_gene_names) - - self.max_genes = max_genes - assert ( - max_genes <= self.num_genes - ), f"Max genes ({self.max_genes}) must be <= to all annotated genes 
({self.num_genes})" - self.to_be_used_gene_indices: List[int] = self.rng.choice( - np.arange(0, self.num_genes, dtype=np.int), - size=self.max_genes, - replace=False, - ).tolist() - self.to_be_used_gene_names = [ - self.all_gene_names[j] for j in self.to_be_used_gene_indices - ] - - def _generate_random_cell_barcode(self, length: int = 16): - return self._generate_random_genomic_sequences(length) - - def _generate_random_molecule_barcode(self, length: int = 10): - return self._generate_random_genomic_sequences(length) - - def _generate_random_genomic_sequences(self, length: int): - return "".join(self.rng.choice(["A", "C", "T", "G"], size=length)) - - def _generate_location_based_tag_list( - self, num_alignments: int, gene_names: List[str], alignment_location: str - ): - alignment_record_tags = [] - for i in range(num_alignments): - alignment_record_tags.append( - AlignmentRecordTags( - self._generate_random_cell_barcode(), - self._generate_random_molecule_barcode(), - gene_names[i], - alignment_location, - ) - ) - - return alignment_record_tags - - def _add_alignment_start_coordinates(self, alignment_tags, alignment_location): - _alignment_tags = [] - - for alignment_tag in alignment_tags: - if alignment_location == "EXONIC": - if alignment_tag.gene_name in self.chromosomes_gene_exons: - coord = self.chromosomes_gene_exons[alignment_tag.gene_name] - setattr(alignment_tag, "coordinate", coord[0][0] + 1) - _alignment_tags.append(alignment_tag) - - if alignment_location == "INTRONIC": - if alignment_tag.gene_name in self.chromosomes_gene_non_exons: - coord = self.chromosomes_gene_non_exons[alignment_tag.gene_name] - if coord: - setattr(alignment_tag, "coordinate", coord[0][0] + 1) - alignment_tag.gene_name = "" - _alignment_tags.append(alignment_tag) - - return _alignment_tags - - def generate_synthetic_bam_and_counts_matrix( - self, - output_path: str, - gene_name_to_index: int, - test_index: int, - alignment_sort_order: bam.AlignmentSortOrder = 
CellMoleculeGeneQueryNameSortOrder(), - ): - """Generates synthetic count matrix and BAM file and writes them to disk. - - Parameters - ---------- - output_path : str - output path - gene_name_to_index : Dict[str, int] - gene name to an index - test_index : int - 0 for single cell matrix and 1 for single nuclei matrix - alignment_sort_order : bam.AlignmentSortOrder - sort order of BAM alignment records; if 'None', random sort order is implied - - Returns - ------- - None - """ - - gene_names_alignments = [] - - for gene_name in sorted(self.chromosomes_gene_non_exons.keys()): - if self.chromosomes_gene_non_exons[gene_name]: - gene_names_alignments.append(gene_name) - - gene_names: List[int] = [] - cell_ids: List[int] = [] - - records = [] - # Only exons, expected in both single-cell and single-nuclei modes - exonic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[0:], "EXONIC" - ) - exonic_alignment_tags = self._add_alignment_start_coordinates( - exonic_alignment_tags, "EXONIC" - ) - - for i, alignment_tag in enumerate(exonic_alignment_tags): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tag, - "EXONIC", - i, - 10, - self.rng, - reference_start=alignment_tag.coordinate, - ) - records.append(pysam_alignment) - gene_names.append(alignment_tag.gene_name) - cell_ids.append(alignment_tag.cell_barcode) - - "Only introns only in single-nuclei mode" - intronic_alignment_tags = self._generate_location_based_tag_list( - 3, gene_names_alignments[10:], "INTRONIC" - ) - intronic_alignment_tags = self._add_alignment_start_coordinates( - intronic_alignment_tags, "INTRONIC" - ) - for i, alignment_tag in enumerate(intronic_alignment_tags): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - alignment_tag, - "INTRONIC", - i + 10, - 10, - self.rng, - reference_start=alignment_tag.coordinate, - ) - records.append(pysam_alignment) - if test_index == 
consts.SINGLE_NUCLEI_COUNT_MATRIX: - gene_names.append(gene_names_alignments[i + 10]) - cell_ids.append(alignment_tag.cell_barcode) - - "both intron and exons from the same gene in bost single-cell and single-nuclei modes" - exonic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[20:], "EXONIC" - ) - exonic_alignment_tags = self._add_alignment_start_coordinates( - exonic_alignment_tags, "EXONIC" - ) - - _intronic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[20:], "INTRONIC" - ) - intronic_alignment_tags = [] - for intronic_tag, exonic_tag in zip( - _intronic_alignment_tags, exonic_alignment_tags - ): - intronic_tag.cell_barcode = exonic_tag.cell_barcode - intronic_alignment_tags.append(intronic_tag) - intronic_alignment_tags = self._add_alignment_start_coordinates( - intronic_alignment_tags, "INTRONIC" - ) - - for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate( - zip(exonic_alignment_tags, intronic_alignment_tags) - ): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - exonic_alignment_tag, - "EXONINTRONSAME", - i + 20, - 10, - self.rng, - reference_start=exonic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - intronic_alignment_tag, - "EXONINTRONSAME", - i + 20, - 10, - self.rng, - reference_start=intronic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - cell_ids.append(exonic_alignment_tag.cell_barcode) - gene_names.append(exonic_alignment_tag.gene_name) - - # both intron and exons from separate genes should not appear in single-cell mode - exonic_alignment_tags = self._generate_location_based_tag_list( - 10, gene_names_alignments[30:], "EXONIC" - ) - exonic_alignment_tags = self._add_alignment_start_coordinates( - exonic_alignment_tags, "EXONIC" - ) - - _intronic_alignment_tags = 
self._generate_location_based_tag_list( - 10, gene_names_alignments[31:], "INTRONIC" - ) - intronic_alignment_tags = [] - for intronic_tag, exonic_tag in zip( - _intronic_alignment_tags, exonic_alignment_tags - ): - intronic_tag.cell_barcode = exonic_tag.cell_barcode - intronic_alignment_tags.append(intronic_tag) - intronic_alignment_tags = self._add_alignment_start_coordinates( - intronic_alignment_tags, "INTRONIC" - ) - - for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate( - zip(exonic_alignment_tags, intronic_alignment_tags) - ): - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - exonic_alignment_tag, - "EXONINTRONSEP", - i + 30, - 10, - self.rng, - reference_start=exonic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - - pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( - intronic_alignment_tag, - "EXONINTRONSEP", - i + 30, - 10, - self.rng, - reference_start=intronic_alignment_tag.coordinate, - ) - records.append(pysam_alignment) - - if test_index == consts.SINGLE_CELL_COUNT_MATRIX: - cell_ids.append(exonic_alignment_tag.cell_barcode) - gene_names.append(exonic_alignment_tag.gene_name) - - # write BAM file - with pysam.AlignmentFile( - os.path.join(output_path, self.bam_output_filename), - mode="wb", - reference_names=[self.REFERENCE_SEQUENCE_NAME], - reference_lengths=[self.SYNTHETIC_SEQUENCE_LENGTH], - ) as bo: - for record in records: - bo.write(record) - - n_genes = len(gene_name_to_index) - n_data = len(cell_ids) - # write count matrix, row index, and col index - count_matrix = np.zeros((n_data, n_genes), dtype=np.int32) - for i, (cell_id, gene_name) in enumerate(zip(cell_ids, gene_names)): - count_matrix[i][gene_name_to_index[gene_name]] = 1 - - test_count_matrix_path = os.path.join( - output_path, - SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename, - ) - test_row_index_path = os.path.join( - output_path, - 
SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename, - ) - test_col_index_path = os.path.join( - output_path, - SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename, - ) - - np.save(test_count_matrix_path, count_matrix) - np.save(test_row_index_path, cell_ids) - gene_rank = [(gene, rank) for gene, rank in gene_name_to_index.items()] - gene_rank.sort(key=lambda x: x[1]) - gene_names = [x[0] for x in gene_rank] - np.save(test_col_index_path, gene_names) - - return os.path.join(output_path, self.bam_output_filename) diff --git a/tools/scripts/sctools/src/sctools/test/test_encodings.py b/tools/scripts/sctools/src/sctools/test/test_encodings.py deleted file mode 100644 index 1eeb4584..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_encodings.py +++ /dev/null @@ -1,97 +0,0 @@ -import pytest -from .. import encodings -from itertools import combinations - - -@pytest.fixture(scope="module") -def sequence(): - return b"ACGTTTGAGATGAGATATAGANNNN" - - -@pytest.fixture(scope="module") -def encoder_2bit(sequence): - length = len(sequence) - return encodings.TwoBit(length) - - -@pytest.fixture(scope="module") -def encoder_3bit(): - return encodings.ThreeBit() - - -@pytest.fixture(scope="module", params=[encodings.TwoBit, encodings.ThreeBit]) -def encoder(request): - return request.param - - -def test_two_bit_encode_decode_produces_same_string_except_for_N( - sequence, encoder_2bit -): - encoded = encoder_2bit.encode(sequence) - decoded = encoder_2bit.decode(encoded) - assert sequence[:4] == decoded[:4] # last 4 are N, which get randomized - - -def test_three_bit_encode_decode_produces_same_string(sequence, encoder_3bit): - encoded = encoder_3bit.encode(sequence) - decoded = encoder_3bit.decode(encoded) - assert sequence == decoded - - -def test_two_bit_encoder_gets_correct_gc_content(encoder_2bit): - sequence_no_n = b"AGCGCGAT" - gc_content = sequence_no_n.count(b"C") + sequence_no_n.count(b"G") - encoded = 
encoder_2bit.encode(sequence_no_n) - assert encoder_2bit.gc_content(encoded) == gc_content - - -def test_three_bit_encoder_gets_correct_gc_content(sequence, encoder_3bit): - encoded = encoder_3bit.encode(sequence) - assert encoder_3bit.gc_content(encoded) == sequence.count(b"C") + sequence.count( - b"G" - ) - - -def test_two_bit_throws_errors_when_asked_to_encode_unknown_nucleotide(encoder_2bit): - with pytest.raises(KeyError): - encoder_2bit.encode(b"ACGTP") # P is not a valid code - - -def test_three_bit_encodes_unknown_nucleotides_as_N(encoder_3bit): - encoded = encoder_3bit.encode(b"ACGTP") # P is not a valid code - decoded = encoder_3bit.decode(encoded) - assert decoded == b"ACGTN" - - -@pytest.fixture -def simple_barcodes(): - """simple barcode set with min_hamming = 1, max_hamming = 2""" - return [b"ACGT", b"ACGG", b"ACGA", b"ACGC", b"TCGT", b"CCGT", b"GCGT"] - - -@pytest.fixture -def simple_hamming_distances(simple_barcodes): - simple_hamming_distances = [] - for a, b in combinations(simple_barcodes, 2): - d_hamming = 0 - for i, j in zip(a, b): - if i != j: - d_hamming += 1 - simple_hamming_distances.append(d_hamming) - return simple_hamming_distances - - -def test_encoded_hamming_distance_is_accurate( - simple_hamming_distances, simple_barcodes, encoder -): - # encode simple barcodes - tbe = encoder(4) - encoded = [tbe.encode(b) for b in simple_barcodes] - encoded_hamming_distances = [] - - # use hamming distance function - for a, b in combinations(encoded, 2): - encoded_hamming_distances.append(tbe.hamming_distance(a, b)) - - # verify they are the same as the simple function used in this file - assert simple_hamming_distances == encoded_hamming_distances diff --git a/tools/scripts/sctools/src/sctools/test/test_entrypoints.py b/tools/scripts/sctools/src/sctools/test/test_entrypoints.py deleted file mode 100644 index 419b3257..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_entrypoints.py +++ /dev/null @@ -1,307 +0,0 @@ -import glob -import os 
-import tempfile - -import numpy as np -import pysam -import pytest -import scipy.sparse as sp - -from sctools import bam, platform, count, consts - -data_dir = os.path.split(__file__)[0] + "/data/" - - -def test_Attach10XBarcodes_entrypoint(): - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - ] - - rc = platform.TenXV2.attach_barcodes(args) - assert rc == 0 - with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: - for alignment in f: - # each alignment should now have a tag, and that tag should be a string - assert isinstance( - alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str - ) - assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str - ) - os.remove("test_tagged_bam.bam") # clean up - - -def test_Attach10XBarcodes_entrypoint_with_whitelist(): - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - ] - - return_call = platform.TenXV2.attach_barcodes(args) - assert return_call == 0 - success = False - with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: - for alignment in f: - if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): - success = True - # each alignment should now have a tag, and that tag should be a string - assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) - assert isinstance( - 
alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str - ) - assert isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str - ) - assert success - os.remove("test_tagged_bam.bam") # clean up - - -def test_AttachBarcodes_entrypoint_with_whitelist(): - # test of the BarcodePlatform.attach_barcodes entry point with - # sample, cell, and molecule barcodes all specified - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - "--sample-barcode-start-position", - "0", - "--sample-barcode-length", - "8", - "--cell-barcode-start-position", - "0", - "--cell-barcode-length", - "16", - "--molecule-barcode-start-position", - "16", - "--molecule-barcode-length", - "7", # changed 10>7 intentionally for test - ] - - return_call = platform.BarcodePlatform.attach_barcodes(args) - assert return_call == 0 - success = False - with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: - for alignment in f: - if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): - success = True - # each alignment should now have a tag, and that tag should be a string - assert isinstance(alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY), str - ) - assert isinstance( - alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY), str - ) - assert len(alignment.get_tag(consts.RAW_MOLECULE_BARCODE_TAG_KEY)) == 7 - assert isinstance( - alignment.get_tag(consts.QUALITY_MOLECULE_BARCODE_TAG_KEY), str - ) - assert 
isinstance(alignment.get_tag(consts.RAW_SAMPLE_BARCODE_TAG_KEY), str) - assert isinstance( - alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str - ) - assert success - os.remove("test_tagged_bam.bam") # clean up - - -def test_split_bam(): - tag_args = [ - "--r1", - data_dir + "test_r1.fastq", - "--i1", - data_dir + "test_i7.fastq", - "--u2", - data_dir + "test.bam", - "--output-bamfile", - "test_tagged_bam.bam", - "--whitelist", - data_dir + "1k-august-2016.txt", - ] - - platform.TenXV2.attach_barcodes(tag_args) - - split_args = [ - "--bamfile", - "test_tagged_bam.bam", - "--output-prefix", - "test_tagged", - "--subfile-size", - "0.005", - "--tags", - consts.CELL_BARCODE_TAG_KEY, - consts.RAW_CELL_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.split_bam(split_args) - assert return_call == 0 - - for f in glob.glob("test_tagged*"): - os.remove(f) - - -def test_tag_sort_bam(): - args = [ - "-i", - data_dir + "unsorted.bam", - "-o", - "test_sorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.tag_sort_bam(args) - assert return_call == 0 - - tag_keys = [ - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - with pysam.AlignmentFile("test_sorted.bam", "rb") as f: - segments = f.fetch(until_eof=True) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - for f in glob.glob("test_sorted*"): - os.remove(f) - - -def test_tag_sort_bam_dash_t_specified_multiple_times(): - args = [ - "-i", - data_dir + "unsorted.bam", - "-o", - "test_sorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - "-t", - consts.GENE_NAME_TAG_KEY, - "-t", - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.tag_sort_bam(args) - assert return_call == 0 - - tag_keys = [ - 
consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - with pysam.AlignmentFile("test_sorted.bam", "rb") as f: - segments = f.fetch(until_eof=True) - tag_sortable_record_generator = ( - bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments - ) - bam.verify_sort(tag_sortable_record_generator, tag_keys) - - for f in glob.glob("test_sorted*"): - os.remove(f) - - -def test_tag_sort_bam_no_tags(): - args = ["-i", data_dir + "unsorted.bam", "-o", "test_sorted.bam"] - - return_call = platform.GenericPlatform.tag_sort_bam(args) - assert return_call == 0 - - tag_keys = [] - with pysam.AlignmentFile("test_sorted.bam", "rb") as f: - segments = f.fetch(until_eof=True) - tag_sortable_records = ( - bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments - ) - bam.verify_sort(tag_sortable_records, tag_keys) - - for f in glob.glob("test_sorted*"): - os.remove(f) - - -def test_verify_bam_sort(): - args = [ - "-i", - data_dir + "cell-gene-umi-queryname-sorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - return_call = platform.GenericPlatform.verify_bam_sort(args) - assert return_call == 0 - - -def test_verify_bam_sort_raises_error_on_unsorted(): - args = [ - "-i", - data_dir + "unsorted.bam", - "-t", - consts.CELL_BARCODE_TAG_KEY, - consts.GENE_NAME_TAG_KEY, - consts.MOLECULE_BARCODE_TAG_KEY, - ] - - with pytest.raises(bam.SortError): - platform.GenericPlatform.verify_bam_sort(args) - - -def test_count_merge(): - tmp = tempfile.mkdtemp() - - data, ind, col = [np.arange(10)] * 3 - matrix = sp.coo_matrix((data, (ind, col)), shape=(10, 10), dtype=np.float32).tocsr() - # be lazy and reuse the inds as the col and row index - counts = count.CountMatrix(matrix, ind, col) - counts.save(tmp + "/test_input_1") - counts.save(tmp + "/test_input_2") - - merge_args = [ - "-o", - tmp + "/test_merged_counts", - "-i", - tmp + "/test_input_2", - tmp + 
"/test_input_1", - ] - return_call = platform.GenericPlatform.merge_count_matrices(merge_args) - assert return_call == 0 diff --git a/tools/scripts/sctools/src/sctools/test/test_fastq.py b/tools/scripts/sctools/src/sctools/test/test_fastq.py deleted file mode 100644 index fdf8f58c..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_fastq.py +++ /dev/null @@ -1,275 +0,0 @@ -import os -import string -from functools import partial -from itertools import product - -import pytest - -from .. import fastq, consts -from ..reader import zip_readers - -# set some useful globals for testing -data_dir = os.path.split(__file__)[0] + "/data/" -_i7_files = [ - data_dir + f for f in ("test_i7.fastq", "test_i7.fastq.gz", "test_i7.fastq.bz2") -] -_files = [data_dir + f for f in ("test_i7.fastq", "test_r1.fastq", "test_r2.fastq")] -_gz_files = [ - data_dir + f for f in ("test_i7.fastq.gz", "test_r1.fastq.gz", "test_r2.fastq.gz") -] -_bz2_files = [ - data_dir + f - for f in ("test_i7.fastq.bz2", "test_r1.fastq.bz2", "test_r2.fastq.bz2") -] - -_modes = ("r", "rb") -_files_and_modes = list(product(_i7_files, _modes)) -_multifiles_and_modes = list(product((_files, _gz_files, _bz2_files), _modes)) -_map_encoder = {"r": str, "rb": partial(bytes, encoding="utf-8")} - - -# TEST READER - - -@pytest.fixture(scope="module", params=_files_and_modes) -def i7_files_compressions_and_modes(request): - """generates different compression types and modes for testing""" - return request.param[0], request.param[1] - - -@pytest.fixture(scope="module", params=_multifiles_and_modes) -def reader_all_compressions(request): - """generates open fastq reader files for each compression and read mode""" - return fastq.Reader(request.param[0], request.param[1]) - - -@pytest.fixture(scope="module") -def bytes_fastq_record(): - return [b"@name\n", b"ACTACAAT\n", b"+\n", b"%%%%AAAA\n"] - - -@pytest.fixture(scope="module") -def string_fastq_record(): - return ["@name\n", "ACTACAAT\n", "+\n", "%%%%AAAA\n"] - - 
-def test_reader_stores_filenames(): - names = ["notreal", "fake"] - rd = fastq.Reader(files=names) - assert rd.filenames == names - - -def test_reader_reads_first_record(reader_all_compressions): - for record in reader_all_compressions: - assert isinstance(record, fastq.Record) - expected_result = ( - "NCACAATG\n" if isinstance(record.sequence, str) else b"NCACAATG\n" - ) - assert record.sequence == expected_result - break # just first record - - -def test_reader_skips_header_character_raises_value_error( - i7_files_compressions_and_modes, -): - """ - test should skip the first name line, shifting each record up 1. As a result, the - first sequence should be found in the name field - """ - filename, mode = i7_files_compressions_and_modes - rd = fastq.Reader(filename, mode=mode, header_comment_char="@") - with pytest.raises(ValueError): - next(iter(rd)) - - -def test_reader_reads_correct_number_of_records_across_multiple_files( - reader_all_compressions, -): - assert len(reader_all_compressions) == 300 # 3 files - - -def test_mixed_filetype_read_gets_correct_record_number(): - rd = fastq.Reader([_gz_files[0], _bz2_files[0]], mode="r", header_comment_char="#") - - assert len(rd) == 200 - - -def test_non_string_filename_raises_typeerror(): - with pytest.raises(TypeError): - _ = fastq.Reader(10, "r") - - -def test_non_string_filename_in_iterable_raises_typeerror(): - with pytest.raises(TypeError): - _ = fastq.Reader(("works", 10), "r") - - -def test_invalid_open_mode_raises_valueerror(): - with pytest.raises(ValueError): - _ = fastq.Reader("works", "not_acceptable_open_mode") - - -def test_fastq_returns_correct_filesize_for_single_and_multiple_files(): - rd = fastq.Reader( - _i7_files[0], mode="r", header_comment_char="#" # mode irrelevant - ) - assert rd.size == 7774 - - rd = fastq.Reader(_i7_files, mode="r", header_comment_char="#") # mode irrelevant - assert rd.size == 7774 + 853 + 802 # three file sizes - - -def test_reader_properly_subsets_based_on_indices(): - 
rd = fastq.Reader(_i7_files[0], mode="r") - indices = {0, 5, 10, 12} - n_records = sum(1 for _ in rd.select_record_indices(indices)) - assert n_records == len(indices) - - -def test_zipping_readers_generates_expected_output(): - rd1 = fastq.Reader(_files[0], "r") - rd2 = fastq.Reader(_files[0], "r") - for r1, r2 in zip_readers(rd1, rd2): - assert isinstance(r1, fastq.Record) - assert isinstance(r2, fastq.Record) - expected_result = "NCACAATG\n" - assert r1.sequence == r2.sequence == expected_result - break # just first record - - -def test_zipping_readers_with_indices_generates_expected_output(): - rd1 = fastq.Reader(_files[0], "r") - rd2 = fastq.Reader(_files[0], "r") - indices = {0, 1, 2, 3} - for r1, r2 in zip_readers(rd1, rd2, indices=indices): - assert isinstance(r1, fastq.Record) - assert isinstance(r2, fastq.Record) - expected_result = "NCACAATG\n" - assert r1.sequence == r2.sequence == expected_result - break # just first record - - -def test_printing_bytes_record_generates_valid_fastq_record(bytes_fastq_record): - record = fastq.Record(bytes_fastq_record) - assert str(record) == b"".join(bytes_fastq_record).decode() - assert bytes(record) == b"".join(bytes_fastq_record) - - -def test_bytes_fastq_record_quality_score_parsing(bytes_fastq_record): - record = fastq.Record(bytes_fastq_record) - assert record.average_quality() == 18 - - -def test_printing_string_record_generates_valid_fastq_record(string_fastq_record): - record = fastq.StrRecord(string_fastq_record) - assert str(record) == "".join(string_fastq_record) - assert bytes(record) == "".join(string_fastq_record).encode() - - -def test_string_fastq_record_quality_score_parsing(string_fastq_record): - record = fastq.StrRecord(string_fastq_record) - assert record.average_quality() == 18 - - -# TEST RECORD - - -def test_fields_populate_properly(reader_all_compressions): - encoder = _map_encoder[reader_all_compressions._mode] - name_prefix = encoder("@") - alphabet = set(encoder("ACGTN")) - name2_string = 
encoder("+\n") - ascii_chars = set(i for i in encoder(string.printable)) - for record in reader_all_compressions: - assert record.name.startswith(name_prefix) - assert all(i in alphabet for i in record.sequence.strip()) - assert record.name2 == name2_string - assert all(i in ascii_chars for i in record.quality.strip()) - - -# TEST BarcodeGeneratorWithCorrectedCellbarcodes - - -@pytest.fixture(scope="function") -def embedded_barcode_generator(): - cell_barcode = fastq.EmbeddedBarcode( - start=0, - end=16, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - molecule_barcode = fastq.EmbeddedBarcode( - start=16, - end=26, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - return fastq.EmbeddedBarcodeGenerator( - data_dir + "test_r1.fastq.gz", [cell_barcode, molecule_barcode] - ) - - -@pytest.fixture(scope="function") -def barcode_generator_with_corrected_cell_barcodes(): - cell_barcode = fastq.EmbeddedBarcode( - start=0, - end=16, - quality_tag=consts.QUALITY_CELL_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_CELL_BARCODE_TAG_KEY, - ) - molecule_barcode = fastq.EmbeddedBarcode( - start=16, - end=26, - quality_tag=consts.QUALITY_MOLECULE_BARCODE_TAG_KEY, - sequence_tag=consts.RAW_MOLECULE_BARCODE_TAG_KEY, - ) - return fastq.BarcodeGeneratorWithCorrectedCellBarcodes( - data_dir + "test_r1.fastq.gz", - cell_barcode, - data_dir + "1k-august-2016.txt", - [molecule_barcode], - ) - - -def test_embedded_barcode_generator_produces_outputs_of_expected_size( - embedded_barcode_generator, -): - for cell_seq, cell_qual, umi_seq, umi_qual in embedded_barcode_generator: - - # correct values - correct_cell_barcode_length = 16 - correct_umi_length = 10 - - # note that all barcodes are strings and therefore should get 'Z' values - - # test cell tags - assert cell_seq[0] == consts.RAW_CELL_BARCODE_TAG_KEY - assert len(cell_seq[1]) == correct_cell_barcode_length - assert all(v 
in "ACGTN" for v in cell_seq[1]) - assert cell_seq[2] == "Z" - assert cell_qual[0] == consts.QUALITY_CELL_BARCODE_TAG_KEY - assert len(cell_qual[1]) == correct_cell_barcode_length - assert all(v in string.printable for v in cell_qual[1]) - assert cell_seq[2] == "Z" - - # test umi tags - assert umi_seq[0] == consts.RAW_MOLECULE_BARCODE_TAG_KEY - assert len(umi_seq[1]) == correct_umi_length - assert all(v in "ACGTN" for v in umi_seq[1]) - assert umi_seq[2] == "Z" - assert umi_qual[0] == consts.QUALITY_MOLECULE_BARCODE_TAG_KEY - assert len(umi_qual[1]) == correct_umi_length - assert all(v in string.printable for v in umi_qual[1]) - assert umi_seq[2] == "Z" - - break # just the first tag is fine - - -def test_corrects_barcodes(barcode_generator_with_corrected_cell_barcodes): - success = False - for barcode_sets in barcode_generator_with_corrected_cell_barcodes: - for barcode_set in barcode_sets: - if barcode_set[0] == consts.CELL_BARCODE_TAG_KEY: - success = True - break - assert success diff --git a/tools/scripts/sctools/src/sctools/test/test_groups.py b/tools/scripts/sctools/src/sctools/test/test_groups.py deleted file mode 100644 index 71d24539..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_groups.py +++ /dev/null @@ -1,345 +0,0 @@ -import os -import csv -import itertools -from sctools import platform - - -data_dir = os.path.split(__file__)[0] + "/data/group_metrics/" -unpaired_data_dir = os.path.split(__file__)[0] + "/data/group_metrics_unpaired_ss2/" - - -def check_parsed_metrics_csv(file_name, cell_id, class_name, expected_metrics): - with open(file_name) as f: - column_headers = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - metrics = f.readline().strip().split(",") - assert classes[0] == "Class" - assert set(classes[1:]) == {class_name} - for idx, each in enumerate(column_headers): - if idx == 0: - assert metrics[0] == cell_id - if idx > 0: - metric_name = column_headers[idx] - assert metrics[idx] == 
expected_metrics[metric_name] - - -def test_write_aggregated_picard_metrics_by_row(): - args = [ - "-f", - data_dir + "test_qc.alignment_summary_metrics.txt", - data_dir + "test_qc.insert_size_metrics.txt", - data_dir + "test_qc.duplicate_metrics.txt", - data_dir + "test_qc.rna_metrics.txt", - data_dir + "test_qc.gc_bias.summary_metrics.txt", - "-t", - "Picard", - "-o", - "output_picard_group", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = {} - with open(data_dir + "expected_picard_group.csv") as f: - column_headers = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - metrics = f.readline().strip().split(",") - for idx, each in enumerate(column_headers): - expected_metrics[each] = {"class": classes[idx], "metric": metrics[idx]} - with open("output_picard_group.csv") as f: - column_headers = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - metrics = f.readline().strip().split(",") - assert len(column_headers) == len(expected_metrics.keys()) - for idx, each in enumerate(column_headers): - header = expected_metrics[each] - assert classes[idx] == header["class"] - assert metrics[idx] == header["metric"] - os.remove("output_picard_group.csv") - - -def test_write_aggregated_picard_metrics_by_table(): - args = [ - "-t", - "PicardTable", - "-o", - "output_picard_group", - "-f", - data_dir + "test_qc.error_summary_metrics.txt", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = [ - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "C"), - ("ALT_COUNT", "16"), - ("REF_BASE", "A"), - ("REF_COUNT", "231512"), - ("SUBSTITUTION", "A>C"), - ("SUBSTITUTION_RATE", "6.9e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "G"), - ("ALT_COUNT", "156"), - ("REF_BASE", "A"), - ("REF_COUNT", "231512"), - ("SUBSTITUTION", "A>G"), - ("SUBSTITUTION_RATE", "0.000673"), - ] - ), - dict( - [ - 
("Sample", "test"), - ("ALT_BASE", "T"), - ("ALT_COUNT", "16"), - ("REF_BASE", "A"), - ("REF_COUNT", "231512"), - ("SUBSTITUTION", "A>T"), - ("SUBSTITUTION_RATE", "6.9e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "A"), - ("ALT_COUNT", "16"), - ("REF_BASE", "C"), - ("REF_COUNT", "173880"), - ("SUBSTITUTION", "C>A"), - ("SUBSTITUTION_RATE", "9.2e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "G"), - ("ALT_COUNT", "14"), - ("REF_BASE", "C"), - ("REF_COUNT", "173880"), - ("SUBSTITUTION", "C>G"), - ("SUBSTITUTION_RATE", "8.1e-05"), - ] - ), - dict( - [ - ("Sample", "test"), - ("ALT_BASE", "T"), - ("ALT_COUNT", "82"), - ("REF_BASE", "C"), - ("REF_COUNT", "173880"), - ("SUBSTITUTION", "C>T"), - ("SUBSTITUTION_RATE", "0.000471"), - ] - ), - ] - - with open("output_picard_group_error_summary_metrics.csv") as f: - reader = csv.DictReader(f) - - i = 0 - match_list = [] - for line in reader: - assert line in expected_metrics - i = i + 1 - - # expect the same set, list to be precise, of indices - assert i == len(expected_metrics) - - os.remove("output_picard_group_error_summary_metrics.csv") - - -def test_parse_hisat2_paired_end_log(): - args = [ - "-f", - data_dir + "test_hisat2_paired_end_qc.log", - "-t", - "HISAT2", - "-o", - "output_hisat2", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - cell_id = "test_hisat2_paired_end" - tag = "HISAT2G" - expected_metrics = { - "Total pairs": "5479", - "Aligned concordantly or discordantly 0 time": "412", - "Aligned concordantly 1 time": "4414", - "Aligned concordantly >1 times": "652", - "Aligned discordantly 1 time": "1", - "Total unpaired reads": "824", - "Aligned 0 time": "478", - "Aligned 1 time": "240", - "Aligned >1 times": "106", - "Overall alignment rate": "95.64%", - } - check_parsed_metrics_csv("output_hisat2.csv", cell_id, tag, expected_metrics) - os.remove("output_hisat2.csv") - - -def test_parse_hisat2_transcriptome_log(): - args = [ - 
"-f", - data_dir + "test_hisat2_transcriptome_rsem.log", - "-t", - "HISAT2", - "-o", - "output_hisat2_trans", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - cell_id = "test_hisat2_transcriptome" - tag = "HISAT2T" - expected_metrics = { - "Total pairs": "5479", - "Aligned concordantly or discordantly 0 time": "3635", - "Aligned concordantly 1 time": "360", - "Aligned concordantly >1 times": "1484", - "Aligned discordantly 1 time": "0", - "Total unpaired reads": "7270", - "Aligned 0 time": "7270", - "Aligned 1 time": "0", - "Aligned >1 times": "0", - "Overall alignment rate": "33.66%", - } - check_parsed_metrics_csv("output_hisat2_trans.csv", cell_id, tag, expected_metrics) - os.remove("output_hisat2_trans.csv") - - -def test_parse_rsem_cnt(): - file_name = data_dir + "test_rsem.cnt" - args = ["-f", file_name, "-t", "RSEM", "-o", "output_rsem"] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - cell_id = "test" - class_name = "RSEM" - expected_metrics = None - with open(file_name) as f: - N0, N1, N2, N_tot = f.readline().strip().split(" ") - n_unique, n_multi, n_uncertain = f.readline().strip().split(" ") - n_hits, read_type = f.readline().strip().split(" ") - expected_metrics = { - "unalignable reads": N0, - "alignable reads": N1, - "filtered reads": N2, - "total reads": N_tot, - "unique aligned": n_unique, - "multiple mapped": n_multi, - "total alignments": n_hits, - "strand": read_type, - "uncertain reads": n_uncertain, - } - check_parsed_metrics_csv("output_rsem.csv", cell_id, class_name, expected_metrics) - os.remove("output_rsem.csv") - - -def test_write_aggregated_qc_metrics(): - input_files = [ - data_dir + "test_picard_group.csv", - data_dir + "test_hisat2.csv", - data_dir + "test_hisat2_trans.csv", - data_dir + "test_rsem.csv", - ] - args = [ - "-f", - data_dir + "test_picard_group.csv", - data_dir + "test_hisat2.csv", - data_dir + "test_hisat2_trans.csv", - 
data_dir + "test_rsem.csv", - "-t", - "Core", - "-o", - "output_QCs", - ] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = [] - expected_headers = [] - for input_file in input_files: - with open(input_file) as f: - reader = csv.DictReader(f) - expected_headers.extend(reader.fieldnames[1:]) - for idx, line in enumerate(reader): - if len(expected_metrics) < idx + 1: - expected_metrics.append(line) - else: - expected_metrics[idx].update(line) - output_headers = [] - with open("output_QCs.csv") as output_file: - reader = csv.DictReader(output_file) - output_headers.extend(reader.fieldnames) - for line in reader: - assert line in expected_metrics - # The output file should contain all of the column headers from the input files plus the "joined column" containing row headers - assert len(output_headers) == len(expected_headers) + 1 - os.remove("output_QCs.csv") - - -def test_unpaired_ss2_write_aggregated_picard_metrics_by_row(): - - sources = [ - unpaired_data_dir + "SRR6258488_qc.alignment_summary_metrics.txt", - unpaired_data_dir + "SRR6258488_qc.duplicate_metrics.txt", - unpaired_data_dir + "SRR6258488_qc.gc_bias.summary_metrics.txt", - unpaired_data_dir + "SRR6258488_qc.rna_metrics.txt", - ] - - args = ["-f", *sources, "-t", "Picard", "-o", "output_picard_group_unpaired"] - return_code = platform.GenericPlatform.group_qc_outputs(args) - assert return_code == 0 - - expected_metrics = {} - - for source in sources: - with open(source) as f: - for line in f: - if line.startswith("## METRICS CLASS"): - class_ = line.strip().split("\t")[1].split(".")[-1] - break - labels = f.readline().strip().split("\t") - values = f.readline().strip().split("\t") - - for label, value in itertools.zip_longest(labels, values, fillvalue=""): - if label in ("LIBRARY", "SAMPLE", "READ_GROUP", "CATEGORY"): - continue - if class_ == "AlignmentSummaryMetrics": - label += ".UNPAIRED" - try: - value = str(float(value)) - except 
ValueError: - pass - expected_metrics[(class_, label)] = value - expected_metrics[("Class", "")] = "SRR6258488" - - with open("output_picard_group_unpaired.csv") as f: - labels = f.readline().strip().split(",") - classes = f.readline().strip().split(",") - values = f.readline().strip().split(",") - assert len(labels) == len(expected_metrics) - - for class_, label in expected_metrics: - if class_ not in classes or label not in labels: - print("!", class_, label) - - for class_, label, value in zip(classes, labels, values): - assert (class_, label) in expected_metrics - try: - value = str(float(value)) - except ValueError: - value = value - try: - expected_value = str(float(expected_metrics[(class_, label)])) - except ValueError: - expected_value = expected_metrics[(class_, label)] - assert value == expected_value - os.remove("output_picard_group_unpaired.csv") diff --git a/tools/scripts/sctools/src/sctools/test/test_gtf.py b/tools/scripts/sctools/src/sctools/test/test_gtf.py deleted file mode 100644 index fd74ea91..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_gtf.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -from .. 
import gtf -from itertools import chain -import pytest - -_data_dir = os.path.split(__file__)[0] + "/data" -_files = ["%s/%s" % (_data_dir, f) for f in ("test.gtf", "test.gtf.gz", "test.gtf.bz2")] - - -@pytest.fixture(scope="module", params=_files) -def files(request): - """returns a filename""" - return request.param - - -def test_opens_file_reads_first_line(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - assert isinstance(record, gtf.GTFRecord) - - -def test_opens_file_populates_fields_properly(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - assert record.seqname == "chr19" - assert record.chromosome == "chr19" - assert record.source == "HAVANA" - assert record.feature == "gene" - assert record.start == 60951 - assert record.end == 71626 - assert record.score == "." - assert record.strand == "-" - assert record.frame == "." - - expected_features = { - "gene_id": "ENSG00000282458.1", - "gene_type": "transcribed_processed_pseudogene", - "gene_status": "KNOWN", - "gene_name": "WASH5P", - "level": "2", - "havana_gene": "OTTHUMG00000180466.8", - } - assert record._attributes == expected_features - - assert all( - i in str(record) - for i in chain(expected_features.keys(), expected_features.values()) - ) - - -def test_set_attribute_verify_included_in_output_string(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - record.set_attribute("test_attr", "foo") - assert record.get_attribute("test_attr") == "foo" - - # verify in output string - assert "foo" in str(record) - - -def test_opens_file_parses_size(files): - rd = gtf.Reader(files, "r", header_comment_char="#") - record = next(iter(rd)) - assert 71626 - 60951 == record.size - - # mangle record, make sure error is raised - record._fields[3:5] = [record.end, record.start] - with pytest.raises(ValueError): - getattr(record, "size") diff --git 
a/tools/scripts/sctools/src/sctools/test/test_metrics.py b/tools/scripts/sctools/src/sctools/test/test_metrics.py deleted file mode 100644 index 303c573d..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_metrics.py +++ /dev/null @@ -1,930 +0,0 @@ -import fileinput -import math -import os -import tempfile -from typing import Callable - -import numpy as np -import pandas as pd -import pytest -from sctools.metrics.gatherer import ( - GatherGeneMetrics, - GatherCellMetrics, - MetricGatherer, -) -from sctools.metrics.merge import MergeCellMetrics, MergeGeneMetrics -from sctools.platform import TenXV2 - -""" -Testing Data Definition & Acquisition - -The data hardcoded into this file come from the two notebooks associated with these metrics: -characterize-cell-testing-data.ipynb and characterize-gene-testing-data.ipynb. In these -notebooks, the testing .bam files are loaded into memory and interrogated for each of the -metrics in question using pandas and numpy commands. These independent calculation provide the -hard-coded data found in these tests. When testing data is changed, the notebook can be updated -to re-calculate the values found in this file. 
-""" - -# set the input and output directories, using a tempdir to automatically clean up generated files -_data_dir = os.path.split(__file__)[0] + "/data" -_test_dir = tempfile.mkdtemp() -os.makedirs(_test_dir, exist_ok=True) - -# note, to inspect these testing files, please install samtools and use the following command: -# samtools view | less - -# set the input files -_gene_sorted_bam = os.path.join(_data_dir, "small-gene-sorted.bam") -_cell_sorted_bam = os.path.join(_data_dir, "small-cell-sorted.bam") -_cell_sorted_bam_missing_cell_barcodes = os.path.join( - _data_dir, "cell-sorted-missing-cb.bam" -) - -# specify filenames for temporary metrics outputs that are used in the following tests -_gene_metric_output_file = os.path.join(_test_dir, "gene_metrics.csv.gz") -_cell_metric_output_file = os.path.join(_test_dir, "cell_metrics.csv.gz") -_cell_metric_output_file_missing_cell_barcodes = os.path.join( - _test_dir, "cell_metrics_missing_cb.csv.gz" -) - -# run the gene metrics suite -gene_gatherer = GatherGeneMetrics(_gene_sorted_bam, _gene_metric_output_file) -gene_gatherer.extract_metrics() -_gene_metrics = pd.read_csv(_gene_metric_output_file, index_col=0) - -# run the cell metrics suite -cell_gatherer = GatherCellMetrics(_cell_sorted_bam, _cell_metric_output_file) -cell_gatherer.extract_metrics() -_cell_metrics = pd.read_csv(_cell_metric_output_file, index_col=0) - -# run the cell metrics suite -cell_gatherer_missing_cbs = GatherCellMetrics( - _cell_sorted_bam_missing_cell_barcodes, - _cell_metric_output_file_missing_cell_barcodes, -) -cell_gatherer_missing_cbs.extract_metrics() -_cell_metrics_missing_cbs = pd.read_csv( - _cell_metric_output_file_missing_cell_barcodes, index_col=0 -) - - -def test_calculate_cell_metrics_cli(): - """test the sctools cell metrics CLI invocation""" - cell_metrics_csv = os.path.join(_test_dir, "cell_metrics.csv") - return_call = TenXV2.calculate_cell_metrics( - args=["-i", _cell_sorted_bam, "-o", cell_metrics_csv] - ) - assert 
return_call == 0 - - -def test_calculate_gene_metrics_cli(): - """test the sctools gene metrics CLI invocation""" - gene_metrics_csv = os.path.join(_test_dir, "gene_metrics.csv") - return_call = TenXV2.calculate_gene_metrics( - args=["-i", _gene_sorted_bam, "-o", gene_metrics_csv] - ) - assert return_call == 0 - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 300), (_cell_metrics, 656)] -) -def test_metrics_n_reads(metrics, expected_value): - """test that the metrics identify the correct read number""" - assert metrics["n_reads"].sum() == expected_value - - -def test_cell_metrics_mean_n_genes_observed(): - """ - test that the GatherCellMetrics method identifies the correct number of genes per cell, on - average. - """ - genes_observed = _cell_metrics["n_genes"].mean() - assert math.isclose(genes_observed, 1.9827, abs_tol=1e-4), "%f != %f" % ( - genes_observed, - 1.9827, - ) - - -def test_gene_metrics_n_genes(): - """Test that GatherGeneMetrics identifies the total number of genes in the test file""" - genes_observed = _gene_metrics.shape[0] - assert genes_observed == 8 - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 88), (_cell_metrics, 249)] -) -def test_metrics_n_molecules(metrics, expected_value): - """Test that each metric identifies the total number of molecules in the test file - - Molecules are defined as a unique combination of {cell barcode, molecule barcode, gene} - """ - molecules_observed = metrics["n_molecules"].sum() - assert molecules_observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 217), (_cell_metrics, 499)] -) -def test_metrics_n_fragments(metrics, expected_value): - """Test that each metric identifies the total number of fragments in the test file. 
- - Fragments are defined as a unique combination of {cell barcode, molecule barcode, strand, - position, chromosome} - """ - fragments_observed = metrics["n_fragments"].sum() - assert fragments_observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_gene_metrics, "AL627309.7"), (_cell_metrics, "AAACCTGGTAGAAGGA")], -) -def test_metrics_highest_expression_class(metrics, expected_value): - """ - for gene metrics, this is the highest expression gene. For cell metrics, this is the highest - expression cell. - """ - observed_max_gene = metrics["n_reads"].idxmax() - assert observed_max_gene == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 245), (_cell_metrics, 94)] -) -def test_metrics_highest_read_count(metrics, expected_value): - """ - Test that each metric identifies the what the highest read count associated with any single - entity - """ - observed_max_gene_reads = metrics["n_reads"].max() - assert observed_max_gene_reads == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [ - ( - _gene_metrics, - 300, - ), # todo this is 100%, we should mangle a few in the testing data - (_cell_metrics, 655), - ], -) -def test_metrics_number_perfect_molecule_barcodes(metrics, expected_value): - """Test that each metric correctly identifies the number of perfect molecule barcodes where UB == UR""" - observed_perfect_barcodes = metrics["perfect_molecule_barcodes"].sum() - assert observed_perfect_barcodes == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_cell_metrics, 650), (_cell_metrics_missing_cbs, 12861)], -) -def test_metrics_number_perfect_cell_barcodes(metrics, expected_value): - """Test that each metric correctly identifies the number of perfect cell barcodes where CB == CR""" - observed_perfect_cell_barcodes = metrics["perfect_cell_barcodes"].sum() - assert observed_perfect_cell_barcodes == expected_value - - 
-@pytest.mark.parametrize( - "metrics, expected_value", - [ - ( - _gene_metrics, - 300, - ), # todo this is 100%, should get some intronic or other reads - (_cell_metrics, 609), - ], -) -def test_reads_mapped_exonic(metrics, expected_value): - """Test that each metric identifies the number of reads mapped to an exon (XF=='CODING')""" - observed = metrics["reads_mapped_exonic"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_gene_metrics, 0), (_cell_metrics, 28)], # todo null case -) -def test_reads_mapped_intronic(metrics, expected_value): - """Test that each metric identifies the number of reads mapped to an intron (XF=='INTRONIC')""" - observed = metrics["reads_mapped_intronic"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [(_gene_metrics, 0), (_cell_metrics, 19)], # todo null case -) -def test_reads_mapped_utr(metrics, expected_value): - """Test that each metric identifies the number of reads mapped to a UTR (XF=='UTR')""" - observed = metrics["reads_mapped_utr"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", - [ - (_gene_metrics, 300), # todo need to include at least 1 multi-mapper - (_cell_metrics, 656), - ], -) -def test_reads_mapped_uniquely(metrics, expected_value): - """Uniquely mapping reads will be tagged with NH==1""" - observed = metrics["reads_mapped_uniquely"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 90), (_cell_metrics, 107)] -) -def test_duplicate_records(metrics, expected_value): - """Duplicate records are identified by the 1024 bit being set in the sam flag""" - observed = metrics["duplicate_reads"].sum() - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, expected_value", [(_gene_metrics, 29), (_cell_metrics, 2)] -) -def test_spliced_reads(metrics, expected_value): - """ - 
This pipeline defines spliced reads as containing an N segment of any length in the cigar string - """ - observed = metrics["spliced_reads"].sum() - assert observed == expected_value - - -# todo failing -# @pytest.mark.parametrize('metrics', [_gene_metrics, _cell_metrics]) -# def test_relationship_of_duplicates_and_fragments(metrics): -# """ -# We expect the number of duplicates and fragments to add up to the total number of reads. The -# rationale is that any read that is not a duplicate should be a distinct fragment, under our -# definitions. -# -# This fails because of (1) N-base and 2-base cell barcode correction errors and (2) -# fragment calculationes currently do not account for soft clipping. Fixing these will cause -# this test to pass -# """ -# dup_and_fragments = metrics['duplicate_reads'].sum() + metrics['n_fragments'].sum() -# reads = metrics['n_reads'].sum() -# assert reads == dup_and_fragments - - -@pytest.mark.parametrize("metrics", [_gene_metrics, _cell_metrics]) -def test_fragments_number_is_greater_than_molecule_number(metrics): - """ - There should always be more fragments than molecules, as the minimum definition of a molecule is - a fragment covered by a single read - """ - assert np.all(metrics["n_molecules"] >= 1) - assert np.all(metrics["n_fragments"] >= 1) - assert np.all(metrics["n_fragments"] >= metrics["n_molecules"]) - - -@pytest.mark.parametrize( - "metrics, key, expected_value", - [ - ( - _cell_metrics, - "molecule_barcode_fraction_bases_above_30_mean", - np.array( - [ - 1.0000, - 0.9500, - 1.0000, - 1.0000, - 0.9778, - 1.0000, - 1.0000, - 1.0000, - 0.9833, - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 0.9759, - 1.0000, - 1.0000, - 0.9830, - 1.0000, - 1.0000, - 1.0000, - 0.9778, - 0.9783, - 1.0000, - 0.9800, - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 0.9500, - 1.0000, - 0.9895, - 1.0000, - 0.9760, - 1.0000, - 1.0000, - 1.0000, - 0.9889, - 1.0000, - 0.9600, - 1.0000, - 0.9909, - 1.0000, - 1.0000, - 0.9556, - 0.9800, - 1.0000, - 
0.9000, - 1.0000, - 0.9588, - 1.0000, - 1.0000, - 0.9889, - 0.8000, - 0.9538, - 0.9909, - 0.9929, - 0.9571, - ] - ), - ), - # todo failing. Odd because mean is passing; catastrophic cancellation in the online method? - # other methods that use the variance estimator work just fine. Something about the gene issue - # that is identified by other methods below? - # (_cell_metrics, 'molecule_barcode_fraction_bases_above_30_variance', - # np.array( - # [np.nan, 0.0050, np.nan, np.nan, 0.0019, 0.0000, 0.0000, np.nan, 0.0015, np.nan, 0.0000, - # 0.0000, np.nan, 0.0000, 0.0048, 0.0000, 0.0000, 0.0029, 0.0000, np.nan, 0.0000, 0.0044, - # 0.0109, 0.0000, 0.0020, 0.0000, 0.0000, np.nan, 0.0000, 0.0100, np.nan, 0.0010, 0.0000, - # 0.0052, 0.0000, 0.0000, 0.0000, 0.0011, 0.0000, 0.0162, 0.0000, 0.0016, 0.0000, np.nan, - # 0.0178, 0.0020, np.nan, np.nan, 0.0000, 0.0163, np.nan, np.nan, 0.0011, np.nan, 0.0147, - # 0.0018, 0.0007, 0.0306])), - ( - _cell_metrics, - "genomic_reads_fraction_bases_quality_above_30_mean", - np.array( - [ - 0.3980, - 0.6786, - 0.5000, - 0.9796, - 0.7800, - 0.7811, - 0.9337, - 0.8469, - 0.6743, - 0.4565, - 0.8622, - 0.9762, - 0.4925, - 0.7857, - 0.7478, - 0.8561, - 0.6327, - 0.7948, - 0.8405, - 0.4286, - 0.7735, - 0.6445, - 0.7291, - 0.8520, - 0.6711, - 0.6123, - 0.8238, - 0.5000, - 0.8376, - 0.5137, - 0.7526, - 0.7584, - 0.7574, - 0.8379, - 0.8490, - 0.5000, - 0.5983, - 0.7489, - 0.7755, - 0.8107, - 0.6963, - 0.8363, - 0.8896, - 0.6186, - 0.7549, - 0.7151, - 1.0000, - 0.5306, - 0.8347, - 0.7340, - 0.8367, - 0.8878, - 0.7347, - 0.4592, - 0.7718, - 0.7583, - 0.8439, - 0.7576, - ] - ), - ), - ( - _cell_metrics, - "genomic_reads_fraction_bases_quality_above_30_variance", - np.array( - [ - np.nan, - 0.1812, - np.nan, - np.nan, - 0.0266, - 0.0461, - 0.0042, - np.nan, - 0.0387, - np.nan, - 0.0178, - 0.0000, - np.nan, - 0.0002, - 0.0455, - 0.0342, - 0.0588, - 0.0359, - 0.0247, - np.nan, - 0.0400, - 0.0436, - 0.0754, - 0.0005, - 0.1140, - 0.0617, - 0.0400, - 
np.nan, - 0.0230, - 0.0491, - np.nan, - 0.0608, - 0.0556, - 0.0367, - 0.0215, - 0.0860, - 0.2182, - 0.0564, - 0.0008, - 0.0395, - 0.0330, - 0.0433, - 0.0063, - np.nan, - 0.0366, - 0.0778, - np.nan, - np.nan, - 0.0114, - 0.0391, - np.nan, - np.nan, - 0.0193, - np.nan, - 0.0288, - 0.0444, - 0.0311, - 0.0558, - ] - ), - ), - ( - _cell_metrics, - "genomic_read_quality_mean", - np.array( - [ - 25.3776, - 32.5051, - 27.7755, - 39.9184, - 34.3639, - 34.5969, - 37.4592, - 35.9490, - 31.6345, - 26.5870, - 36.7500, - 39.5374, - 28.0896, - 33.7041, - 33.6079, - 36.2787, - 30.8472, - 34.8402, - 35.9327, - 24.7755, - 34.3603, - 31.0934, - 33.2880, - 36.7092, - 31.9647, - 30.2158, - 35.3956, - 27.6837, - 35.8674, - 27.4527, - 34.3918, - 33.7323, - 33.6425, - 35.9552, - 35.5694, - 27.4184, - 30.0479, - 33.4621, - 34.6633, - 35.2128, - 32.4619, - 35.7690, - 36.9963, - 30.0722, - 33.6353, - 32.6708, - 39.8721, - 28.0510, - 35.9388, - 33.1278, - 35.8265, - 36.6633, - 32.7188, - 26.6429, - 34.1053, - 34.0012, - 36.0956, - 33.7704, - ] - ), - ), - ( - _cell_metrics, - "genomic_read_quality_variance", - np.array( - [ - np.nan, - 92.5078, - np.nan, - np.nan, - 18.9818, - 29.9521, - 6.6724, - np.nan, - 25.4164, - np.nan, - 12.8541, - 0.3790, - np.nan, - 0.0019, - 28.7815, - 24.6669, - 37.7402, - 22.8765, - 16.5399, - np.nan, - 22.9679, - 26.2414, - 44.8249, - 0.5740, - 70.4607, - 42.5318, - 24.9536, - np.nan, - 14.0772, - 32.6389, - np.nan, - 38.1213, - 34.4094, - 23.2517, - 13.9110, - 48.9622, - 117.2337, - 32.9814, - 0.3850, - 24.3135, - 17.8765, - 26.5847, - 5.2099, - np.nan, - 22.5846, - 48.2133, - np.nan, - np.nan, - 5.6775, - 23.9395, - np.nan, - np.nan, - 12.9322, - np.nan, - 18.1475, - 29.6960, - 20.7504, - 34.9055, - ] - ), - ), - # todo right now the metrics count reads that have no 'gene' towards molecules, whereas - # the calculations in the notebook exclude them. We should decide which method we prefer. - # there may be further problems. 
- # (_cell_metrics, 'reads_per_molecule', - # np.array( - # [1.0000, 2.0000, np.nan, 1.0000, 9.0000, 2.4000, 2.0000, 1.0000, 3.0000, 1.0000, 3.0000, - # 3.0000, 1.0000, np.nan, 2.4167, 4.3333, 1.2222, 5.8750, 1.3333, 1.0000, 1.2000, 1.5000, - # 4.6000, 2.0000, 2.5000, 1.2000, 2.1429, 1.0000, 2.6364, 4.0000, 1.0000, 2.1111, 1.7273, - # 6.2500, 5.0000, 1.3333, 2.0000, 2.2500, np.nan, 2.0000, 4.3333, 3.9286, 2.2000, 1.0000, - # 1.5000, 1.6667, np.nan, 1.0000, 1.6667, 1.8889, 1.0000, 1.0000, 2.2500, 1.0000, 9.7500, - # 11.0000, 4.0000, 1.5000])), - ( - _cell_metrics, - "reads_per_fragment", - np.array( - [ - 1.0000, - 1.0000, - 1.0000, - 1.0000, - 1.1250, - 1.3333, - 2.0000, - 1.0000, - 1.2000, - 1.0000, - 1.2000, - 3.0000, - 1.0000, - 2.0000, - 1.3182, - 1.4444, - 1.1000, - 1.4688, - 1.1429, - 1.0000, - 1.2000, - 1.2857, - 1.5333, - 2.0000, - 1.2500, - 1.0000, - 1.1538, - 1.0000, - 1.3182, - 1.0000, - 1.0000, - 1.4615, - 1.3571, - 1.3158, - 1.2500, - 1.3333, - 1.0000, - 1.1250, - 1.0000, - 1.1765, - 1.0833, - 1.4103, - 1.1000, - 1.0000, - 1.2857, - 1.2500, - 1.0000, - 1.0000, - 1.2500, - 1.3077, - 1.0000, - 1.0000, - 1.2857, - 1.0000, - 1.3929, - 1.5714, - 1.4737, - 1.1053, - ] - ), - ), - # (_cell_metrics, 'fragments_per_molecule', # todo failure depends on above reads_per_molecule - # np.array( - # [1.0000, 2.0000, np.nan, 1.0000, 8.0000, 1.8000, 1.0000, 1.0000, 2.5000, 1.0000, 2.5000, - # 1.0000, 1.0000, np.nan, 1.8333, 3.0000, 1.1111, 4.0000, 1.1667, 1.0000, 1.0000, 1.1667, - # 3.0000, 1.0000, 2.0000, 1.2000, 1.8571, 1.0000, 2.0000, 4.0000, 1.0000, 1.4444, 1.2727, - # 4.7500, 4.0000, 1.0000, 2.0000, 2.0000, np.nan, 1.7000, 4.0000, 2.7857, 2.0000, 1.0000, - # 1.1667, 1.3333, np.nan, 1.0000, 1.3333, 1.4444, 1.0000, 1.0000, 1.7500, 1.0000, 7.0000, - # 7.0000, 2.7143, 1.3571])), - ( - _gene_metrics, - "molecule_barcode_fraction_bases_above_30_mean", - np.array([1.0000, 1.0000, 0.8000, 0.9885, 0.9833, 0.9857, 0.7000, 0.9444]), - ), - ( - _gene_metrics, - 
"molecule_barcode_fraction_bases_above_30_variance", - np.array([np.nan, np.nan, np.nan, 0.0011, 0.0051, 0.0014, np.nan, 0.0120]), - ), - ( - _gene_metrics, - "genomic_reads_fraction_bases_quality_above_30_mean", - np.array([0.8878, 0.3980, 0.4271, 0.8148, 0.7681, 0.7216, 0.1546, 0.5089]), - ), - ( - _gene_metrics, - "genomic_reads_fraction_bases_quality_above_30_variance", - np.array([np.nan, np.nan, np.nan, 0.0282, 0.0346, 0.0537, np.nan, 0.0849]), - ), - ( - _gene_metrics, - "genomic_read_quality_mean", - np.array( - [36.2143, 24.8469, 25.4792, 35.3664, 34.0956, 33.0364, 20.7423, 27.3078] - ), - ), - ( - _gene_metrics, - "genomic_read_quality_variance", - np.array( - [np.nan, np.nan, np.nan, 18.4553, 21.6745, 33.6572, np.nan, 53.5457] - ), - ), - ( - _gene_metrics, - "reads_per_molecule", - np.array([1.0000, 1.0000, 1.0000, 3.2500, 4.1525, 1.7500, 1.0000, 1.3846]), - ), - ( - _gene_metrics, - "reads_per_fragment", - np.array([1.0000, 1.0000, 1.0000, 1.7333, 1.3920, 1.4000, 1.0000, 1.0588]), - ), - ( - _gene_metrics, - "fragments_per_molecule", - np.array([1.0000, 1.0000, 1.0000, 1.8750, 2.9831, 1.2500, 1.0000, 1.3077]), - ), - ], -) -def test_higher_order_metrics_by_gene(metrics, key, expected_value): - """Test metrics that depend on other metrics - - This class tests a very large number of higher-order metrics that examine the functionality of - the test suite across all measured instances of the metric class. E.g. for cell metrics (class), - each test will verify the value for each cell (instance). - - Parameters - ---------- - metrics : pd.DataFrame - Output from subclass of sctools.metrics.MetricAggregator - key : str - The column of metrics to interrogate in the parametrized test - expected_value : np.ndarray - An array of expected values - - """ - # need to sort, metrics are not always in same order as results. 
- observed = sorted(np.nan_to_num(metrics[key].values).round(4)) - expected_value = sorted(np.nan_to_num(expected_value)) - assert observed == expected_value - - -@pytest.mark.parametrize( - "metrics, key, expected_value", - [ - # todo failing; suspect related to problem with how fragments are defined - # (_cell_metrics, 'fragments_with_single_read_evidence', 345), - # todo failing. Does not make sense that this would also be a fragment issue. - # (_cell_metrics, 'molecules_with_single_read_evidence', 130), - (_gene_metrics, "fragments_with_single_read_evidence", 155), - (_gene_metrics, "molecules_with_single_read_evidence", 42), - ], -) -def test_single_read_evidence(metrics, key, expected_value): - """ - We want to determine how many molecules and fragments are covered by only one read, as reads - covered by multiple reads have much lower probabilities of being the result of error processes. - """ - observed = metrics[key].sum() - assert observed == expected_value - - -def split_metrics_file(metrics_file): - """ - produces two mergeable on-disk metric files from a single file that contain the first 3/4 - of the file in the first output and the last 3/4 of the file in the second output, such that - 1/2 of the metrics in the two files overlap - """ - with fileinput.FileInput( - [metrics_file], mode="r", openhook=fileinput.hook_compressed - ) as f: - data = [line for line in f] - - header, data = data[0], data[1:] - - low_split, high_split = round(len(data) * 0.25), round(len(data) * 0.75) - file_1, file_2 = [_test_dir + "metrics_for_merging_%d.csv" % i for i in (1, 2)] - - with open(file_1, "wb") as f: - f.write(header + b"\n") - for line in data[:high_split]: - f.write(line + b"\n") - - with open(file_2, "wb") as f: - f.write(header + b"\n") - for line in data[low_split:]: - f.write(line + b"\n") - - return file_1, file_2 - - -@pytest.fixture -def mergeable_cell_metrics(): - return split_metrics_file(_cell_metric_output_file) - - -@pytest.fixture -def 
mergeable_gene_metrics(): - return split_metrics_file(_gene_metric_output_file) - - -def test_merge_cell_metrics_cli(mergeable_cell_metrics): - """test the sctools merge cell metrics CLI invocation""" - return_call = TenXV2.merge_cell_metrics( - args=["-o", _test_dir + "/merged-cell-metrics.csv.gz"] - + list(mergeable_cell_metrics) - ) - assert return_call == 0 - - -def test_merge_gene_metrics_cli(mergeable_gene_metrics): - """test the sctools merge gene metrics CLI invocation""" - return_call = TenXV2.merge_gene_metrics( - args=["-o", _test_dir + "/merged-gene-metrics.csv.gz"] - + list(mergeable_gene_metrics) - ) - assert return_call == 0 - - -def test_merge_cell_metrics_does_not_correct_duplicates(mergeable_cell_metrics): - """ - test takes offset cell metrics outputs and merges them. Cell metrics does not check for - duplication, so should return a 2x length file. - """ - output_file = os.path.join(_test_dir, "merged_metrics.csv.gz") - m = MergeCellMetrics(mergeable_cell_metrics, output_file) - m.execute() - - merged_data = pd.read_csv(output_file, index_col=0) - - input_sizes = [] - for f in mergeable_cell_metrics: - input_sizes.append(pd.read_csv(f, index_col=0).shape) - target_rows = sum(row for row, col in input_sizes) - - target_cols = input_sizes[0][1] # cols will always be the same - - assert merged_data.shape == (target_rows, target_cols) - - -def test_merge_gene_metrics_averages_over_multiply_detected_genes( - mergeable_gene_metrics, -): - output_file = os.path.join(_test_dir, "merged_metrics.csv.gz") - m = MergeGeneMetrics(mergeable_gene_metrics, output_file) - m.execute() - - merged_data = pd.read_csv(output_file, index_col=0) - - input_data = pd.read_csv(mergeable_gene_metrics[0], index_col=0) - target_cols = input_data.shape[1] - - input_genes = input_data.index - for f in mergeable_gene_metrics[1:]: - input_genes = input_genes.union(pd.read_csv(f, index_col=0).index) - target_rows = len(input_genes) - - assert merged_data.shape == (target_rows, 
target_cols), "%s" % repr(merged_data) - - -@pytest.mark.parametrize( - "bam, gatherer", - [(_gene_sorted_bam, GatherGeneMetrics), (_cell_sorted_bam, GatherCellMetrics)], -) -def test_gzip_compression(bam: str, gatherer: Callable): - """ - gzip compression should produce a .gz file which is identical when uncompressed to the - uncompressed version - """ - - gz_fout = _test_dir + "test_bam.csv.gz" - g: MetricGatherer = gatherer(bam, gz_fout, compress=True) - g.extract_metrics() - gz_metrics = pd.read_csv(gz_fout, index_col=0) - - fout = _test_dir + "test_bam.csv" - g: MetricGatherer = gatherer(bam, fout, compress=False) - g.extract_metrics() - metrics = pd.read_csv(fout, index_col=0) - - assert np.allclose(gz_metrics.fillna(0).values, metrics.fillna(0).values) diff --git a/tools/scripts/sctools/src/sctools/test/test_platform.py b/tools/scripts/sctools/src/sctools/test/test_platform.py deleted file mode 100644 index e18e0cd8..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_platform.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import tempfile -import pysam - -from .. 
import platform - -data_dir = os.path.split(__file__)[0] + "/data/" - - -def test_attach_barcodes(): - """High-level test of the AttachBarcodes command""" - - temp_dir_name = tempfile.mkdtemp() - - # Construct cli arguments to pass to the command - temp_output_bam = temp_dir_name + "output.bam" - - args = [ - "--r1", - data_dir + "test_r1.fastq", - "--u2", - data_dir + "test_r2.bam", - "--i1", - data_dir + "test_i1.fastq", - "--o", - temp_output_bam, - "--sample-barcode-start-pos", - "0", - "--sample-barcode-length", - "8", - "--cell-barcode-start-pos", - "0", - "--cell-barcode-length", - "16", - "--molecule-barcode-start-pos", - "16", - "--molecule-barcode-length", - "4", - ] - - platform.BarcodePlatform.attach_barcodes(args) - - with pysam.AlignmentFile(temp_output_bam, "rb", check_sq=False) as samfile: - for read in samfile: - tag_cr = read.get_tag("CR") - tag_cy = read.get_tag("CY") - tag_ur = read.get_tag("UR") - tag_uy = read.get_tag("UY") - tag_sr = read.get_tag("SR") - tag_sy = read.get_tag("SY") - assert len(tag_cr) == 16 - assert len(tag_cy) == 16 - assert len(tag_ur) == 4 - assert len(tag_uy) == 4 - assert len(tag_sr) == 8 - assert len(tag_sy) == 8 diff --git a/tools/scripts/sctools/src/sctools/test/test_stats.py b/tools/scripts/sctools/src/sctools/test/test_stats.py deleted file mode 100644 index c59d8f98..00000000 --- a/tools/scripts/sctools/src/sctools/test/test_stats.py +++ /dev/null @@ -1,21 +0,0 @@ -from .. 
import stats - - -def test_concentrated_data_produces_entropy_0(): - entropy = stats.base4_entropy([1, 0, 0, 0], axis=0) - assert entropy == 0 - - -def test_concentrated_unnormalized_data_produces_entropy_0(): - entropy = stats.base4_entropy([1000, 0, 0, 0], axis=0) - assert entropy == 0 - - -def test_balanced_data_produces_entropy_1(): - entropy = stats.base4_entropy([0.25, 0.25, 0.25, 0.25], axis=0) - assert entropy == 1 - - -def test_balanced_unnormalized_data_produces_entropy_1(): - entropy = stats.base4_entropy([20, 20, 20, 20], axis=0) - assert entropy == 1