diff --git a/.bumpversion.cfg b/.bumpversion.cfg index c06d8c0..4e546f6 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.1 +current_version = 0.3.2 [bumpversion:file:setup.py] diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 7b7fd37..6309121 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -15,7 +15,7 @@ Types of Contributions Report Bugs ~~~~~~~~~~~ -Report bugs at https://github.com/jrderuiter/imfusion/issues. +Report bugs at https://github.com/nki-ccb/imfusion/issues. If you are reporting a bug, please include: @@ -46,7 +46,7 @@ Submit Feedback ~~~~~~~~~~~~~~~ The best way to send feedback is to file an issue at -https://github.com/jrderuiter/imfusion/issues. +https://github.com/nki-ccb/imfusion/issues. If you are proposing a feature: @@ -105,5 +105,5 @@ Before you submit a pull request, check that it meets these guidelines: your new functionality into a function with a docstring, and add the feature to the list in README.rst. 3. The pull request should work for Python 2.7, 3.4 and 3.5. Check - https://travis-ci.org/jrderuiter/imfusion/pull_requests + https://travis-ci.org/nki-ccb/imfusion/pull_requests and make sure that the tests pass for all supported Python versions. diff --git a/HISTORY.rst b/HISTORY.rst index b557ab6..2cbcbf9 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,13 @@ History ======= +0.3.2 (2017-05-11) +------------------ + +* Properly added star-fusion support to star aligner (was previously not + fully merged). +* Changed documentation URLs to new repository. + 0.3.1 (2017-05-09) ------------------ diff --git a/Makefile b/Makefile index f65c3ba..5e7f1e0 100644 --- a/Makefile +++ b/Makefile @@ -82,7 +82,7 @@ dist: clean ## builds source and wheel package python setup.py sdist bdist_wheel conda: clean-pyc ## build a conda release - conda build --python 3.5 -c bioconda -c r -c jrderuiter conda + conda build --python 3.5 -c bioconda conda conda-docker: clean-pyc docker run -v `pwd`:/imfusion -t -i condaforge/linux-anvil /bin/sh -c 'cd /imfusion && ./scripts/conda_build_docker.sh' diff --git a/README.rst b/README.rst index e0a5e6d..f855ebf 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,3 @@ -.. image:: https://img.shields.io/travis/jrderuiter/imfusion/develop.svg - :target: https://travis-ci.org/jrderuiter/imfusion - -.. image:: https://img.shields.io/coveralls/jrderuiter/imfusion/develop.svg - :target: https://coveralls.io/github/jrderuiter/imfusion - IM-Fusion ========= @@ -42,12 +36,14 @@ Documentation ============= IM-Fusion's documentation is available at -`jrderuiter.github.io/imfusion `_. +`nki-ccb.github.io/imfusion `_. References ========== -de Ruiter, JR. *et al.*, 2017. **"Identifying transposon insertions and -their effects from RNA-sequencing data"** (*Under revision*). + +de Ruiter J.R., Kas S.M. *et al.* **"Identifying transposon insertions and their +effects from RNA-sequencing data"** Nucleic Acids Research 2017, *in press*. + License ======= diff --git a/conda/meta.yaml b/conda/meta.yaml index 27ff977..76d7565 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,6 +1,6 @@ package: name: imfusion - version: 0.3.1 + version: 0.3.2 build: number: 0 @@ -71,7 +71,7 @@ test: - featureCounts -v about: - home: https://github.com/jrderuiter/imfusion + home: https://github.com/nki-ccb/imfusion license: MIT summary: "IM-Fusion - Tool for identifying transposon insertions and their effects from RNA-sequencing data" diff --git a/docs/conf.py b/docs/conf.py index b0b98bd..bca1a68 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -255,8 +255,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [(master_doc, 'im-fusion', u'IM-Fusion Documentation', [author], - 1)] +man_pages = [(master_doc, 'imfusion', u'IM-Fusion Documentation', [author], 1)] # If true, show URL addresses after external links. #man_show_urls = False diff --git a/docs/extras.rst b/docs/extras.rst index 3c23980..7aebe14 100644 --- a/docs/extras.rst +++ b/docs/extras.rst @@ -72,7 +72,7 @@ that this script does expect the required dependencies to be installed (FusionFilter, Repeatmasker and Blast). .. _FusionFilter wiki: https://github.com/FusionFilter/FusionFilter/wiki/Building-a-Custom-FusionFilter-Dataset -.. _Python script: https://github.com/jrderuiter/imfusion/blob/develop/scripts/starfusion_build_reference.py +.. _Python script: https://github.com/nki-ccb/imfusion/blob/develop/scripts/starfusion_build_reference.py Identifying fusions ~~~~~~~~~~~~~~~~~~~ diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 119b705..11438bc 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,7 +1,6 @@ Getting started =============== - Overview -------- @@ -39,6 +38,10 @@ supporting code is provided for interactive analyses (such as plotting differential expression) and manually running certain steps of the pipeline. For more details, see :doc:`api`. +For the STAR aligner, we also support identifying endogenous gene fusions +using STAR-Fusion. See :doc:`extras` for more details about this type of +analysis. + Required files -------------- @@ -65,6 +68,8 @@ be either ‘SD’ or ‘SA’ for splice-donor or splice-acceptor sites respect The field may also be left empty for other types of features, however these features will not be used by IM-Fusion. +For generating the exon-level expression counts, IM-Fusion needs a +flattened exon representation of the reference gene features (in GTF format). Example reference files ----------------------- diff --git a/docs/index.rst b/docs/index.rst index b7b7d2c..b2af8c1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,9 +29,19 @@ IM-Fusion has the following key features: single insertion in a specific sample, or determine the general effect of insertions on a given gene within the tumor cohort. +Additionally, by integrating STAR-Fusion into its STAR-based insertion +identification pipeline, IM-Fusion enables simultaneous identification of +transposon insertions and endogenous gene fusions. As shown in our manuscript, +this approach can identify endogenous gene-fusions driving tumorigenesis, which +are impossible to identify using targeted DNA-based transposon-sequencing +approaches. + +.. _STAR-Fusion: https://github.com/STAR-Fusion/STAR-Fusion/wiki + For more details on the approach and a comparison with existing DNA-sequencing approaches, please see our paper **"Identifying transposon insertions and -their effects from RNA-sequencing data"** (*Currently under revision*). +their effects from RNA-sequencing data"** (Nucleic Acids Research 2017, +*in press*). .. toctree:: :maxdepth: 2 @@ -41,6 +51,7 @@ their effects from RNA-sequencing data"** (*Currently under revision*). installation getting_started usage + extras api extras contributing diff --git a/docs/usage.rst b/docs/usage.rst index 96ce735..f4002da 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -14,7 +14,7 @@ format) into a new Fasta file and builds the indices needed for alignment. Separate sub-commands are provided for each supported aligner (currently STAR and Tophat-Fusion). -The basic command for building a (STAR-based) reference is as follows: +The basic command for building a STAR-based reference is as follows: .. code:: bash @@ -90,7 +90,7 @@ reference, whilst the ``--output_dir`` argument specifies where the sample output should be written. An optional ``--assemble`` argument indicates whether IM-Fusion should perform -a reference-guided transcript assembly. If given, IM-Fusion runs Stringtie +a reference-guided transcript assembly. If given, IM-Fusion runs StringTie after the RNA-seq alignment to detect novel gene transcripts based on the RNA-seq alignment. The results of this assembly are subsequently used in the insertion detection step to annotate insertions that involve novel transcripts. @@ -105,10 +105,11 @@ The command for using Tophat-Fusion is nearly identical: --output_dir output/sample_s1 \ --tophat_threads 4 -However, both aligners do have some aligner-specific arguments concerning the -alignment. See the help of the respective sub-commands for more details. For -STAR, special attention should be paid to memory usage, as STAR requires -approximately 30GB of memory (per process) for loading the reference genome. +However, as was the case when building the reference genomes, both aligners do +have some aligner-specific arguments concerning the alignment. See the help of +the respective sub-commands for more details. Again, for STAR special +attention should be paid to memory usage, as STAR requires approximately +30GB of memory for loading the reference genome. Quantifying expression (per sample) ----------------------------------- diff --git a/scripts/starfusion_build_reference.py b/scripts/starfusion_build_reference.py index 261b9a6..e68631c 100644 --- a/scripts/starfusion_build_reference.py +++ b/scripts/starfusion_build_reference.py @@ -21,28 +21,41 @@ def main(): tmp_dir.mkdir(parents=True, exist_ok=False) # Filter the patch chromosomes from the gtf, as these are - # likely not present in the fasta. - logging.info('- Generating cDNA sequences') + # likely not present in the fasta. Note we also filter rows without + # a transcript_id, as these seem to be problematic for STAR-Fusion. + logging.info('- Filtering GTF file') gtf_path = tmp_dir / 'ref.gtf' + tmp_gtf_path = gtf_path.with_suffix('.gtf.tmp') + + with tmp_gtf_path.open('wb') as file_: + check_call( + ['grep', '-v', r'^\(MG\|JH\|GL\)', str(args.gtf)], stdout=file_) + with gtf_path.open('wb') as file_: - check_call(['grep', '-v', r'^\(MG\|JH\|GL\)', str(args.gtf)], - stdout=file_) + check_call(['grep', 'transcript_id', str(tmp_gtf_path)], stdout=file_) + + tmp_gtf_path.unlink() # Create cDNA_seqs file. + logging.info('- Generating cDNA sequences') + cdna_path = tmp_dir / 'cDNA_seqs.fa' with cdna_path.open('wb') as file_: script_path = args.ff_path / 'util' / 'gtf_file_to_cDNA_seqs.pl' - check_call(['perl', str(script_path), str(gtf_path), str(args.fasta)], - stdout=file_) + check_call( + ['perl', str(script_path), str(gtf_path), str(args.fasta)], + stdout=file_) # Build masked cDNA_seqs file using RepeatMasker. # Note: requires library to be installed from http://www.girinst.org. logging.info('- Masking repeats') masked_path = cdna_path.with_suffix('.fa.masked') - check_call([str(args.rm_path / 'RepeatMasker'), '-pa', str(args.threads), '-s', - '-species', 'mouse', '-xsmall', str(cdna_path)]) + check_call([ + str(args.rm_path / 'RepeatMasker'), '-pa', str(args.threads), '-s', + '-species', 'mouse', '-xsmall', str(cdna_path) + ]) # Create blastpairs. logging.info('- Creating blast pairs') @@ -51,37 +64,35 @@ def main(): pair_path = tmp_dir / 'blast_pairs.outfmt6' with pair_path.open('wb') as file_: - check_call(['blastn', - '-query', str(cdna_path), - '-db', str(masked_path), - '-max_target_seqs', '10000', - '-outfmt', '6', - '-evalue', '1e-3', - '-lcase_masking', - '-num_threads', str(args.threads), - '-word_size', '11'], - stdout=file_) + check_call( + [ + 'blastn', '-query', str(cdna_path), '-db', str(masked_path), + '-max_target_seqs', '10000', '-outfmt', '6', '-evalue', '1e-3', + '-lcase_masking', '-num_threads', str(args.threads), + '-word_size', '11' + ], + stdout=file_) pair_gz_path = pair_path.with_suffix('.gene_syms.outfmt6.gz') with gzip.open(str(pair_gz_path), 'wb') as file_: script_path = (args.ff_path / 'util' / 'blast_outfmt6_replace_trans_id_w_gene_symbol.pl') - check_call(['perl', str(script_path), str(cdna_path), str(pair_path)], - stdout=file_) + check_call( + ['perl', str(script_path), str(cdna_path), str(pair_path)], + stdout=file_) # Prepare library. logging.info('- Preparing library') - script_path = args.ff_path / 'util' / 'prep_genome_lib.pl' - check_call(['perl', str(script_path), - '--genome_fa', str(args.fasta), - '--gtf', str(gtf_path), - '--blast_pairs', str(pair_gz_path), - '--cdna_fa', str(cdna_path), - '--CPU', str(args.threads), - '--max_readlength', str(args.read_length), - '--output_dir', str(args.output_dir)]) - - # shutil.rmtree(str(args.tmp_dir)) + script_path = args.ff_path / 'prep_genome_lib.pl' + check_call([ + 'perl', str(script_path), '--genome_fa', str(args.fasta), '--gtf', + str(gtf_path), '--blast_pairs', str(pair_gz_path), '--cdna_fa', + str(cdna_path), '--CPU', str(args.threads), '--max_readlength', + str(args.read_length), '--output_dir', str(args.output_dir) + ]) + + shutil.rmtree(str(args.tmp_dir)) + def parse_args(): """Parses command line arguments.""" @@ -91,7 +102,7 @@ def parse_args(): parser.add_argument('--fasta', required=True, type=Path) parser.add_argument('--gtf', required=True, type=Path) parser.add_argument('--output_dir', required=True, type=Path) - + parser.add_argument('--ff_path', required=False, default='', type=Path) parser.add_argument('--rm_path', required=False, default='', type=Path) diff --git a/setup.py b/setup.py index cefc7ae..0267a62 100644 --- a/setup.py +++ b/setup.py @@ -27,12 +27,12 @@ setuptools.setup( name='imfusion', - version='0.3.1', + version='0.3.2', description=('Tool for identifying transposon insertions in ' 'Insertional Mutagenesis screens from gene-transposon ' 'fusions using single- and paired-end RNA-sequencing data.'), long_description=README + '\n\n' + HISTORY, - url='https://github.com/jrderuiter/im-fusion', + url='https://github.com/nki-ccb/imfusion', author='Julian de Ruiter', author_email='julianderuiter@gmail.com', license='MIT license', diff --git a/src/imfusion/__init__.py b/src/imfusion/__init__.py index fc8c8ee..f6c0fc7 100644 --- a/src/imfusion/__init__.py +++ b/src/imfusion/__init__.py @@ -2,4 +2,4 @@ __author__ = 'Julian de Ruiter' __email__ = 'julianderuiter@gmail.com' -__version__ = '0.3.1' +__version__ = '0.3.2' diff --git a/src/imfusion/build/indexers/base.py b/src/imfusion/build/indexers/base.py index cd1c53b..0fb00e3 100644 --- a/src/imfusion/build/indexers/base.py +++ b/src/imfusion/build/indexers/base.py @@ -249,7 +249,10 @@ def configure_args(cls, parser): help='Path to the transposon features (tsv).') base_group.add_argument( - '--output_dir', type=pathlib.Path, required=True) + '--output_dir', + type=pathlib.Path, + required=True, + help='Path to write the built reference.') # Optional blacklist arguments. blacklist_group = parser.add_argument_group('Blacklist arguments') diff --git a/src/imfusion/external/__init__.py b/src/imfusion/external/__init__.py index e69de29..40a96af 100644 --- a/src/imfusion/external/__init__.py +++ b/src/imfusion/external/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/src/imfusion/external/star_fusion.py b/src/imfusion/external/star_fusion.py new file mode 100644 index 0000000..7259470 --- /dev/null +++ b/src/imfusion/external/star_fusion.py @@ -0,0 +1,24 @@ +"""Module containing functions for calling star-fusion.""" + +from .util import run_command + + +def star_fusion(junction_path, reference_path, output_dir=None, log_path=None): + """Identifies endogenous fusions from an existing STAR alignment. + + Parameters + ---------- + reference_path : pathlib.Path + Path to the reference genome. + out_base_path : pathlib.Path + Base output path for the built index. + log_path : pathlib.Path + Where to write the log output. + + """ + + args = [ + 'STAR-Fusion', '--genome_lib_dir', str(reference_path), '-J', + str(junction_path), '--output_dir', str(output_dir) + ] + run_command(args=args, log_path=log_path) diff --git a/src/imfusion/insertions/aligners/base.py b/src/imfusion/insertions/aligners/base.py index 5ee7511..f9a6772 100644 --- a/src/imfusion/insertions/aligners/base.py +++ b/src/imfusion/insertions/aligners/base.py @@ -98,7 +98,7 @@ def configure_args(cls, parser): type=pathlib.Path, required=True, help='Path to the index of the augmented reference ' - 'generated by im-fusion build.') + 'generated by imfusion-build.') base_group.add_argument( '--output_dir', diff --git a/src/imfusion/insertions/aligners/star.py b/src/imfusion/insertions/aligners/star.py index 5471feb..50511bb 100644 --- a/src/imfusion/insertions/aligners/star.py +++ b/src/imfusion/insertions/aligners/star.py @@ -13,13 +13,17 @@ from typing import Any import re +from future.utils import native_str from intervaltree import IntervalTree import numpy as np import pandas as pd +from pathlib2 import Path +import pysam import toolz from imfusion.build.indexers.star import StarReference from imfusion.external.star import star_align +from imfusion.external.star_fusion import star_fusion from imfusion.external.stringtie import stringtie_assemble from imfusion.external.compound import sort_bam from imfusion.external.util import which, parse_arguments @@ -114,10 +118,17 @@ class StarAligner(Aligner): distance of each other to be merged. The value should be chosen to reflect the expected or emprical insert size. max_junction_dist : int - Maxixmum distance within which groups of spanning mates are assigned + Maximum distance within which groups of spanning mates are assigned to a junction fusion (which is supported by split reads, so that its exact position is known). Groups that cannot be assigned to a junction fusion are considered to arise from a separate insertion. + star_fusion_ref_path : Path + Path to a STAR-Fusion reference. If given, STAR-Fusion is used to + identify endogenous gene fusions from IM-Fusions alignment. Identified + gene fusions are written to a gene_fusions.txt file in the output + directory. Note that the STAR-Fusion reference should use the same + reference genome as IM-Fusions reference to avoid compatability issues. + Requires STAR-Fusion to be installed. """ @@ -136,7 +147,8 @@ def __init__( external_sort=False, # type: bool merge_junction_dist=10, # type: int max_spanning_dist=300, # type: int - max_junction_dist=10000 # type: int + max_junction_dist=10000, # type: int + star_fusion_ref_path=None # type: Path ): # type: (...) -> None super().__init__(reference=reference, logger=logger) @@ -156,6 +168,8 @@ def __init__( self._filter_orientation = filter_orientation self._filter_blacklist = filter_blacklist + self._star_fusion_ref_path = star_fusion_ref_path + @property def dependencies(self): """External dependencies required by aligner.""" @@ -165,6 +179,9 @@ def dependencies(self): if self._assemble: programs += ['stringtie'] + if self._star_fusion_ref_path is not None: + programs += ['STAR-Fusion'] + return programs def identify_insertions(self, fastq_path, output_dir, fastq2_path=None): @@ -195,9 +212,21 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None): # Perform alignment using STAR. alignment_path = output_dir / 'alignment.bam' + star_dir = output_dir / '_star' + if not alignment_path.exists(): self._logger.info('Performing alignment using STAR') - self._align(fastq_path, output_dir, fastq2_path=fastq2_path) + + self._align( + fastq_path=fastq_path, + output_dir=star_dir, + fastq2_path=fastq2_path) + + path.symlink_relative( + src_path=star_dir / 'Aligned.sortedByCoord.out.bam', + dest_path=alignment_path) + + pysam.index(native_str(alignment_path)) else: self._logger.info('Using existing STAR alignment') @@ -222,12 +251,11 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None): else: assembled_path = None - # Extract identified fusions. + # Extract identified fusions and corresponding insertions. self._logger.info('Extracting gene-transposon fusions') - fusion_path = output_dir / 'fusions.out' - fusions = list(self._extract_fusions(fusion_path)) + junction_path = star_dir / 'Chimeric.out.junction' + fusions = list(self._extract_fusions(junction_path)) - # Extract insertions. self._logger.info('Summarizing insertions') insertions = list( util.extract_insertions( @@ -238,21 +266,30 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None): ffpm_fastq_path=fastq_path, chromosomes=None)) - # self._logger.info('Filtering insertions') insertions = util.filter_insertions( insertions, features=self._filter_features, orientation=self._filter_orientation, blacklist=self._filter_blacklist) + # Identify endogenous fusions with STAR if requested. + if self._star_fusion_ref_path is not None: + self._logger.info('Identifying gene fusions using STAR-Fusion') + star_fusion_dir = output_dir / '_star_fusion' + star_fusion( + junction_path, + self._star_fusion_ref_path, + output_dir=star_fusion_dir) + + path.symlink_relative( + src_path=star_fusion_dir / + 'star-fusion.fusion_candidates.final.abridged', + dest_path=output_dir / 'gene_fusions.txt') + for insertion in insertions: yield insertion def _align(self, fastq_path, output_dir, fastq2_path=None): - # Put output into subdirectory, we will symlink the - # expected outputs from STAR later. - star_dir = output_dir / '_star' - # Gather default arguments. sort_type = 'Unsorted' if self._external_sort else 'SortedByCoordinate' @@ -269,26 +306,17 @@ def _align(self, fastq_path, output_dir, fastq2_path=None): fastq_path=fastq_path, fastq2_path=fastq2_path, index_path=self._reference.index_path, - output_dir=star_dir, + output_dir=output_dir, extra_args=toolz.merge(args, self._extra_args)) # If not yet sorted, sort bam file using samtools/sambamba. - sorted_bam_path = star_dir / 'Aligned.sortedByCoord.out.bam' + sorted_bam_path = output_dir / 'Aligned.sortedByCoord.out.bam' if sort_type == 'Unsorted': - unsorted_bam_path = star_dir / 'Aligned.out.bam' + unsorted_bam_path = output_dir / 'Aligned.out.bam' sort_bam(unsorted_bam_path, sorted_bam_path, threads=self._threads) unsorted_bam_path.unlink() - # Symlink fusions and alignment into expected location. - dest_bam_path = output_dir / 'alignment.bam' - path.symlink_relative( - src_path=sorted_bam_path, dest_path=dest_bam_path) - - path.symlink_relative( - src_path=star_dir / 'Chimeric.out.junction', - dest_path=output_dir / 'fusions.out') - def _extract_fusions(self, fusion_path): # Read chimeric junction data. chimeric_data = read_chimeric_junctions(fusion_path) @@ -322,32 +350,103 @@ def configure_args(cls, parser): super().configure_args(parser) star_group = parser.add_argument_group('STAR arguments') - star_group.add_argument('--star_threads', type=int, default=1) - star_group.add_argument('--star_min_flank', type=int, default=12) star_group.add_argument( - '--star_external_sort', default=False, action='store_true') - star_group.add_argument('--star_args', default='') + '--star_threads', + type=int, + default=1, + help='Number of threads to use when running STAR.') - star_group.add_argument('--merge_junction_dist', default=10, type=int) - star_group.add_argument('--max_spanning_dist', default=300, type=int) - star_group.add_argument('--max_junction_dist', default=10000, type=int) + star_group.add_argument( + '--star_min_flank', + type=int, + default=12, + help=('Minimum mapped length of the two segments ' + 'on each side of the fusion.')) + + star_group.add_argument( + '--star_external_sort', + default=False, + action='store_true', + help=('Use samtools/sambamba for sorting, rather than STAR. ' + 'Takes longer, but results in lower memory usage for ' + 'large bam files.')) + + star_group.add_argument( + '--star_args', default='', help='Additional args to pass to STAR.') + + star_group.add_argument( + '--merge_junction_dist', + default=10, + type=int, + help=('Maximum distance within which fusions supported by split ' + 'reads (overlapping the junction, so that the exact ' + 'breakpoint is known) are merged. This merging avoids ' + 'calling multiple fusions due to slight variations in the ' + 'alignment, although this value should not be chosen ' + 'too large to avoid merging distinct insertions.')) + + star_group.add_argument( + '--max_spanning_dist', + default=300, + type=int, + help=( + 'Maximum distance within which spanning mate pairs (mates ' + 'that do not overlap the fusion junction) are grouped when ' + 'summarizing spanning chimeric reads. Both mates from two pairs' + ' need to be within this distance of each other to be merged. ' + 'The value should be chosen to reflect the expected or ' + 'emprical insert size.')) + star_group.add_argument( + '--max_junction_dist', + default=10000, + type=int, + help=('Maximum distance within which groups of spanning mates are ' + 'assigned to a junction fusion (which is supported by split ' + 'reads, so that its exact position is known). Groups that ' + 'cannot be assigned to a junction fusion are considered to ' + 'arise from a separate insertion.')) assemble_group = parser.add_argument_group('Assembly') assemble_group.add_argument( - '--assemble', default=False, action='store_true') + '--assemble', + default=False, + action='store_true', + help='Perform de-novo transcript assembly using StringTie.') filt_group = parser.add_argument_group('Filtering') filt_group.add_argument( '--no_filter_orientation', dest='filter_orientation', default=True, - action='store_false') + action='store_false', + help=('Don\'t filter fusions with transposon features and genes ' + 'in opposite (incompatible) orientations.')) + filt_group.add_argument( '--no_filter_feature', dest='filter_features', default=True, - action='store_false') - filt_group.add_argument('--blacklisted_genes', nargs='+') + action='store_false', + help=('Don\'t filter fusions with non-SA/SD features.')) + + filt_group.add_argument( + '--blacklisted_genes', + nargs='+', + help='Blacklisted genes to filter.') + + sf_group = parser.add_argument_group('STAR-Fusion') + sf_group.add_argument( + '--star_fusion_reference', + type=Path, + default=None, + help=( + 'Path to a STAR-Fusion reference. If given, STAR-Fusion is ' + 'used to identify endogenous gene fusions from IM-Fusions ' + 'alignment. Identified gene fusions are written to a ' + 'gene_fusions.txt file in the output directory. Note that the ' + 'STAR-Fusion reference should use the same reference genome as ' + 'IM-Fusions reference to avoid compatability issues.' + 'Requires STAR-Fusion to be installed.')) @classmethod def _parse_args(cls, args): @@ -363,7 +462,8 @@ def _parse_args(cls, args): max_junction_dist=args.max_junction_dist, filter_features=args.filter_features, filter_orientation=args.filter_orientation, - filter_blacklist=args.blacklisted_genes) + filter_blacklist=args.blacklisted_genes, + star_fusion_ref_path=args.star_fusion_reference) return toolz.merge(super()._parse_args(args), kws) diff --git a/src/imfusion/insertions/aligners/tophat.py b/src/imfusion/insertions/aligners/tophat.py index e7a0b75..634713d 100644 --- a/src/imfusion/insertions/aligners/tophat.py +++ b/src/imfusion/insertions/aligners/tophat.py @@ -6,9 +6,10 @@ from builtins import * # pylint: enable=wildcard-import,redefined-builtin,unused-wildcard-import -from pathlib2 import Path - +from future.utils import native_str import pandas as pd +from pathlib2 import Path +import pysam import toolz from imfusion.build.indexers.tophat import TophatReference @@ -104,6 +105,7 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None): if not alignment_path.exists(): self._logger.info('Performing alignment using Tophat2') self._align(fastq_path, output_dir, fastq2_path=fastq2_path) + pysam.index(native_str(alignment_path)) else: self._logger.info('Using existing Tophat2 alignment') @@ -200,26 +202,52 @@ def configure_args(cls, parser): super().configure_args(parser) group = parser.add_argument_group('Tophat2 arguments') - group.add_argument('--tophat_threads', type=int, default=1) - group.add_argument('--tophat_min_flank', type=int, default=12) - group.add_argument('--tophat_args', type=parse_arguments, default='') + group.add_argument( + '--tophat_threads', + type=int, + default=1, + help='Number of threads to use when running Tophat2.') + + group.add_argument( + '--tophat_min_flank', + type=int, + default=12, + help=('Minimum mapped length of the two segments ' + 'on each side of the fusion.')) + + group.add_argument( + '--tophat_args', + type=parse_arguments, + default='', + help='Additional args to pass to Tophat2.') assemble_group = parser.add_argument_group('Assembly') assemble_group.add_argument( - '--assemble', default=False, action='store_true') + '--assemble', + default=False, + action='store_true', + help='Perform de-novo transcript assembly using StringTie.') filt_group = parser.add_argument_group('Filtering') filt_group.add_argument( '--no_filter_orientation', dest='filter_orientation', default=True, - action='store_false') + action='store_false', + help=('Don\'t filter fusions with transposon features and genes ' + 'in opposite (incompatible) orientations.')) + filt_group.add_argument( '--no_filter_feature', dest='filter_features', default=True, - action='store_false') - filt_group.add_argument('--blacklisted_genes', nargs='+') + action='store_false', + help=('Don\'t filter fusions with non-SA/SD features.')) + + filt_group.add_argument( + '--blacklisted_genes', + nargs='+', + help='Blacklisted genes to filter.') @classmethod def _parse_args(cls, args): diff --git a/src/imfusion/main/ctg.py b/src/imfusion/main/ctg.py index ccedafe..db828a5 100644 --- a/src/imfusion/main/ctg.py +++ b/src/imfusion/main/ctg.py @@ -105,7 +105,7 @@ def parse_args(): required=True, type=Path, help='Path to the merged insertions file from ' - 'im-fusion merge.') + 'imfusion-merge.') base_group.add_argument( '--reference', @@ -179,7 +179,7 @@ def parse_args(): default=None, type=Path, help='Path to the merged expression file from ' - 'im-fusion merge.') + 'imfusion-merge.') de_group.add_argument( '--de_threshold', diff --git a/src/imfusion/main/expression.py b/src/imfusion/main/expression.py index 7b96f54..99c2e38 100644 --- a/src/imfusion/main/expression.py +++ b/src/imfusion/main/expression.py @@ -71,7 +71,7 @@ def parse_args(): type=Path, required=True, help='Path to the index of the augmented reference ' - 'generated by im-fusion build.') + 'generated by imfusion-build.') parser.add_argument('--output', type=Path, default=None) diff --git a/src/imfusion/main/merge.py b/src/imfusion/main/merge.py index ac28afc..906e471 100644 --- a/src/imfusion/main/merge.py +++ b/src/imfusion/main/merge.py @@ -16,7 +16,7 @@ def main(): - """Main function of im-fusion-merge.""" + """Main function of imfusion-merge.""" args = _parse_args() @@ -37,7 +37,7 @@ def main(): def _parse_args(): - """Parses command-line arguments for im-fusion-merge.""" + """Parses command-line arguments for imfusion-merge.""" parser = argparse.ArgumentParser() diff --git a/tests/imfusion/insertions/aligners/test_star.py b/tests/imfusion/insertions/aligners/test_star.py index fe4dada..e8ceb9f 100644 --- a/tests/imfusion/insertions/aligners/test_star.py +++ b/tests/imfusion/insertions/aligners/test_star.py @@ -248,6 +248,8 @@ def test_identify_insertions(self, read_paths, star_reference, star_mock = mocker.patch.object(star, 'star_align') mocker.patch.object(star.util, 'count_lines', return_value=8e6) + mocker.patch.object(star.pysam, 'index') + # Call identify insertions. fastq, fastq2 = read_paths diff --git a/tests/imfusion/insertions/aligners/test_tophat.py b/tests/imfusion/insertions/aligners/test_tophat.py index 782271e..6b4f6f7 100644 --- a/tests/imfusion/insertions/aligners/test_tophat.py +++ b/tests/imfusion/insertions/aligners/test_tophat.py @@ -92,6 +92,8 @@ def test_identify_insertions(self, read_paths, tophat_reference, tophat_mock = mocker.patch.object(tophat, 'tophat2_align') mocker.patch.object(tophat.util, 'count_lines', return_value=8e6) + mocker.patch.object(tophat.pysam, 'index') + # Call identify insertions. fastq, fastq2 = read_paths