diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index c06d8c0..4e546f6 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.1
+current_version = 0.3.2
[bumpversion:file:setup.py]
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 7b7fd37..6309121 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -15,7 +15,7 @@ Types of Contributions
Report Bugs
~~~~~~~~~~~
-Report bugs at https://github.com/jrderuiter/imfusion/issues.
+Report bugs at https://github.com/nki-ccb/imfusion/issues.
If you are reporting a bug, please include:
@@ -46,7 +46,7 @@ Submit Feedback
~~~~~~~~~~~~~~~
The best way to send feedback is to file an issue at
-https://github.com/jrderuiter/imfusion/issues.
+https://github.com/nki-ccb/imfusion/issues.
If you are proposing a feature:
@@ -105,5 +105,5 @@ Before you submit a pull request, check that it meets these guidelines:
your new functionality into a function with a docstring, and add the
feature to the list in README.rst.
3. The pull request should work for Python 2.7, 3.4 and 3.5. Check
- https://travis-ci.org/jrderuiter/imfusion/pull_requests
+ https://travis-ci.org/nki-ccb/imfusion/pull_requests
and make sure that the tests pass for all supported Python versions.
diff --git a/HISTORY.rst b/HISTORY.rst
index b557ab6..2cbcbf9 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,6 +2,13 @@
History
=======
+0.3.2 (2017-05-11)
+------------------
+
+* Properly added star-fusion support to star aligner (was previously not
+ fully merged).
+* Changed documentation URLs to new repository.
+
0.3.1 (2017-05-09)
------------------
diff --git a/Makefile b/Makefile
index f65c3ba..5e7f1e0 100644
--- a/Makefile
+++ b/Makefile
@@ -82,7 +82,7 @@ dist: clean ## builds source and wheel package
python setup.py sdist bdist_wheel
conda: clean-pyc ## build a conda release
- conda build --python 3.5 -c bioconda -c r -c jrderuiter conda
+ conda build --python 3.5 -c bioconda conda
conda-docker: clean-pyc
docker run -v `pwd`:/imfusion -t -i condaforge/linux-anvil /bin/sh -c 'cd /imfusion && ./scripts/conda_build_docker.sh'
diff --git a/README.rst b/README.rst
index e0a5e6d..f855ebf 100644
--- a/README.rst
+++ b/README.rst
@@ -1,9 +1,3 @@
-.. image:: https://img.shields.io/travis/jrderuiter/imfusion/develop.svg
- :target: https://travis-ci.org/jrderuiter/imfusion
-
-.. image:: https://img.shields.io/coveralls/jrderuiter/imfusion/develop.svg
- :target: https://coveralls.io/github/jrderuiter/imfusion
-
IM-Fusion
=========
@@ -42,12 +36,14 @@ Documentation
=============
IM-Fusion's documentation is available at
-`jrderuiter.github.io/imfusion `_.
+`nki-ccb.github.io/imfusion `_.
References
==========
-de Ruiter, JR. *et al.*, 2017. **"Identifying transposon insertions and
-their effects from RNA-sequencing data"** (*Under revision*).
+
+de Ruiter J.R., Kas S.M. *et al.* **"Identifying transposon insertions and their
+effects from RNA-sequencing data"** Nucleic Acids Research 2017, *in press*.
+
License
=======
diff --git a/conda/meta.yaml b/conda/meta.yaml
index 27ff977..76d7565 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,6 +1,6 @@
package:
name: imfusion
- version: 0.3.1
+ version: 0.3.2
build:
number: 0
@@ -71,7 +71,7 @@ test:
- featureCounts -v
about:
- home: https://github.com/jrderuiter/imfusion
+ home: https://github.com/nki-ccb/imfusion
license: MIT
summary: "IM-Fusion - Tool for identifying transposon insertions
and their effects from RNA-sequencing data"
diff --git a/docs/conf.py b/docs/conf.py
index b0b98bd..bca1a68 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -255,8 +255,7 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, 'im-fusion', u'IM-Fusion Documentation', [author],
- 1)]
+man_pages = [(master_doc, 'imfusion', u'IM-Fusion Documentation', [author], 1)]
# If true, show URL addresses after external links.
#man_show_urls = False
diff --git a/docs/extras.rst b/docs/extras.rst
index 3c23980..7aebe14 100644
--- a/docs/extras.rst
+++ b/docs/extras.rst
@@ -72,7 +72,7 @@ that this script does expect the required dependencies to be installed
(FusionFilter, Repeatmasker and Blast).
.. _FusionFilter wiki: https://github.com/FusionFilter/FusionFilter/wiki/Building-a-Custom-FusionFilter-Dataset
-.. _Python script: https://github.com/jrderuiter/imfusion/blob/develop/scripts/starfusion_build_reference.py
+.. _Python script: https://github.com/nki-ccb/imfusion/blob/develop/scripts/starfusion_build_reference.py
Identifying fusions
~~~~~~~~~~~~~~~~~~~
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 119b705..11438bc 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -1,7 +1,6 @@
Getting started
===============
-
Overview
--------
@@ -39,6 +38,10 @@ supporting code is provided for interactive analyses (such as plotting
differential expression) and manually running certain steps of the pipeline.
For more details, see :doc:`api`.
+For the STAR aligner, we also support identifying endogenous gene fusions
+using STAR-Fusion. See :doc:`extras` for more details about this type of
+analysis.
+
Required files
--------------
@@ -65,6 +68,8 @@ be either ‘SD’ or ‘SA’ for splice-donor or splice-acceptor sites respect
The field may also be left empty for other types of features, however these
features will not be used by IM-Fusion.
+For generating the exon-level expression counts, IM-Fusion needs a
+flattened exon representation of the reference gene features (in GTF format).
Example reference files
-----------------------
diff --git a/docs/index.rst b/docs/index.rst
index b7b7d2c..b2af8c1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -29,9 +29,19 @@ IM-Fusion has the following key features:
single insertion in a specific sample, or determine the general
effect of insertions on a given gene within the tumor cohort.
+Additionally, by integrating STAR-Fusion into its STAR-based insertion
+identification pipeline, IM-Fusion enables simultaneous identification of
+transposon insertions and endogenous gene fusions. As shown in our manuscript,
+this approach can identify endogenous gene-fusions driving tumorigenesis, which
+are impossible to identify using targeted DNA-based transposon-sequencing
+approaches.
+
+.. _STAR-Fusion: https://github.com/STAR-Fusion/STAR-Fusion/wiki
+
For more details on the approach and a comparison with existing DNA-sequencing
approaches, please see our paper **"Identifying transposon insertions and
-their effects from RNA-sequencing data"** (*Currently under revision*).
+their effects from RNA-sequencing data"** (Nucleic Acids Research 2017,
+*in press*).
.. toctree::
:maxdepth: 2
@@ -41,6 +51,7 @@ their effects from RNA-sequencing data"** (*Currently under revision*).
installation
getting_started
usage
+ extras
api
extras
contributing
diff --git a/docs/usage.rst b/docs/usage.rst
index 96ce735..f4002da 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -14,7 +14,7 @@ format) into a new Fasta file and builds the indices needed for alignment.
Separate sub-commands are provided for each supported aligner (currently STAR
and Tophat-Fusion).
-The basic command for building a (STAR-based) reference is as follows:
+The basic command for building a STAR-based reference is as follows:
.. code:: bash
@@ -90,7 +90,7 @@ reference, whilst the ``--output_dir`` argument specifies where the
sample output should be written.
An optional ``--assemble`` argument indicates whether IM-Fusion should perform
-a reference-guided transcript assembly. If given, IM-Fusion runs Stringtie
+a reference-guided transcript assembly. If given, IM-Fusion runs StringTie
after the RNA-seq alignment to detect novel gene transcripts based on the
RNA-seq alignment. The results of this assembly are subsequently used in the
insertion detection step to annotate insertions that involve novel transcripts.
@@ -105,10 +105,11 @@ The command for using Tophat-Fusion is nearly identical:
--output_dir output/sample_s1 \
--tophat_threads 4
-However, both aligners do have some aligner-specific arguments concerning the
-alignment. See the help of the respective sub-commands for more details. For
-STAR, special attention should be paid to memory usage, as STAR requires
-approximately 30GB of memory (per process) for loading the reference genome.
+However, as was the case when building the reference genomes, both aligners do
+have some aligner-specific arguments concerning the alignment. See the help of
+the respective sub-commands for more details. Again, for STAR special
+attention should be paid to memory usage, as STAR requires approximately
+30GB of memory for loading the reference genome.
Quantifying expression (per sample)
-----------------------------------
diff --git a/scripts/starfusion_build_reference.py b/scripts/starfusion_build_reference.py
index 261b9a6..e68631c 100644
--- a/scripts/starfusion_build_reference.py
+++ b/scripts/starfusion_build_reference.py
@@ -21,28 +21,41 @@ def main():
tmp_dir.mkdir(parents=True, exist_ok=False)
# Filter the patch chromosomes from the gtf, as these are
- # likely not present in the fasta.
- logging.info('- Generating cDNA sequences')
+ # likely not present in the fasta. Note we also filter rows without
+ # a transcript_id, as these seem to be problematic for STAR-Fusion.
+ logging.info('- Filtering GTF file')
gtf_path = tmp_dir / 'ref.gtf'
+ tmp_gtf_path = gtf_path.with_suffix('.gtf.tmp')
+
+ with tmp_gtf_path.open('wb') as file_:
+ check_call(
+ ['grep', '-v', r'^\(MG\|JH\|GL\)', str(args.gtf)], stdout=file_)
+
with gtf_path.open('wb') as file_:
- check_call(['grep', '-v', r'^\(MG\|JH\|GL\)', str(args.gtf)],
- stdout=file_)
+ check_call(['grep', 'transcript_id', str(tmp_gtf_path)], stdout=file_)
+
+ tmp_gtf_path.unlink()
# Create cDNA_seqs file.
+ logging.info('- Generating cDNA sequences')
+
cdna_path = tmp_dir / 'cDNA_seqs.fa'
with cdna_path.open('wb') as file_:
script_path = args.ff_path / 'util' / 'gtf_file_to_cDNA_seqs.pl'
- check_call(['perl', str(script_path), str(gtf_path), str(args.fasta)],
- stdout=file_)
+ check_call(
+ ['perl', str(script_path), str(gtf_path), str(args.fasta)],
+ stdout=file_)
# Build masked cDNA_seqs file using RepeatMasker.
# Note: requires library to be installed from http://www.girinst.org.
logging.info('- Masking repeats')
masked_path = cdna_path.with_suffix('.fa.masked')
- check_call([str(args.rm_path / 'RepeatMasker'), '-pa', str(args.threads), '-s',
- '-species', 'mouse', '-xsmall', str(cdna_path)])
+ check_call([
+ str(args.rm_path / 'RepeatMasker'), '-pa', str(args.threads), '-s',
+ '-species', 'mouse', '-xsmall', str(cdna_path)
+ ])
# Create blastpairs.
logging.info('- Creating blast pairs')
@@ -51,37 +64,35 @@ def main():
pair_path = tmp_dir / 'blast_pairs.outfmt6'
with pair_path.open('wb') as file_:
- check_call(['blastn',
- '-query', str(cdna_path),
- '-db', str(masked_path),
- '-max_target_seqs', '10000',
- '-outfmt', '6',
- '-evalue', '1e-3',
- '-lcase_masking',
- '-num_threads', str(args.threads),
- '-word_size', '11'],
- stdout=file_)
+ check_call(
+ [
+ 'blastn', '-query', str(cdna_path), '-db', str(masked_path),
+ '-max_target_seqs', '10000', '-outfmt', '6', '-evalue', '1e-3',
+ '-lcase_masking', '-num_threads', str(args.threads),
+ '-word_size', '11'
+ ],
+ stdout=file_)
pair_gz_path = pair_path.with_suffix('.gene_syms.outfmt6.gz')
with gzip.open(str(pair_gz_path), 'wb') as file_:
script_path = (args.ff_path / 'util' /
'blast_outfmt6_replace_trans_id_w_gene_symbol.pl')
- check_call(['perl', str(script_path), str(cdna_path), str(pair_path)],
- stdout=file_)
+ check_call(
+ ['perl', str(script_path), str(cdna_path), str(pair_path)],
+ stdout=file_)
# Prepare library.
logging.info('- Preparing library')
- script_path = args.ff_path / 'util' / 'prep_genome_lib.pl'
- check_call(['perl', str(script_path),
- '--genome_fa', str(args.fasta),
- '--gtf', str(gtf_path),
- '--blast_pairs', str(pair_gz_path),
- '--cdna_fa', str(cdna_path),
- '--CPU', str(args.threads),
- '--max_readlength', str(args.read_length),
- '--output_dir', str(args.output_dir)])
-
- # shutil.rmtree(str(args.tmp_dir))
+ script_path = args.ff_path / 'prep_genome_lib.pl'
+ check_call([
+ 'perl', str(script_path), '--genome_fa', str(args.fasta), '--gtf',
+ str(gtf_path), '--blast_pairs', str(pair_gz_path), '--cdna_fa',
+ str(cdna_path), '--CPU', str(args.threads), '--max_readlength',
+ str(args.read_length), '--output_dir', str(args.output_dir)
+ ])
+
+ shutil.rmtree(str(args.tmp_dir))
+
def parse_args():
"""Parses command line arguments."""
@@ -91,7 +102,7 @@ def parse_args():
parser.add_argument('--fasta', required=True, type=Path)
parser.add_argument('--gtf', required=True, type=Path)
parser.add_argument('--output_dir', required=True, type=Path)
-
+
parser.add_argument('--ff_path', required=False, default='', type=Path)
parser.add_argument('--rm_path', required=False, default='', type=Path)
diff --git a/setup.py b/setup.py
index cefc7ae..0267a62 100644
--- a/setup.py
+++ b/setup.py
@@ -27,12 +27,12 @@
setuptools.setup(
name='imfusion',
- version='0.3.1',
+ version='0.3.2',
description=('Tool for identifying transposon insertions in '
'Insertional Mutagenesis screens from gene-transposon '
'fusions using single- and paired-end RNA-sequencing data.'),
long_description=README + '\n\n' + HISTORY,
- url='https://github.com/jrderuiter/im-fusion',
+ url='https://github.com/nki-ccb/imfusion',
author='Julian de Ruiter',
author_email='julianderuiter@gmail.com',
license='MIT license',
diff --git a/src/imfusion/__init__.py b/src/imfusion/__init__.py
index fc8c8ee..f6c0fc7 100644
--- a/src/imfusion/__init__.py
+++ b/src/imfusion/__init__.py
@@ -2,4 +2,4 @@
__author__ = 'Julian de Ruiter'
__email__ = 'julianderuiter@gmail.com'
-__version__ = '0.3.1'
+__version__ = '0.3.2'
diff --git a/src/imfusion/build/indexers/base.py b/src/imfusion/build/indexers/base.py
index cd1c53b..0fb00e3 100644
--- a/src/imfusion/build/indexers/base.py
+++ b/src/imfusion/build/indexers/base.py
@@ -249,7 +249,10 @@ def configure_args(cls, parser):
help='Path to the transposon features (tsv).')
base_group.add_argument(
- '--output_dir', type=pathlib.Path, required=True)
+ '--output_dir',
+ type=pathlib.Path,
+ required=True,
+ help='Path to write the built reference.')
# Optional blacklist arguments.
blacklist_group = parser.add_argument_group('Blacklist arguments')
diff --git a/src/imfusion/external/__init__.py b/src/imfusion/external/__init__.py
index e69de29..40a96af 100644
--- a/src/imfusion/external/__init__.py
+++ b/src/imfusion/external/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
diff --git a/src/imfusion/external/star_fusion.py b/src/imfusion/external/star_fusion.py
new file mode 100644
index 0000000..7259470
--- /dev/null
+++ b/src/imfusion/external/star_fusion.py
@@ -0,0 +1,24 @@
+"""Module containing functions for calling star-fusion."""
+
+from .util import run_command
+
+
+def star_fusion(junction_path, reference_path, output_dir=None, log_path=None):
+ """Identifies endogenous fusions from an existing STAR alignment.
+
+ Parameters
+ ----------
+ reference_path : pathlib.Path
+ Path to the reference genome.
+ out_base_path : pathlib.Path
+ Base output path for the built index.
+ log_path : pathlib.Path
+ Where to write the log output.
+
+ """
+
+ args = [
+ 'STAR-Fusion', '--genome_lib_dir', str(reference_path), '-J',
+ str(junction_path), '--output_dir', str(output_dir)
+ ]
+ run_command(args=args, log_path=log_path)
diff --git a/src/imfusion/insertions/aligners/base.py b/src/imfusion/insertions/aligners/base.py
index 5ee7511..f9a6772 100644
--- a/src/imfusion/insertions/aligners/base.py
+++ b/src/imfusion/insertions/aligners/base.py
@@ -98,7 +98,7 @@ def configure_args(cls, parser):
type=pathlib.Path,
required=True,
help='Path to the index of the augmented reference '
- 'generated by im-fusion build.')
+ 'generated by imfusion-build.')
base_group.add_argument(
'--output_dir',
diff --git a/src/imfusion/insertions/aligners/star.py b/src/imfusion/insertions/aligners/star.py
index 5471feb..50511bb 100644
--- a/src/imfusion/insertions/aligners/star.py
+++ b/src/imfusion/insertions/aligners/star.py
@@ -13,13 +13,17 @@
from typing import Any
import re
+from future.utils import native_str
from intervaltree import IntervalTree
import numpy as np
import pandas as pd
+from pathlib2 import Path
+import pysam
import toolz
from imfusion.build.indexers.star import StarReference
from imfusion.external.star import star_align
+from imfusion.external.star_fusion import star_fusion
from imfusion.external.stringtie import stringtie_assemble
from imfusion.external.compound import sort_bam
from imfusion.external.util import which, parse_arguments
@@ -114,10 +118,17 @@ class StarAligner(Aligner):
distance of each other to be merged. The value should be chosen to
reflect the expected or emprical insert size.
max_junction_dist : int
- Maxixmum distance within which groups of spanning mates are assigned
+ Maximum distance within which groups of spanning mates are assigned
to a junction fusion (which is supported by split reads, so that its
exact position is known). Groups that cannot be assigned to a junction
fusion are considered to arise from a separate insertion.
+ star_fusion_ref_path : Path
+ Path to a STAR-Fusion reference. If given, STAR-Fusion is used to
+ identify endogenous gene fusions from IM-Fusions alignment. Identified
+ gene fusions are written to a gene_fusions.txt file in the output
+ directory. Note that the STAR-Fusion reference should use the same
+ reference genome as IM-Fusions reference to avoid compatability issues.
+ Requires STAR-Fusion to be installed.
"""
@@ -136,7 +147,8 @@ def __init__(
external_sort=False, # type: bool
merge_junction_dist=10, # type: int
max_spanning_dist=300, # type: int
- max_junction_dist=10000 # type: int
+ max_junction_dist=10000, # type: int
+ star_fusion_ref_path=None # type: Path
): # type: (...) -> None
super().__init__(reference=reference, logger=logger)
@@ -156,6 +168,8 @@ def __init__(
self._filter_orientation = filter_orientation
self._filter_blacklist = filter_blacklist
+ self._star_fusion_ref_path = star_fusion_ref_path
+
@property
def dependencies(self):
"""External dependencies required by aligner."""
@@ -165,6 +179,9 @@ def dependencies(self):
if self._assemble:
programs += ['stringtie']
+ if self._star_fusion_ref_path is not None:
+ programs += ['STAR-Fusion']
+
return programs
def identify_insertions(self, fastq_path, output_dir, fastq2_path=None):
@@ -195,9 +212,21 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None):
# Perform alignment using STAR.
alignment_path = output_dir / 'alignment.bam'
+ star_dir = output_dir / '_star'
+
if not alignment_path.exists():
self._logger.info('Performing alignment using STAR')
- self._align(fastq_path, output_dir, fastq2_path=fastq2_path)
+
+ self._align(
+ fastq_path=fastq_path,
+ output_dir=star_dir,
+ fastq2_path=fastq2_path)
+
+ path.symlink_relative(
+ src_path=star_dir / 'Aligned.sortedByCoord.out.bam',
+ dest_path=alignment_path)
+
+ pysam.index(native_str(alignment_path))
else:
self._logger.info('Using existing STAR alignment')
@@ -222,12 +251,11 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None):
else:
assembled_path = None
- # Extract identified fusions.
+ # Extract identified fusions and corresponding insertions.
self._logger.info('Extracting gene-transposon fusions')
- fusion_path = output_dir / 'fusions.out'
- fusions = list(self._extract_fusions(fusion_path))
+ junction_path = star_dir / 'Chimeric.out.junction'
+ fusions = list(self._extract_fusions(junction_path))
- # Extract insertions.
self._logger.info('Summarizing insertions')
insertions = list(
util.extract_insertions(
@@ -238,21 +266,30 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None):
ffpm_fastq_path=fastq_path,
chromosomes=None))
- # self._logger.info('Filtering insertions')
insertions = util.filter_insertions(
insertions,
features=self._filter_features,
orientation=self._filter_orientation,
blacklist=self._filter_blacklist)
+ # Identify endogenous fusions with STAR if requested.
+ if self._star_fusion_ref_path is not None:
+ self._logger.info('Identifying gene fusions using STAR-Fusion')
+ star_fusion_dir = output_dir / '_star_fusion'
+ star_fusion(
+ junction_path,
+ self._star_fusion_ref_path,
+ output_dir=star_fusion_dir)
+
+ path.symlink_relative(
+ src_path=star_fusion_dir /
+ 'star-fusion.fusion_candidates.final.abridged',
+ dest_path=output_dir / 'gene_fusions.txt')
+
for insertion in insertions:
yield insertion
def _align(self, fastq_path, output_dir, fastq2_path=None):
- # Put output into subdirectory, we will symlink the
- # expected outputs from STAR later.
- star_dir = output_dir / '_star'
-
# Gather default arguments.
sort_type = 'Unsorted' if self._external_sort else 'SortedByCoordinate'
@@ -269,26 +306,17 @@ def _align(self, fastq_path, output_dir, fastq2_path=None):
fastq_path=fastq_path,
fastq2_path=fastq2_path,
index_path=self._reference.index_path,
- output_dir=star_dir,
+ output_dir=output_dir,
extra_args=toolz.merge(args, self._extra_args))
# If not yet sorted, sort bam file using samtools/sambamba.
- sorted_bam_path = star_dir / 'Aligned.sortedByCoord.out.bam'
+ sorted_bam_path = output_dir / 'Aligned.sortedByCoord.out.bam'
if sort_type == 'Unsorted':
- unsorted_bam_path = star_dir / 'Aligned.out.bam'
+ unsorted_bam_path = output_dir / 'Aligned.out.bam'
sort_bam(unsorted_bam_path, sorted_bam_path, threads=self._threads)
unsorted_bam_path.unlink()
- # Symlink fusions and alignment into expected location.
- dest_bam_path = output_dir / 'alignment.bam'
- path.symlink_relative(
- src_path=sorted_bam_path, dest_path=dest_bam_path)
-
- path.symlink_relative(
- src_path=star_dir / 'Chimeric.out.junction',
- dest_path=output_dir / 'fusions.out')
-
def _extract_fusions(self, fusion_path):
# Read chimeric junction data.
chimeric_data = read_chimeric_junctions(fusion_path)
@@ -322,32 +350,103 @@ def configure_args(cls, parser):
super().configure_args(parser)
star_group = parser.add_argument_group('STAR arguments')
- star_group.add_argument('--star_threads', type=int, default=1)
- star_group.add_argument('--star_min_flank', type=int, default=12)
star_group.add_argument(
- '--star_external_sort', default=False, action='store_true')
- star_group.add_argument('--star_args', default='')
+ '--star_threads',
+ type=int,
+ default=1,
+ help='Number of threads to use when running STAR.')
- star_group.add_argument('--merge_junction_dist', default=10, type=int)
- star_group.add_argument('--max_spanning_dist', default=300, type=int)
- star_group.add_argument('--max_junction_dist', default=10000, type=int)
+ star_group.add_argument(
+ '--star_min_flank',
+ type=int,
+ default=12,
+ help=('Minimum mapped length of the two segments '
+ 'on each side of the fusion.'))
+
+ star_group.add_argument(
+ '--star_external_sort',
+ default=False,
+ action='store_true',
+ help=('Use samtools/sambamba for sorting, rather than STAR. '
+ 'Takes longer, but results in lower memory usage for '
+ 'large bam files.'))
+
+ star_group.add_argument(
+ '--star_args', default='', help='Additional args to pass to STAR.')
+
+ star_group.add_argument(
+ '--merge_junction_dist',
+ default=10,
+ type=int,
+ help=('Maximum distance within which fusions supported by split '
+ 'reads (overlapping the junction, so that the exact '
+ 'breakpoint is known) are merged. This merging avoids '
+ 'calling multiple fusions due to slight variations in the '
+ 'alignment, although this value should not be chosen '
+ 'too large to avoid merging distinct insertions.'))
+
+ star_group.add_argument(
+ '--max_spanning_dist',
+ default=300,
+ type=int,
+ help=(
+ 'Maximum distance within which spanning mate pairs (mates '
+ 'that do not overlap the fusion junction) are grouped when '
+ 'summarizing spanning chimeric reads. Both mates from two pairs'
+ ' need to be within this distance of each other to be merged. '
+ 'The value should be chosen to reflect the expected or '
+ 'emprical insert size.'))
+ star_group.add_argument(
+ '--max_junction_dist',
+ default=10000,
+ type=int,
+ help=('Maximum distance within which groups of spanning mates are '
+ 'assigned to a junction fusion (which is supported by split '
+ 'reads, so that its exact position is known). Groups that '
+ 'cannot be assigned to a junction fusion are considered to '
+ 'arise from a separate insertion.'))
assemble_group = parser.add_argument_group('Assembly')
assemble_group.add_argument(
- '--assemble', default=False, action='store_true')
+ '--assemble',
+ default=False,
+ action='store_true',
+ help='Perform de-novo transcript assembly using StringTie.')
filt_group = parser.add_argument_group('Filtering')
filt_group.add_argument(
'--no_filter_orientation',
dest='filter_orientation',
default=True,
- action='store_false')
+ action='store_false',
+ help=('Don\'t filter fusions with transposon features and genes '
+ 'in opposite (incompatible) orientations.'))
+
filt_group.add_argument(
'--no_filter_feature',
dest='filter_features',
default=True,
- action='store_false')
- filt_group.add_argument('--blacklisted_genes', nargs='+')
+ action='store_false',
+ help=('Don\'t filter fusions with non-SA/SD features.'))
+
+ filt_group.add_argument(
+ '--blacklisted_genes',
+ nargs='+',
+ help='Blacklisted genes to filter.')
+
+ sf_group = parser.add_argument_group('STAR-Fusion')
+ sf_group.add_argument(
+ '--star_fusion_reference',
+ type=Path,
+ default=None,
+ help=(
+ 'Path to a STAR-Fusion reference. If given, STAR-Fusion is '
+ 'used to identify endogenous gene fusions from IM-Fusions '
+ 'alignment. Identified gene fusions are written to a '
+ 'gene_fusions.txt file in the output directory. Note that the '
+ 'STAR-Fusion reference should use the same reference genome as '
+ 'IM-Fusions reference to avoid compatability issues.'
+ 'Requires STAR-Fusion to be installed.'))
@classmethod
def _parse_args(cls, args):
@@ -363,7 +462,8 @@ def _parse_args(cls, args):
max_junction_dist=args.max_junction_dist,
filter_features=args.filter_features,
filter_orientation=args.filter_orientation,
- filter_blacklist=args.blacklisted_genes)
+ filter_blacklist=args.blacklisted_genes,
+ star_fusion_ref_path=args.star_fusion_reference)
return toolz.merge(super()._parse_args(args), kws)
diff --git a/src/imfusion/insertions/aligners/tophat.py b/src/imfusion/insertions/aligners/tophat.py
index e7a0b75..634713d 100644
--- a/src/imfusion/insertions/aligners/tophat.py
+++ b/src/imfusion/insertions/aligners/tophat.py
@@ -6,9 +6,10 @@
from builtins import *
# pylint: enable=wildcard-import,redefined-builtin,unused-wildcard-import
-from pathlib2 import Path
-
+from future.utils import native_str
import pandas as pd
+from pathlib2 import Path
+import pysam
import toolz
from imfusion.build.indexers.tophat import TophatReference
@@ -104,6 +105,7 @@ def identify_insertions(self, fastq_path, output_dir, fastq2_path=None):
if not alignment_path.exists():
self._logger.info('Performing alignment using Tophat2')
self._align(fastq_path, output_dir, fastq2_path=fastq2_path)
+ pysam.index(native_str(alignment_path))
else:
self._logger.info('Using existing Tophat2 alignment')
@@ -200,26 +202,52 @@ def configure_args(cls, parser):
super().configure_args(parser)
group = parser.add_argument_group('Tophat2 arguments')
- group.add_argument('--tophat_threads', type=int, default=1)
- group.add_argument('--tophat_min_flank', type=int, default=12)
- group.add_argument('--tophat_args', type=parse_arguments, default='')
+ group.add_argument(
+ '--tophat_threads',
+ type=int,
+ default=1,
+ help='Number of threads to use when running Tophat2.')
+
+ group.add_argument(
+ '--tophat_min_flank',
+ type=int,
+ default=12,
+ help=('Minimum mapped length of the two segments '
+ 'on each side of the fusion.'))
+
+ group.add_argument(
+ '--tophat_args',
+ type=parse_arguments,
+ default='',
+ help='Additional args to pass to Tophat2.')
assemble_group = parser.add_argument_group('Assembly')
assemble_group.add_argument(
- '--assemble', default=False, action='store_true')
+ '--assemble',
+ default=False,
+ action='store_true',
+ help='Perform de-novo transcript assembly using StringTie.')
filt_group = parser.add_argument_group('Filtering')
filt_group.add_argument(
'--no_filter_orientation',
dest='filter_orientation',
default=True,
- action='store_false')
+ action='store_false',
+ help=('Don\'t filter fusions with transposon features and genes '
+ 'in opposite (incompatible) orientations.'))
+
filt_group.add_argument(
'--no_filter_feature',
dest='filter_features',
default=True,
- action='store_false')
- filt_group.add_argument('--blacklisted_genes', nargs='+')
+ action='store_false',
+ help=('Don\'t filter fusions with non-SA/SD features.'))
+
+ filt_group.add_argument(
+ '--blacklisted_genes',
+ nargs='+',
+ help='Blacklisted genes to filter.')
@classmethod
def _parse_args(cls, args):
diff --git a/src/imfusion/main/ctg.py b/src/imfusion/main/ctg.py
index ccedafe..db828a5 100644
--- a/src/imfusion/main/ctg.py
+++ b/src/imfusion/main/ctg.py
@@ -105,7 +105,7 @@ def parse_args():
required=True,
type=Path,
help='Path to the merged insertions file from '
- 'im-fusion merge.')
+ 'imfusion-merge.')
base_group.add_argument(
'--reference',
@@ -179,7 +179,7 @@ def parse_args():
default=None,
type=Path,
help='Path to the merged expression file from '
- 'im-fusion merge.')
+ 'imfusion-merge.')
de_group.add_argument(
'--de_threshold',
diff --git a/src/imfusion/main/expression.py b/src/imfusion/main/expression.py
index 7b96f54..99c2e38 100644
--- a/src/imfusion/main/expression.py
+++ b/src/imfusion/main/expression.py
@@ -71,7 +71,7 @@ def parse_args():
type=Path,
required=True,
help='Path to the index of the augmented reference '
- 'generated by im-fusion build.')
+ 'generated by imfusion-build.')
parser.add_argument('--output', type=Path, default=None)
diff --git a/src/imfusion/main/merge.py b/src/imfusion/main/merge.py
index ac28afc..906e471 100644
--- a/src/imfusion/main/merge.py
+++ b/src/imfusion/main/merge.py
@@ -16,7 +16,7 @@
def main():
- """Main function of im-fusion-merge."""
+ """Main function of imfusion-merge."""
args = _parse_args()
@@ -37,7 +37,7 @@ def main():
def _parse_args():
- """Parses command-line arguments for im-fusion-merge."""
+ """Parses command-line arguments for imfusion-merge."""
parser = argparse.ArgumentParser()
diff --git a/tests/imfusion/insertions/aligners/test_star.py b/tests/imfusion/insertions/aligners/test_star.py
index fe4dada..e8ceb9f 100644
--- a/tests/imfusion/insertions/aligners/test_star.py
+++ b/tests/imfusion/insertions/aligners/test_star.py
@@ -248,6 +248,8 @@ def test_identify_insertions(self, read_paths, star_reference,
star_mock = mocker.patch.object(star, 'star_align')
mocker.patch.object(star.util, 'count_lines', return_value=8e6)
+ mocker.patch.object(star.pysam, 'index')
+
# Call identify insertions.
fastq, fastq2 = read_paths
diff --git a/tests/imfusion/insertions/aligners/test_tophat.py b/tests/imfusion/insertions/aligners/test_tophat.py
index 782271e..6b4f6f7 100644
--- a/tests/imfusion/insertions/aligners/test_tophat.py
+++ b/tests/imfusion/insertions/aligners/test_tophat.py
@@ -92,6 +92,8 @@ def test_identify_insertions(self, read_paths, tophat_reference,
tophat_mock = mocker.patch.object(tophat, 'tophat2_align')
mocker.patch.object(tophat.util, 'count_lines', return_value=8e6)
+ mocker.patch.object(tophat.pysam, 'index')
+
# Call identify insertions.
fastq, fastq2 = read_paths