Merge branch 'hotfix/1.1.0' into main
Fixed bug that added 1 to total fragment length; this will slightly increase the fpbm_br and fpbm_nbr values
Added option to analyse long read data
sb43 committed Dec 23, 2022
2 parents 30a304d + 76809db commit cbfb6f7
Showing 7 changed files with 222 additions and 36 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
@@ -1,4 +1,9 @@

## 1.1.0
* Added option to analyse long read data
* Fixed bug that added 1 to total fragment length; this will slightly increase the fpbm_br and fpbm_nbr values
* Added option to add a header line to the output

## 1.0.0
* First release to calculate TA repeats FPBM values using samtools bedcoverage data

4 changes: 2 additions & 2 deletions Dockerfile
@@ -3,7 +3,7 @@ USER root

MAINTAINER [email protected]

ENV ANALYSE_TA_VER '1.0.0'
ENV ANALYSE_TA_VER '1.1.0'

# install system tools
RUN apt-get -yq update
@@ -40,7 +40,7 @@ RUN pip3 install --install-option="--prefix=$CGP_OPT/python-lib" dist/$(ls -1 di
FROM ubuntu:20.04

LABEL uk.ac.sanger.cgp="Cancer Genome Project, Wellcome Sanger Institute" \
version="1.0.0" \
version="1.1.0" \
description="Tool to perform TA repeat bed coverage analysis"

### security upgrades and cleanup
8 changes: 8 additions & 0 deletions README.md
@@ -49,6 +49,14 @@ Various exceptions can occur for malformed input files.

* ```test_sample 6.12 15.8``` command line output <sample_name> <mean_fpbm_broken> <mean_fpbm_non_broken>.

### outputFormat with dnovo flag set
* ```Applicable only to long-read sequencing data```
Use of the dnovo flag classifies the TA repeat regions as broken or non-broken based on the user-defined dnovo_cutoff parameter.
This applies to long-read data, where the average length of the TA repeat is known for each interval (see the sketch after the output example below).
* ```br:broken```
* ```nbr:non_broken```
* ```test_sample 6.12 15.8``` command line output <sample_name> <mean_fpbm_br> <mean_fpbm_nbr> <ref_br> <ref_nbr> <mean_fpbm_dnovo_br> <mean_fpbm_dnovo_nbr> <dnovo_br> <dnovo_nbr> <dnovo_in_ref_br> <dnovo_in_ref_nbr> <cumulative_fpbm_br> <cumulative_fpbm_nbr> <jaccard_br> <jaccard_nbr>.
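
A minimal sketch of the classification rule applied when the dnovo flag is set (toy interval values; the formula mirrors `create_df_to_merge` in `analyse_ta/process_bedcov.py`):

```python
import pandas as pd
import numpy as np

# toy bedcov intervals: chrom, start, end, coverage (illustrative values only)
df = pd.DataFrame(
    {"chr": ["1", "1"], "start": [100, 500], "end": [180, 560], "coverage": [40, 400]}
)
dnovo_cutoff = 2  # default value of --dnovo_cutoff

# dnovo mode: 2 * coverage / (interval length + 1), compared against the cutoff
df["frl"] = (df["coverage"] * 2) / ((df["end"] - df["start"]) + 1)
df["ta_type_dnovo"] = np.select(
    [df["frl"] <= dnovo_cutoff, df["frl"] > dnovo_cutoff], ["nbr", "br"]
)
print(df[["frl", "ta_type_dnovo"]])  # first interval -> nbr, second -> br
```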

## INSTALL
Install via `pip install`. Simply execute it with the path to the compiled 'whl' found on the [release page][analyse_ta-releases]:

87 changes: 74 additions & 13 deletions analyse_ta/commandline.py
@@ -3,6 +3,7 @@
import os
import argparse
import pkg_resources

# load config and reference files....

version = pkg_resources.require("analyse_ta")[0].version
@@ -11,32 +12,92 @@
def main(): # pragma: no cover <--
usage = "\n %prog [options] -br input_br.bedcov -nbr input_nbr.bedcov -s <sample>"

optParser = argparse.ArgumentParser(prog='analyse_ta',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
optParser = argparse.ArgumentParser(
prog="analyse_ta", formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
optional = optParser._action_groups.pop()
required = optParser.add_argument_group('required arguments')
required = optParser.add_argument_group("required arguments")

required.add_argument(
"-br",
"--file_br",
type=str,
dest="file_br",
required=True,
default=None,
help="broken ta repeat bed coverage file",
)
required.add_argument(
"-nbr",
"--file_nbr",
type=str,
dest="file_nbr",
required=True,
default=None,
help="non broken ta repeat bed coverage file",
)

optional.add_argument(
"-s",
"--sample_name",
type=str,
dest="sample_name",
required=False,
default="test_sample",
help="sample name",
)

optional.add_argument(
"-dn",
"--dnovo",
action="store_true",
dest="dnovo",
default=False,
help="set flag to analyse dnovo long read data",
)

optional.add_argument(
"-ah",
"--add_header",
action="store_true",
dest="add_header",
default=False,
help="set flag to add_header line, useful in batch mode to set for first sample",
)

required.add_argument("-br", "--file_br", type=str, dest="file_br", required=True,
default=None, help="broken ta repeat bed coverage file")
required.add_argument("-nbr", "--file_nbr", type=str, dest="file_nbr", required=True,
default=None, help="non broken ta repeat bed coverage file")
optional.add_argument(
"-dn_cutoff",
"--dnovo_cutoff",
type=int,
dest="dnovo_cutoff",
required=False,
default=2,
help="cut off length ratio with ref interval to flag as broken region",
)

optional.add_argument("-s", "--sample_name", type=str, dest="sample_name", required=False,
default='test_sample', help="sample name")
optional.add_argument("-v", "--version", action='version', version='%(prog)s ' + version)
optional.add_argument("-q", "--quiet", action="store_false", dest="verbose", required=False, default=True)
optional.add_argument(
"-v", "--version", action="version", version="%(prog)s " + version
)
optional.add_argument(
"-q",
"--quiet",
action="store_false",
dest="verbose",
required=False,
default=True,
)

optParser._action_groups.append(optional)
if len(sys.argv) == 1:  # only the program name present, i.e. no arguments supplied
optParser.print_help()
sys.exit(1)
opts = optParser.parse_args()
if not opts.file_nbr or not opts.file_br:
sys.exit('\nERROR Arguments required\n\tPlease run: analyse_ta.py --help\n')
sys.exit("\nERROR Arguments required\n\tPlease run: analyse_ta.py --help\n")
# vars function returns __dict__ of Namespace instance
processed = processcov.processBedCov(**vars(opts))
print(processed.results)


if __name__ == '__main__':
if __name__ == "__main__":
main()
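
A hedged usage sketch of the new options driven through the Python API rather than the CLI; the import path is assumed from the package layout shown above, and the bedcov rows are made up:

```python
import os
import tempfile

from analyse_ta import process_bedcov as processcov  # assumed import path


def write_bedcov(rows):
    # write a tiny samtools-bedcov-style file: chrom, start, end, coverage
    tmp = tempfile.NamedTemporaryFile("w", suffix=".bedcov", delete=False)
    tmp.write("\n".join("\t".join(map(str, r)) for r in rows) + "\n")
    tmp.close()
    return tmp.name


br = write_bedcov([("1", 100, 180, 400), ("2", 50, 90, 30)])
nbr = write_bedcov([("1", 300, 360, 90), ("2", 500, 610, 70)])

opts = {
    "file_br": br,
    "file_nbr": nbr,
    "sample_name": "sample_A",
    "dnovo": True,        # long-read mode
    "add_header": True,   # emit the header line (first sample of a batch)
    "dnovo_cutoff": 2,
}
print(processcov.processBedCov(**opts).results)

os.remove(br)
os.remove(nbr)
```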
135 changes: 116 additions & 19 deletions analyse_ta/process_bedcov.py
@@ -1,44 +1,141 @@
import os
import sys
import pandas as pd
import numpy as np


'''
"""
This class calculates mean coverage for broken and non_broken TA repeat depth output from samtools bedcov
'''
"""


class processBedCov:
"""
Main class , loads user defined parameters and files
Main class , loads user defined parameters and files
"""

def __init__(self, **kwargs):
self.br_file = kwargs['file_br']
self.nbr_file = kwargs['file_nbr']
self.sample = kwargs.get('sample_name', 'test_sample')
self.br_file = kwargs["file_br"]
self.nbr_file = kwargs["file_nbr"]
self.dnovo = kwargs["dnovo"]
self.add_header = kwargs["add_header"]
self.dnovo_cutoff = kwargs["dnovo_cutoff"]
self.sample = kwargs.get("sample_name", "test_sample")
# check input data ...
self.results = self.process()

def process(self):
mydf_br = create_df_to_merge(self.br_file, 'br')
mydf_nbr = create_df_to_merge(self.nbr_file, 'nbr')
mydf_br = create_df_to_merge(self.br_file, "br", self.dnovo_cutoff, self.dnovo)
mydf_nbr = create_df_to_merge(
self.nbr_file, "nbr", self.dnovo_cutoff, self.dnovo
)
merged_df = pd.concat([mydf_nbr, mydf_br])
mean_fpmb_const = merged_df['frl'].sum(axis=0)
merged_df['fpbm'] = merged_df['frl'] / mean_fpmb_const * (10**6)
br_mean = merged_df.loc[merged_df.ta_type == 'br']
nbr_mean = merged_df.loc[merged_df.ta_type == 'nbr']
return f"{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}"
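# fpbm: each interval's frl divided by the summed frl across both files, scaled to per million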
mean_fpmb_const = merged_df["frl"].sum(axis=0)
merged_df["fpbm"] = merged_df["frl"] / mean_fpmb_const * (10**6)
br_mean = merged_df.loc[merged_df.ta_type == "br"]
nbr_mean = merged_df.loc[merged_df.ta_type == "nbr"]
header = f"sample\tfpbm_br\tfpbm_nbr\n"
output = ""
if self.add_header:
output = header
if self.dnovo:
if self.add_header:
output = output.strip()
output = (f"{output}\tref_br\tref_nbr\tmean_fpbm_dnovo_br\tmean_fpbm_dnovo_nbr\tdnovo_br\tdnovo_nbr"
f"\tdnovo_in_ref_br\tdnovo_in_ref_nbr\tcumulative_fpbm_br"
f"\tcumulative_fpbm_nbr\tjaccard_br\tjaccard_nbr\n")
br_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "br"]
nbr_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "nbr"]
br_cumulative_mean = merged_df.loc[
(merged_df["ta_type_dnovo"] == "br") | (merged_df["ta_type"] == "br")
]
nbr_cumulative_mean = merged_df.loc[
(merged_df["ta_type_dnovo"] == "nbr") | (merged_df["ta_type"] == "nbr")
]
num_br = len(merged_df[merged_df["ta_type"] == "br"])
num_nbr = len(merged_df[merged_df["ta_type"] == "nbr"])
num_br_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "br"])
num_nbr_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "nbr"])
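# agreement counts between the reference (ta_type) and dnovo (ta_type_dnovo) calls:
# m11 = flagged by both, m01 = reference only, m10 = dnovo only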
br_m11 = len(
merged_df[
(merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] == "br")
]
)
br_m01 = len(
merged_df[
(merged_df["ta_type_dnovo"] != "br") & (merged_df["ta_type"] == "br")
]
)
br_m10 = len(
merged_df[
(merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] != "br")
]
)
nbr_m11 = len(
merged_df[
(merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] == "nbr")
]
)
nbr_m01 = len(
merged_df[
(merged_df["ta_type_dnovo"] != "nbr") & (merged_df["ta_type"] == "nbr")
]
)
nbr_m10 = len(
merged_df[
(merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] != "nbr")
]
)

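# Jaccard index (%) per class: intervals flagged by both classifications over intervals flagged by either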
jindex_br = (br_m11 / (br_m01 + br_m10 + br_m11)) * 100
jindex_nbr = (nbr_m11 / (nbr_m01 + nbr_m10 + nbr_m11)) * 100

def create_df_to_merge(infile, ta_type):
return (
f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}"
f"\t{nbr_mean['fpbm'].mean(axis=0):.2f}\t{num_br}\t{num_nbr}"
f"\t{br_mean_dnovo['fpbm'].mean(axis=0):.2f}\t{nbr_mean_dnovo['fpbm'].mean(axis=0):.2f}"
f"\t{num_br_dnovo}\t{num_nbr_dnovo}\t{br_m11}\t{nbr_m11}"
f"\t{br_cumulative_mean['fpbm'].mean(axis=0):.2f}"
f"\t{nbr_cumulative_mean['fpbm'].mean(axis=0):.2f}\t{jindex_br:.2f}\t{jindex_nbr:.2f}"
)

output = f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}"
return output


def create_df_to_merge(infile, ta_type, dnovo_cutoff, dnovo=None):
"""
create pandas data frame
create pandas data frame
"""
if not os.path.isfile(infile):
print(f"File not found {infile}")
return None
df = pd.read_csv(infile, compression='infer', sep="\t", low_memory=False,
header=None, names=['chr', 'start', 'end', 'coverage'])
df['frl'] = df['coverage'] / (df['end'] - df['start']) + 1
df['ta_type'] = ta_type
df = pd.read_csv(
infile,
compression="infer",
sep="\t",
low_memory=False,
header=None,
names=["chr", "start", "end", "coverage"],
)

if dnovo:
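# long-read (dnovo) input: frl = 2 * coverage / (interval length + 1);
# intervals above dnovo_cutoff are flagged br (broken), the rest nbr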
df["frl"] = (df["coverage"] * 2) / ((df["end"] - df["start"]) + 1)
df["ta_type_dnovo"] = np.select(
[df["frl"] <= dnovo_cutoff, df["frl"] > dnovo_cutoff], ["nbr", "br"]
)
df["ta_type"] = ta_type

else:
df["frl"] = df["coverage"] / ((df["end"] - df["start"]) + 1)
df["ta_type"] = ta_type
return df


def _print_df(mydf, out_file):
if out_file:
mydf.to_csv(
out_file, sep="\t", mode="w", header=True, index=True, doublequote=False
)
else:
sys.exit("Outfile not provided")
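
For reference, the fragment-length fix noted in the changelog is visible in `create_df_to_merge` above: in 1.0.0 the `+ 1` was applied to the coverage/length ratio (operator precedence), while in 1.1.0 it is part of the interval length in the divisor. A minimal comparison with made-up numbers:

```python
import pandas as pd

# one toy interval: 100 bp long, total per-base coverage 250 (illustrative values only)
df = pd.DataFrame({"start": [1000], "end": [1100], "coverage": [250]})

frl_old = df["coverage"] / (df["end"] - df["start"]) + 1    # 1.0.0: 250/100 + 1 = 3.5
frl_new = df["coverage"] / ((df["end"] - df["start"]) + 1)  # 1.1.0: 250/101 ≈ 2.48

print(frl_old.iloc[0], frl_new.iloc[0])
```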
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
from setuptools import setup

config = {
'version': '1.0.0',
'version': '1.1.0',
'name': 'analyse_ta',
'description': 'Tool to analyse TA repeats bed coverage...',
'author': 'Shriram Bhosle',
17 changes: 16 additions & 1 deletion test/test_ta.py
@@ -13,11 +13,26 @@ class TestClass():
test_dir = configdir + '/data/'
options = {'file_br': test_dir + 'test_br.bedcov',
'file_nbr': test_dir + 'test_nbr.bedcov',
'dnovo':False,
'add_header':True,
'dnovo_cutoff':2,
'sample_name': 'my_test_sample',
}
options_ta_ext = {'file_br': test_dir + 'test_br.bedcov',
'file_nbr': test_dir + 'test_nbr.bedcov',
'dnovo':True,
'add_header':False,
'dnovo_cutoff':10,
'sample_name': 'my_test_sample',
}
processed=processcov.processBedCov(**options)
processed_ta_ext=processcov.processBedCov(**options_ta_ext)
# celline output
def test_bedcov(self):
f=self.processed
assert f.results == "my_test_sample\t35.23\t80.85"
assert f.results == "sample\tfpbm_br\tfpbm_nbr\nmy_test_sample\t33.04\t82.05"

def test_ta_ext(self):
f_ext=self.processed_ta_ext
assert f_ext.results == "my_test_sample\t33.04\t82.05\t5434\t10000\t79.39\t14.36\t11970\t3464\t2776\t806\t67.53\t67.78\t18.98\t6.37"
