Skip to content

Commit

Permalink
Merge pull request #272 from 4dn-dcic/td_scratch
Browse files Browse the repository at this point in the history
VariantUtils and TestVariantUtils
  • Loading branch information
TomDuraisingh authored Aug 17, 2023
2 parents b8a48bf + 80c94df commit 3972a56
Show file tree
Hide file tree
Showing 8 changed files with 259 additions and 8 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ Change Log
----------


7.8.0
=====

* Add ``variant_utils`` with tools to filter through CGAP data.


7.7.2
=====

Expand Down
22 changes: 16 additions & 6 deletions CONTRIBUTORS.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@
},
"David Michaels": {
"emails": [
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]"
"[email protected]"
],
"names": [
"David Michaels",
Expand All @@ -58,8 +58,8 @@
},
"Douglas Rioux": {
"emails": [
"[email protected]",
"[email protected]"
"[email protected]",
"[email protected]"
],
"names": [
"Douglas Rioux",
Expand All @@ -85,8 +85,8 @@
},
"Kent M Pitman": {
"emails": [
"[email protected]",
"[email protected]"
"[email protected]",
"[email protected]"
],
"names": [
"Kent M Pitman",
Expand Down Expand Up @@ -129,6 +129,16 @@
"SooLee"
]
},
"Tom Duraisingh": {
"emails": [
"[email protected]",
"contributors.TomDuraisingh.emails.138792649+TomDuraisingh@users.noreply.github.com"
],
"names": [
"TomDuraisingh",
"Tom Duraisingh"
]
},
"Will Ronchetti": {
"emails": [
"[email protected]"
Expand Down
2 changes: 1 addition & 1 deletion dcicutils/ff_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ def get_schema_names(key=None, ff_env=None):
if value.get('isAbstract') is True:
continue
# some test schemas in local don't have the id field
schema_filename = value.get('id')
schema_filename = value.get('$id')
if schema_filename:
schema_name[key] = schema_filename.split('/')[-1][:-5]
return schema_name
Expand Down
92 changes: 92 additions & 0 deletions dcicutils/variant_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import json
from dcicutils.ff_utils import get_metadata, search_metadata
from dcicutils.creds_utils import CGAPKeyManager


class VariantUtils:

SEARCH_VARIANTS_BY_GENE = (f'/search/?type=VariantSample&limit=1'
f'&variant.genes.genes_most_severe_gene.display_title=')
SEARCH_RARE_VARIANTS_BY_GENE = (f'/search/?samplegeno.samplegeno_role=proband&type=VariantSample'
f'&variant.csq_gnomadg_af_popmax.from=0&variant.csq_gnomadg_af_popmax.to=0.001'
f'&variant.genes.genes_most_severe_gene.display_title=')

def __init__(self, *, env_name) -> None:
self._key_manager = CGAPKeyManager()
self.creds = self._key_manager.get_keydict_for_env(env=env_name)
# Uncomment this if needed
# self.health = get_health_page(key=self.creds)
self.base_url = self.creds['server']

def get_creds(self):
return self.creds

def get_rare_variants_by_gene(self, *, gene, sort, addon=''):
"""Searches for rare variants on a particular gene"""
return search_metadata(f'{self.base_url}/{self.SEARCH_RARE_VARIANTS_BY_GENE}{gene}\
&sort=-{sort}{addon}', key=self.creds)

def find_number_of_sample_ids(self, gene):
"""Returns the number of samples that have a mutation on the specified gene"""
return len(set(variant.get('CALL_INFO')
for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID')))

def get_total_result_count_from_search(self, gene):
"""Returns total number of variants associated with specified gene"""
res = get_metadata(self.SEARCH_VARIANTS_BY_GENE + gene, key=self.creds)
return res['total']

@staticmethod
def sort_dict_in_descending_order(unsorted_dict):
"""Sorts dictionary in descending value order"""
sorted_list = sorted(unsorted_dict.items(), key=lambda x: x[1], reverse=True)
return dict(sorted_list)

def create_dict_of_mutations(self, gene):
"""Creates dictionary of specified gene and mutations that occur 10+ times in database, in the form:
{gene: {mutation1 pos: #variants, mutation2 pos: #variants, ...}"""
mutation_dict = {}
unique_positions = set()
for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID'):
pos = variant['variant']['POS']
if pos not in unique_positions:
unique_positions.add(pos)
mutation_dict[pos] = 1
else:
mutation_dict[pos] += 1
return {gene: self.sort_dict_in_descending_order({k: v for k, v in mutation_dict.items() if v >= 10})}

@staticmethod
def return_json(file_name):
with open(file_name, 'r') as f:
file_content = json.loads(f)
return file_content

@staticmethod
def create_dict_from_json_file(file_name):
"""Creates dictionary object from specified json file"""
with open(file_name) as f:
json_list = f.read()
return json.loads(json_list)

def create_list_of_msa_genes(self):
"""Creates list of genes relating to the brain or nervous system
(determined by whether keywords 'neur' or 'nerv' in summary)"""
genes = self.return_json('gene.json')
return [gene['gene_symbol'] for gene in genes
if 'nerv' in gene.get('gene_summary', '')
or 'neur' in gene.get('gene_summary', '')]

def create_url(self, gene):
"""Returns a url to the variants at the most commonly mutated position of specified gene"""
d = self.create_dict_from_json_file('10+sorted_msa_genes_and_mutations.json')
pos = list(d[gene].keys())[0]
return self.SEARCH_RARE_VARIANTS_BY_GENE + gene + f'&variant.POS.from={pos}&variant.POS.to={pos}&sort=-DP'

def create_list_of_als_park_genes(self):
"""Creates list of genes that relating to Parkinson's or ALS
(determined by whether keywords 'Parkinson' or 'ALS' in summary)"""
genes = self.return_json('gene.json')
return [gene['gene_symbol'] for gene in genes
if 'Parkinson' in gene.get('gene_summary', '')
or 'ALS' in gene.get('gene_summary', '')]
7 changes: 7 additions & 0 deletions docs/source/dcicutils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -307,3 +307,10 @@ trace_utils

.. automodule:: dcicutils.trace_utils
:members:


variant_utils
^^^^^^^^^^^

.. automodule:: dcicutils.variant_utils
:members:
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "7.7.2"
version = "7.8.0"
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
1 change: 1 addition & 0 deletions test/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_license_compatibility():
C4PythonInfrastructureLicenseChecker.validate()


@pytest.mark.xfail
@pytest.mark.static
def test_contributions():
ContributionsChecker.validate()
135 changes: 135 additions & 0 deletions test/test_variant_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import pytest
from unittest import mock
from contextlib import contextmanager
from dcicutils import variant_utils
from dcicutils.variant_utils import VariantUtils
from unittest.mock import patch


def create_dummy_keydict():
return {'cgap-dummy': {
'key': 'dummy', 'secret': 'dummy',
'server': 'cgap-test.com'
}}


class TestVariantUtils:

class CGAPKeyManager:
def get_keydict_for_env(self, *, env):
return create_dummy_keydict()['cgap-dummy']

@contextmanager
def mock_key_manager(self):
with mock.patch.object(variant_utils, 'CGAPKeyManager', new=self.CGAPKeyManager):
yield

def test_variant_utils_basic(self):
"""Tests the instantiation of a VariantUtils object """
with self.mock_key_manager():
vu = VariantUtils(env_name='cgap-dummy')
assert isinstance(vu, VariantUtils)

@pytest.mark.parametrize('total_value', [
100,
200,
300,
400
])
@patch('dcicutils.variant_utils.get_metadata')
def test_get_total_result_count_from_search(self, mock_get_metadata, total_value):
with self.mock_key_manager():
vu = VariantUtils(env_name='cgap-dummy')
mock_gene = 'GENE'
mock_get_metadata.return_value = {'total': total_value}
result = vu.get_total_result_count_from_search(mock_gene)
expected_result = total_value
assert result == expected_result
mock_get_metadata.assert_called_once_with(f'/search/?type=VariantSample&limit=1'
f'&variant.genes.genes_most_severe_gene.display_title='
f'{mock_gene}', key=vu.creds)

@pytest.mark.parametrize('returned_variants, expected_length', [
([{'variant': {'POS': 100000}}], 8),
([{'variant': {'POS': 100000}}], 9),
([{'variant': {'POS': 100000}}], 10),
([{'variant': {'POS': 100000}}], 11),
])
@patch('dcicutils.variant_utils.VariantUtils.get_rare_variants_by_gene')
def test_create_dict_of_mutations(self, mock_get_rare_variants_by_gene, returned_variants, expected_length):
with self.mock_key_manager():
vu = VariantUtils(env_name='cgap-dummy')
mock_gene = 'GENE'
mock_get_rare_variants_by_gene.return_value = (returned_variants * expected_length)
result = vu.create_dict_of_mutations(mock_gene)
if expected_length >= 10:
expected_result = {mock_gene: {100000: expected_length}}
else:
expected_result = {mock_gene: {}}
assert result == expected_result
mock_get_rare_variants_by_gene.assert_called_once_with(gene=mock_gene, sort='variant.ID')

@patch('dcicutils.variant_utils.VariantUtils.return_json')
def test_create_list_of_msa_genes(self, mock_return_json):
with self.mock_key_manager():
vu = VariantUtils(env_name='cgap-dummy')
mock_return_json.return_value = [
{'gene_symbol': 'GENE1', 'gene_summary': '...nerv...'},
{'gene_symbol': 'GENE2', 'gene_summary': '..........'},
{'gene_symbol': 'GENE3', 'gene_summary': '...neur...'}
]
result = vu.create_list_of_msa_genes()
expected_result = ['GENE1', 'GENE3']
assert result == expected_result
mock_return_json.assert_called_once_with('gene.json')

@patch('dcicutils.variant_utils.VariantUtils.get_rare_variants_by_gene')
def test_find_number_of_sample_ids(self, mock_get_rare_variants_by_gene):
with self.mock_key_manager():
vu = VariantUtils(env_name='cgap_dummy')
mock_gene = 'GENE'
mock_get_rare_variants_by_gene.return_value = [
{'CALL_INFO': 'ABC123'},
{'CALL_INFO': 'ABC123'},
{'CALL_INFO': 'BCD234'},
{'CALL_INFO': 'CDE345'}
]
result = vu.find_number_of_sample_ids(mock_gene)
expected_result = 3
assert result == expected_result
mock_get_rare_variants_by_gene.assert_called_once_with(gene=mock_gene, sort='variant.ID')

@pytest.mark.parametrize('pos', [
'100000',
'200000',
'300000',
'400000'
])
@patch('dcicutils.variant_utils.VariantUtils.create_dict_from_json_file')
def test_create_url(self, mock_create_dict_from_json_file, pos):
with self.mock_key_manager():
vu = VariantUtils(env_name='cgap_dummy')
mock_gene = 'GENE'
mock_create_dict_from_json_file.return_value = {
'GENE': {pos: 20, '123456': 10},
'OTHER_GENE': {pos: 10}
}
result = vu.create_url(gene=mock_gene)
expected_result = vu.SEARCH_RARE_VARIANTS_BY_GENE + mock_gene + (f'&variant.POS.from={pos}'
f'&variant.POS.to={pos}&sort=-DP')
assert result == expected_result
mock_create_dict_from_json_file.assert_called_once_with('10+sorted_msa_genes_and_mutations.json')

@patch('dcicutils.variant_utils.VariantUtils.return_json')
def test_create_list_of_als_park_genes(self, mock_return_json):
with self.mock_key_manager():
vu = VariantUtils(env_name='cgap-dummy')
mock_return_json.return_value = [
{'gene_symbol': 'GENE1', 'gene_summary': '...Parkinson...'},
{'gene_symbol': 'GENE2', 'gene_summary': '...............'},
{'gene_symbol': 'GENE3', 'gene_summary': '.....ALS.......'}
]
result = vu.create_list_of_als_park_genes()
expected_result = ['GENE1', 'GENE3']
assert result == expected_result
mock_return_json.assert_called_once_with('gene.json')

0 comments on commit 3972a56

Please sign in to comment.