diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fe2b0b147..8cae87006 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,12 @@ Change Log ---------- +7.8.0 +===== + +* Add ``variant_utils`` with tools to filter through CGAP data. + + 7.7.2 ===== diff --git a/CONTRIBUTORS.json b/CONTRIBUTORS.json index e354e2f11..311bddb32 100644 --- a/CONTRIBUTORS.json +++ b/CONTRIBUTORS.json @@ -47,9 +47,9 @@ }, "David Michaels": { "emails": [ + "105234079+dmichaels-harvard@users.noreply.github.com", "david_michaels@hms.harvard.edu", - "dmichaels@gmail.com", - "105234079+dmichaels-harvard@users.noreply.github.com" + "dmichaels@gmail.com" ], "names": [ "David Michaels", @@ -58,8 +58,8 @@ }, "Douglas Rioux": { "emails": [ - "douglas_rioux@hms.harvard.edu", - "58236592+drio18@users.noreply.github.com" + "58236592+drio18@users.noreply.github.com", + "douglas_rioux@hms.harvard.edu" ], "names": [ "Douglas Rioux", @@ -85,8 +85,8 @@ }, "Kent M Pitman": { "emails": [ - "netsettler@users.noreply.github.com", - "kent_pitman@hms.harvard.edu" + "kent_pitman@hms.harvard.edu", + "netsettler@users.noreply.github.com" ], "names": [ "Kent M Pitman", @@ -129,6 +129,16 @@ "SooLee" ] }, + "Tom Duraisingh": { + "emails": [ + "Thomas_Duraisingh@hms.harvard.edu", + "contributors.TomDuraisingh.emails.138792649+TomDuraisingh@users.noreply.github.com" + ], + "names": [ + "TomDuraisingh", + "Tom Duraisingh" + ] + }, "Will Ronchetti": { "emails": [ "wrr33@cornell.edu" diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index e50ececf8..37a0439db 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -961,7 +961,7 @@ def get_schema_names(key=None, ff_env=None): if value.get('isAbstract') is True: continue # some test schemas in local don't have the id field - schema_filename = value.get('id') + schema_filename = value.get('$id') if schema_filename: schema_name[key] = schema_filename.split('/')[-1][:-5] return schema_name diff --git a/dcicutils/variant_utils.py 
class VariantUtils:
    """Helpers for searching CGAP ``VariantSample`` items by gene.

    Credentials for the requested environment are looked up through
    ``CGAPKeyManager`` when the object is constructed and are reused for every
    portal call made by the instance.
    """

    # Search path for VariantSample items filtered on the most severe gene
    # (limit=1); the gene symbol is appended to the end of the string.
    SEARCH_VARIANTS_BY_GENE = ('/search/?type=VariantSample&limit=1'
                               '&variant.genes.genes_most_severe_gene.display_title=')
    # Search path for proband VariantSample items filtered on
    # variant.csq_gnomadg_af_popmax from 0 to 0.001; the gene symbol is appended.
    SEARCH_RARE_VARIANTS_BY_GENE = ('/search/?samplegeno.samplegeno_role=proband&type=VariantSample'
                                    '&variant.csq_gnomadg_af_popmax.from=0&variant.csq_gnomadg_af_popmax.to=0.001'
                                    '&variant.genes.genes_most_severe_gene.display_title=')

    def __init__(self, *, env_name) -> None:
        """Load the key dictionary for ``env_name`` and remember its server.

        :param env_name: environment name passed to ``CGAPKeyManager.get_keydict_for_env``.
        :raises KeyError: if the returned key dictionary has no ``'server'`` entry.
        """
        self._key_manager = CGAPKeyManager()
        self.creds = self._key_manager.get_keydict_for_env(env=env_name)
        self.base_url = self.creds['server']

    def get_creds(self):
        """Return the key dictionary obtained for this instance's environment."""
        return self.creds

    def _rare_variants_search_url(self, *, gene, sort, addon=''):
        """Build the rare-variant search URL for ``gene`` sorted descending by ``sort``."""
        # The search path already starts with '/', so drop any trailing slash on
        # the server to avoid producing '//search/...'.
        server = self.base_url.rstrip('/')
        # Built on a single line: a backslash continuation inside the string
        # literal would embed the next line's indentation in the query string.
        return f'{server}{self.SEARCH_RARE_VARIANTS_BY_GENE}{gene}&sort=-{sort}{addon}'

    def get_rare_variants_by_gene(self, *, gene, sort, addon=''):
        """Search for rare variants on a particular gene.

        :param gene: gene symbol appended to ``SEARCH_RARE_VARIANTS_BY_GENE``.
        :param sort: field name to sort on; it is sent as ``&sort=-<sort>``.
        :param addon: extra query-string text appended verbatim (e.g. ``'&limit=5'``).
        :returns: whatever ``search_metadata`` returns for the query.

        NOTE(review): this passes a server-qualified URL to ``search_metadata``
        while ``get_total_result_count_from_search`` passes a bare path to
        ``get_metadata`` - confirm which form the ff_utils helpers expect.
        """
        return search_metadata(self._rare_variants_search_url(gene=gene, sort=sort, addon=addon),
                               key=self.creds)

    def find_number_of_sample_ids(self, gene):
        """Return the number of distinct ``CALL_INFO`` values among the rare variants of ``gene``.

        Results lacking ``CALL_INFO`` all count as one value (``None``).
        """
        variants = self.get_rare_variants_by_gene(gene=gene, sort='variant.ID')
        return len({variant.get('CALL_INFO') for variant in variants})

    def get_total_result_count_from_search(self, gene):
        """Return the ``'total'`` entry of the by-gene VariantSample search result for ``gene``."""
        res = get_metadata(self.SEARCH_VARIANTS_BY_GENE + gene, key=self.creds)
        return res['total']

    @staticmethod
    def sort_dict_in_descending_order(unsorted_dict):
        """Return a new dict with the same items ordered by descending value.

        The sort is stable, so items with equal values keep their original order.
        """
        sorted_list = sorted(unsorted_dict.items(), key=lambda item: item[1], reverse=True)
        return dict(sorted_list)

    def create_dict_of_mutations(self, gene, *, min_count=10):
        """Count rare variants of ``gene`` per position and keep the frequent positions.

        :param gene: gene symbol to search for.
        :param min_count: minimum number of occurrences for a position to be kept
            (defaults to 10, the previously hard-coded threshold).
        :returns: ``{gene: {position: count, ...}}`` ordered by descending count.
        :raises KeyError: if a search result lacks ``['variant']['POS']``.
        """
        counts = {}
        for variant in self.get_rare_variants_by_gene(gene=gene, sort='variant.ID'):
            pos = variant['variant']['POS']
            counts[pos] = counts.get(pos, 0) + 1
        frequent = {pos: count for pos, count in counts.items() if count >= min_count}
        return {gene: self.sort_dict_in_descending_order(frequent)}

    @staticmethod
    def return_json(file_name):
        """Parse the JSON file at ``file_name`` and return its content."""
        with open(file_name, 'r', encoding='utf-8') as f:
            # json.load reads from a file object; json.loads accepts only
            # str/bytes and raises TypeError when handed the file itself.
            file_content = json.load(f)
        return file_content

    @staticmethod
    def create_dict_from_json_file(file_name):
        """Create a Python object (typically a dict) from the specified JSON file."""
        with open(file_name, encoding='utf-8') as f:
            return json.load(f)

    def _gene_symbols_with_summary_keywords(self, keywords):
        """Return ``gene_symbol`` of every gene in ``gene.json`` whose summary contains any keyword."""
        genes = self.return_json('gene.json')
        return [gene['gene_symbol'] for gene in genes
                if any(keyword in gene.get('gene_summary', '') for keyword in keywords)]

    def create_list_of_msa_genes(self):
        """Create a list of genes relating to the brain or nervous system.

        A gene from ``gene.json`` is included when its ``gene_summary`` contains
        the substring ``'nerv'`` or ``'neur'`` (case-sensitive).
        """
        return self._gene_symbols_with_summary_keywords(('nerv', 'neur'))

    def create_url(self, gene):
        """Return the search path for variants at the first listed position of ``gene``.

        Positions are read from ``10+sorted_msa_genes_and_mutations.json``; the
        first key stored under ``gene`` is used, which is the most commonly
        mutated position when the file was written in descending-count order.
        The result starts with ``SEARCH_RARE_VARIANTS_BY_GENE`` and does not
        include the server.

        :raises KeyError: if ``gene`` is not in the file.
        :raises IndexError: if ``gene`` has no positions recorded.
        """
        d = self.create_dict_from_json_file('10+sorted_msa_genes_and_mutations.json')
        pos = list(d[gene])[0]
        return self.SEARCH_RARE_VARIANTS_BY_GENE + gene + f'&variant.POS.from={pos}&variant.POS.to={pos}&sort=-DP'

    def create_list_of_als_park_genes(self):
        """Create a list of genes relating to Parkinson's or ALS.

        A gene from ``gene.json`` is included when its ``gene_summary`` contains
        the substring ``'Parkinson'`` or ``'ALS'`` (case-sensitive).
        """
        return self._gene_symbols_with_summary_keywords(('Parkinson', 'ALS'))
def create_dummy_keydict():
    """Return a fake keydict holding placeholder credentials for one dummy environment."""
    dummy_entry = {
        'key': 'dummy', 'secret': 'dummy',
        'server': 'cgap-test.com'
    }
    return {'cgap-dummy': dummy_entry}


class TestVariantUtils:
    """Unit tests for VariantUtils with the key manager and the portal helpers patched out."""

    class CGAPKeyManager:
        """Stand-in key manager: every environment resolves to the dummy credentials."""

        def get_keydict_for_env(self, *, env):
            keydict = create_dummy_keydict()
            return keydict['cgap-dummy']

    @contextmanager
    def mock_key_manager(self):
        """Replace ``variant_utils.CGAPKeyManager`` with the stand-in while the context is active."""
        with mock.patch.object(variant_utils, 'CGAPKeyManager', new=self.CGAPKeyManager):
            yield

    def test_variant_utils_basic(self):
        """Tests the instantiation of a VariantUtils object """
        with self.mock_key_manager():
            utils = VariantUtils(env_name='cgap-dummy')
        assert isinstance(utils, VariantUtils)

    @pytest.mark.parametrize('total_value', [
        100,
        200,
        300,
        400
    ])
    @patch('dcicutils.variant_utils.get_metadata')
    def test_get_total_result_count_from_search(self, mock_get_metadata, total_value):
        """The count is the 'total' field of the single get_metadata search call."""
        gene_symbol = 'GENE'
        mock_get_metadata.return_value = {'total': total_value}
        with self.mock_key_manager():
            utils = VariantUtils(env_name='cgap-dummy')
        count = utils.get_total_result_count_from_search(gene_symbol)
        assert count == total_value
        expected_query = ('/search/?type=VariantSample&limit=1'
                          '&variant.genes.genes_most_severe_gene.display_title='
                          + gene_symbol)
        mock_get_metadata.assert_called_once_with(expected_query, key=utils.creds)

    @pytest.mark.parametrize('returned_variants, expected_length', [
        ([{'variant': {'POS': 100000}}], 8),
        ([{'variant': {'POS': 100000}}], 9),
        ([{'variant': {'POS': 100000}}], 10),
        ([{'variant': {'POS': 100000}}], 11),
    ])
    @patch('dcicutils.variant_utils.VariantUtils.get_rare_variants_by_gene')
    def test_create_dict_of_mutations(self, mock_get_rare_variants_by_gene, returned_variants, expected_length):
        """A position is reported only when it occurs at least ten times."""
        gene_symbol = 'GENE'
        # The single variant is repeated expected_length times to form the search result.
        mock_get_rare_variants_by_gene.return_value = returned_variants * expected_length
        with self.mock_key_manager():
            utils = VariantUtils(env_name='cgap-dummy')
        mutations = utils.create_dict_of_mutations(gene_symbol)
        surviving = {100000: expected_length} if expected_length >= 10 else {}
        assert mutations == {gene_symbol: surviving}
        mock_get_rare_variants_by_gene.assert_called_once_with(gene=gene_symbol, sort='variant.ID')

    @patch('dcicutils.variant_utils.VariantUtils.return_json')
    def test_create_list_of_msa_genes(self, mock_return_json):
        """Genes whose summary mentions 'nerv' or 'neur' are selected, in file order."""
        mock_return_json.return_value = [
            {'gene_symbol': 'GENE1', 'gene_summary': '...nerv...'},
            {'gene_symbol': 'GENE2', 'gene_summary': '..........'},
            {'gene_symbol': 'GENE3', 'gene_summary': '...neur...'}
        ]
        with self.mock_key_manager():
            utils = VariantUtils(env_name='cgap-dummy')
        selected = utils.create_list_of_msa_genes()
        assert selected == ['GENE1', 'GENE3']
        mock_return_json.assert_called_once_with('gene.json')

    @patch('dcicutils.variant_utils.VariantUtils.get_rare_variants_by_gene')
    def test_find_number_of_sample_ids(self, mock_get_rare_variants_by_gene):
        """Duplicate CALL_INFO values are counted once."""
        gene_symbol = 'GENE'
        mock_get_rare_variants_by_gene.return_value = [
            {'CALL_INFO': 'ABC123'},
            {'CALL_INFO': 'ABC123'},
            {'CALL_INFO': 'BCD234'},
            {'CALL_INFO': 'CDE345'}
        ]
        with self.mock_key_manager():
            utils = VariantUtils(env_name='cgap_dummy')
        sample_count = utils.find_number_of_sample_ids(gene_symbol)
        assert sample_count == 3
        mock_get_rare_variants_by_gene.assert_called_once_with(gene=gene_symbol, sort='variant.ID')

    @pytest.mark.parametrize('pos', [
        '100000',
        '200000',
        '300000',
        '400000'
    ])
    @patch('dcicutils.variant_utils.VariantUtils.create_dict_from_json_file')
    def test_create_url(self, mock_create_dict_from_json_file, pos):
        """The URL targets the first position listed for the gene in the JSON file."""
        gene_symbol = 'GENE'
        mock_create_dict_from_json_file.return_value = {
            'GENE': {pos: 20, '123456': 10},
            'OTHER_GENE': {pos: 10}
        }
        with self.mock_key_manager():
            utils = VariantUtils(env_name='cgap_dummy')
        url = utils.create_url(gene=gene_symbol)
        position_filter = f'&variant.POS.from={pos}' + f'&variant.POS.to={pos}&sort=-DP'
        assert url == utils.SEARCH_RARE_VARIANTS_BY_GENE + gene_symbol + position_filter
        mock_create_dict_from_json_file.assert_called_once_with('10+sorted_msa_genes_and_mutations.json')

    @patch('dcicutils.variant_utils.VariantUtils.return_json')
    def test_create_list_of_als_park_genes(self, mock_return_json):
        """Genes whose summary mentions 'Parkinson' or 'ALS' are selected, in file order."""
        mock_return_json.return_value = [
            {'gene_symbol': 'GENE1', 'gene_summary': '...Parkinson...'},
            {'gene_symbol': 'GENE2', 'gene_summary': '...............'},
            {'gene_symbol': 'GENE3', 'gene_summary': '.....ALS.......'}
        ]
        with self.mock_key_manager():
            utils = VariantUtils(env_name='cgap-dummy')
        selected = utils.create_list_of_als_park_genes()
        assert selected == ['GENE1', 'GENE3']
        mock_return_json.assert_called_once_with('gene.json')