From bbcb5d54b3f0fa5d66cb8719e5a86f2d0d0f1264 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 13 Sep 2023 11:43:43 +0200 Subject: [PATCH 1/4] Decrease execution time --- workflow/scripts/weighted_distances.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/workflow/scripts/weighted_distances.py b/workflow/scripts/weighted_distances.py index b091183..79d8728 100644 --- a/workflow/scripts/weighted_distances.py +++ b/workflow/scripts/weighted_distances.py @@ -48,7 +48,7 @@ def tsv_from_seq(tsv_reference:str ,reference:str , reference_name:str) -> pd.Da pos = [] alt = [] for i in range(1,len(tsv_reference) + 1): - if i not in mask: + if i not in mask and tsv_reference[i -1] != reference[i-1]: pos.append(i) alt.append(reference[i -1]) @@ -103,29 +103,31 @@ def calc_fst_weir_cockerham(hs:float, ht:float) -> float: return (ht - hs) / ht if ht != 0 else 0 -def get_dif_n(df:pd.DataFrame, COV1:str, COV2:str, mask:tuple, reference:str, freq:dict) -> float: +def get_dif_n(df:pd.DataFrame, COV1:str, COV2:str, reference:str, freq:dict) -> float: + + positions = df["POS"].unique().tolist() df1 = df[df["REGION"] == COV1] df2 = df[df["REGION"] == COV2] - return sum([calc_fst_weir_cockerham(*calc_heterozygosities(df1, df2, i, reference, freq)) - for i in range(len(reference)) if i + 1 not in mask]) + return sum([calc_fst_weir_cockerham(*calc_heterozygosities(df1, df2, i -1, reference, freq)) + for i in positions]) -def _calculate_distance(df:pd.DataFrame, sample:str, mask_positions:tuple, reference:str, freq:dict, cov_list:list) -> list: - return [get_dif_n(df, sample, cov, mask_positions, reference, freq) for cov in cov_list] +def _calculate_distance(df:pd.DataFrame, sample:str,reference:str, freq:dict, cov_list:list) -> list: + return [get_dif_n(df, sample, cov, reference, freq) for cov in cov_list] def get_matrix(df:pd.DataFrame, cov_list:list, reference:str, freq:dict, num_jobs:int) -> pd.DataFrame: distance_matrix = {} - mask_positions = parse_vcf() + with mp.Pool(num_jobs) as pool: results = pool.starmap( _calculate_distance, - [ (df, sample, mask_positions, reference, freq, cov_list) for sample in cov_list ] + [ (df, sample, reference, freq, cov_list) for sample in cov_list ] ) for i, sample in enumerate(cov_list): From a4fdb4b7ace724e10adf1b9def8e3360007123f8 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 13 Sep 2023 12:56:22 +0200 Subject: [PATCH 2/4] Solve format errors and empty positions list bug --- workflow/scripts/weighted_distances.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/workflow/scripts/weighted_distances.py b/workflow/scripts/weighted_distances.py index 79d8728..df76514 100644 --- a/workflow/scripts/weighted_distances.py +++ b/workflow/scripts/weighted_distances.py @@ -106,11 +106,13 @@ def calc_fst_weir_cockerham(hs:float, ht:float) -> float: def get_dif_n(df:pd.DataFrame, COV1:str, COV2:str, reference:str, freq:dict) -> float: positions = df["POS"].unique().tolist() - + if len(positions) == 0: + return 0 + df1 = df[df["REGION"] == COV1] df2 = df[df["REGION"] == COV2] - return sum([calc_fst_weir_cockerham(*calc_heterozygosities(df1, df2, i -1, reference, freq)) + return sum([calc_fst_weir_cockerham(*calc_heterozygosities(df1, df2, i-1, reference, freq)) for i in positions]) @@ -122,7 +124,6 @@ def get_matrix(df:pd.DataFrame, cov_list:list, reference:str, freq:dict, num_job distance_matrix = {} - with mp.Pool(num_jobs) as pool: results = pool.starmap( From 6a48dd5529f367b86c9625ca60cbba9f2a49c9f1 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 13 Sep 2023 22:25:26 +0200 Subject: [PATCH 3/4] Store positions to integrers --- workflow/scripts/weighted_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/weighted_distances.py b/workflow/scripts/weighted_distances.py index df76514..3661db8 100644 --- a/workflow/scripts/weighted_distances.py +++ b/workflow/scripts/weighted_distances.py @@ -105,7 +105,7 @@ def calc_fst_weir_cockerham(hs:float, ht:float) -> float: def get_dif_n(df:pd.DataFrame, COV1:str, COV2:str, reference:str, freq:dict) -> float: - positions = df["POS"].unique().tolist() + positions = df["POS"].astype("Int64").unique().tolist() if len(positions) == 0: return 0 From d17d1fb70e03c74746f9dac9bfc41b0dab93c64b Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 13 Sep 2023 23:58:48 +0200 Subject: [PATCH 4/4] Provide the list of samples to weighted distances script --- workflow/rules/distances.smk | 1 + workflow/scripts/weighted_distances.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/rules/distances.smk b/workflow/rules/distances.smk index 1a03a5f..dc7107d 100644 --- a/workflow/rules/distances.smk +++ b/workflow/rules/distances.smk @@ -2,6 +2,7 @@ rule weighted_distances: threads: 4 conda: "../envs/biopython.yaml" params: + samples = expand("{sample}", sample = iter_samples()), mask_class = ["mask"], tsv_reference = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", reference = OUTDIR/"reference.fasta" diff --git a/workflow/scripts/weighted_distances.py b/workflow/scripts/weighted_distances.py index 3661db8..37e08a3 100644 --- a/workflow/scripts/weighted_distances.py +++ b/workflow/scripts/weighted_distances.py @@ -159,7 +159,8 @@ def main(): logging.info("Reading input tables") df = read_and_concatenate_tsvs(snakemake.input.tsv, reference, outgroup, outgroup_name) - cov_list = df["REGION"].unique().tolist() + cov_list = snakemake.params.samples + cov_list.append(outgroup_name) logging.info(f"Parallelizing the calculation with {snakemake.threads} jobs") freq = create_freq_dict(snakemake.input.tsv)