fix: de correction (#96)

* fix: taking absolute l2fc values to sort * fix: explicitely refitting genewise dispersions * fix: less warning for font selection, tinkering with parameterization fit * feat: allowing more threads per de analysis * feat: entered default threads for the index and aligning rules * feat: allowing for different plot format outputs * feat: categorized deseq2 configurations, added plot figure type selection * feat: new samples * temp: temorarily removed temp flag * refactor: more stample diff exp output * feat: CI config updated for changes in code * style: format string instead of string concatenation * style: format string instead of string concatenation * fix: syntax * fix: added missing config option * fix: removed printf debugging output * feat: using inference to set threading globally * fix: removed 'batch' from design factors * style: one more format string instead of path concatenation * fix: using nonsense alpha for ci * fix: using nonsense alpha for ci
snakemake-workflows · Oct 12, 2024 · f56f660 · f56f660
1 parent e838bdd
commit f56f660
Show file tree

Hide file tree

Showing 8 changed files with 185 additions and 125 deletions.
diff --git a/.test/config-simple/config.yml b/.test/config-simple/config.yml
@@ -64,40 +64,49 @@ min_gene_expr: 10
 # Minimum transcript counts
 min_feature_expr: 3
 
-# This section defines the deseq2 plot and data handling parameters
-#
-# the "design factors" are the confounding variables to be adjusted for
-# during the normalization. They must be given in the configuration (samples.csv)
-design_factors:
-    - "condition"
-#
-# The (log2) log fold change under the null hypothesis. (default: 0).
-lfc_null: 0.1
-#
-# The alternative hypothesis for computing wald p-values. By default,
-# the normal Wald test assesses deviation of the estimated log fold
-# change from the null hypothesis, as given by lfc_null.
-# One of ["greaterAbs", "lessAbs", "greater", "less"] or None.
-# The alternative hypothesis corresponds to what the user wants to
-# find rather than the null hypothesis. (default: None).
-alt_hypothesis: "greaterAbs"
-#
-# The marker size in points**2 (typographic points are 1/72 in.).
-# Default is rcParams['lines.markersize'] ** 2.# minimum count to
-# be considered for subsequent analysis
-point_width: 20
-#
-#
-mincount: 10
-#
-# Type I error cutoff value
-alpha: 10
-#
-# in addition to the full heatmap, plot the top number of different
-# values, ranked by the top ratio between the two traits
-threshold_plot: 10
-#
-# the heatmap color map
-# see https://seaborn.pydata.org/tutorial/color_palettes.htm for an overview
-colormap: "flare"
+
+# This section defines the pyDESeq2 plot and data handling parameters
+deseq2:
+    # normalization fit type, must be 'parametric' or 'mean'
+    fit_type: "mean"
+    # the "design factors" are the confounding variables to be adjusted for
+    # during normalization. They must be given in the configuration (samples.csv).
+    design_factors: 
+        - "condition"
+    #
+    # the "continuous factors" are non-categorical factors to be considered
+    #continuous_factors:
+    #    - 
+    #
+    # The (log2) log fold change under the null hypothesis. (default: 0).
+    lfc_null: 0.5
+    #
+    # The alternative hypothesis for computing wald p-values. By default,
+    # the normal Wald test assesses deviation of the estimated log fold
+    # change from the null hypothesis, as given by lfc_null.
+    # One of ["greaterAbs", "lessAbs", "greater", "less"] or None.
+    # The alternative hypothesis corresponds to what the user wants to
+    # find rather than the null hypothesis. (default: None).
+    alt_hypothesis: "greaterAbs"
+    #
+    # The marker size in points**2
+    # (typographic points are 1/72 in.).
+    # Default is rcParams['lines.markersize'] ** 2.
+    point_width: 20
+    #
+    # we disregard loci with a count number lower than 'mincount'
+    mincount: 10
+    #
+    # Type I error cutoff value (deliberately nonsensical here, for CI only):
+    alpha: 10
+    #
+    # in addition to the full heatmap, plot the top number of different
+    # values, ranked by the top ratio between the two traits
+    threshold_plot: 10
+    #
+    # the heatmap color map
+    # see https://seaborn.pydata.org/tutorial/color_palettes.html for an overview
+    colormap: "flare"
+    # plot figure type, used as the file extension of the plots (e.g. "png", "svg")
+    figtype: "png"
 
diff --git a/config/Mainz-MogonNHR/config.yml b/config/Mainz-MogonNHR/config.yml
@@ -13,6 +13,7 @@ repo: "https://github.com/snakemake-workflows/transriptome-differential-expressi
 
 ## Workflow-specific Parameters:
 
+# the reference genome or transcriptome is defined here:
 ref:
     species: "Chironomus riparius"
     # NCBI accession number of the reference data set
@@ -63,39 +64,46 @@ min_gene_expr: 10
 min_feature_expr: 3
 
 # This section defines the pyDESeq2 plot and data handling parameters
-#
-# the "design factors" are the confounding variables to be adjusted fr
-# during normalization. They must be given in the configuration (samples.csv).
-design_factors: 
-  - "batch"
-  - "condition"
-#
-# The (log2) log fold change under the null hypothesis. (default: 0).
-lfc_null: 1
-#
-# The alternative hypothesis for computing wald p-values. By default,
-# the normal Wald test assesses deviation of the estimated log fold
-# change from the null hypothesis, as given by lfc_null.
-# One of ["greaterAbs", "lessAbs", "greater", "less"] or None.
-# The alternative hypothesis corresponds to what the user wants to
-# find rather than the null hypothesis. (default: None).
-alt_hypothesis: "greaterAbs"
-#
-# The marker size in points**2 (typographic points are 1/72 in.).
-# Default is rcParams['lines.markersize'] ** 2.# minimum count to
-# be considered for subsequent analysis
-point_width: 20
-#
-# we disrecard loci with count number lower 'mincount'
-mincount: 10
-#
-# Type I error cutoff value:
-alpha: 0.05
-#
-# in addition to the full heatmap, plot the top number of different
-# values, ranked by the top ratio between the two traits
-threshold_plot: 10
-#
-# the heatmap color map
-# see https://seaborn.pydata.org/tutorial/color_palettes.htm for an overview
-colormap: "flare"
+deseq2:
+    # normalization fit type, must be 'parametric' or 'mean'
+    fit_type: "mean"
+    # the "design factors" are the confounding variables to be adjusted for
+    # during normalization. They must be given in the configuration (samples.csv).
+    design_factors: 
+        - "condition"
+    #
+    # the "continuous factors" are non-categorical factors to be considered
+    #continuous_factors:
+    #    - 
+    #
+    # The (log2) log fold change under the null hypothesis. (default: 0).
+    lfc_null: 0.5
+    #
+    # The alternative hypothesis for computing wald p-values. By default,
+    # the normal Wald test assesses deviation of the estimated log fold
+    # change from the null hypothesis, as given by lfc_null.
+    # One of ["greaterAbs", "lessAbs", "greater", "less"] or None.
+    # The alternative hypothesis corresponds to what the user wants to
+    # find rather than the null hypothesis. (default: None).
+    alt_hypothesis: "greaterAbs"
+    #
+    # The marker size in points**2
+    # (typographic points are 1/72 in.).
+    # Default is rcParams['lines.markersize'] ** 2.
+    point_width: 20
+    #
+    # we disregard loci with a count number lower than 'mincount'
+    mincount: 10
+    #
+    # Type I error cutoff value:
+    alpha: 0.05
+    #
+    # in addition to the full heatmap, plot the top number of different
+    # values, ranked by the top ratio between the two traits
+    threshold_plot: 10
+    #
+    # the heatmap color map
+    # see https://seaborn.pydata.org/tutorial/color_palettes.html for an overview
+    colormap: "flare"
+    # plot figure type, used as the file extension of the plots (e.g. "png", "svg")
+    figtype: "png"
diff --git a/config/Mainz-MogonNHR/samples.csv b/config/Mainz-MogonNHR/samples.csv
@@ -1,13 +1,21 @@
 sample  condition   condition2	batch    platform    purity
-m18_bc01    male    condition2	batch1          NANOPORE    1
-m18_bc02    male    condition2	batch1          NANOPORE    1
-m18_bc03    female  condition2	batch1          NANOPORE    1
-m18_bc04    female  condition2	batch1          NANOPORE    1
-m18_bc05    female  condition2	batch1          NANOPORE    1
-m18_bc06    male    condition2	batch1          NANOPORE    1
-ma_bc07     male    condition2	batch2          NANOPORE    1
-ma_bc08     male    condition2	batch2          NANOPORE    1
-ma_bc09     female  condition2	batch2          NANOPORE    1
+m18_bc01    male    imago	batch1          NANOPORE    1
+m18_bc02    male    imago	batch1          NANOPORE    1
+m18_bc03    female  imago	batch1          NANOPORE    1
+m18_bc04    female  imago	batch1          NANOPORE    1
+m18_bc05    female  imago	batch1          NANOPORE    1
+m18_bc06    male    imago	batch1          NANOPORE    1
+#ma_bc07     male    imago	batch2          NANOPORE    1
+#ma_bc08     male    imago	batch2          NANOPORE    1
+#ma_bc09     female  imago	batch2          NANOPORE    1
 #ma_bc10     male    condition2	batch2          NANOPORE    1
 #ma_bc11     male    condition2	batch2          NANOPORE    1
 #ma_bc12     female  condition2	batch2          NANOPORE    1
+m18_bc10    male    larval  batch3  NANOPORE    1
+m18_bc11    female  larval  batch3  NANOPORE    1
+m18_bc12    male    larval  batch3  NANOPORE    1
+m18_bc13    female  larval  batch3  NANOPORE    1
+m18_bc14    female  larval  batch3  NANOPORE    1 
+#m18_bc15   unkown  embryo  batch3  NANOPORE    1
+m18_bc16    male    larval  batch3  NANOPORE    1
+#m18_bc17   unkown  embryo  batch3  NANOPORE    1
diff --git a/workflow/profile/Mainz-MogonNHR/config.yaml b/workflow/profile/Mainz-MogonNHR/config.yaml
@@ -19,7 +19,7 @@ set-resources:
         runtime: "30m"
 
     map_reads:
-        cpus_per_task: 32
+        threads: 32
         mem_mb_per_cpu: 1800
         runtime: "3h"
         slurm_partition: "smallcpu" # needs benchmarking
@@ -65,6 +65,6 @@ set-resources:
         runtime: "1h"
 
     de_analysis:
-        cpus_per_task: 4
+        cpus_per_task: 8
         mem_mb_per_cpu: 5000
         runtime: "1h"
diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk
@@ -7,6 +7,7 @@ rule build_minimap_index:  ## build minimap2 index
         extra=config["minimap_index_opts"],
     log:
         "logs/minimap2/index.log",
+    threads: 4
     wrapper:
         "v3.13.4/bio/minimap2/index"
 
@@ -22,5 +23,6 @@ rule map_reads:
         "logs/minimap2/mapping_{sample}.log",
     params:
         extra=f"-p {config['secondary_score_ratio']} -N {config['maximum_secondary']} {config['minimap2_opts']}",
+    threads: 32
     wrapper:
         "v3.13.4/bio/minimap2/aligner"
diff --git a/workflow/rules/commons.smk b/workflow/rules/commons.smk
@@ -77,8 +77,8 @@ def rule_all_input():
         expand("counts/{sample}_salmon/quant.sf", sample=samples["sample"])
     )
     all_input.append("merged/all_counts.tsv")
-    all_input.append("de_analysis/dispersion_graph.svg")
-    all_input.append("de_analysis/ma_graph.svg")
-    all_input.append("de_analysis/heatmap.svg")
+    all_input.append(f"de_analysis/dispersion_graph.{config['deseq2']['figtype']}")
+    all_input.append(f"de_analysis/ma_graph.{config['deseq2']['figtype']}")
+    all_input.append(f"de_analysis/heatmap.{config['deseq2']['figtype']}")
     all_input.append("de_analysis/lfc_analysis.csv")
     return all_input
diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk
@@ -3,49 +3,50 @@ rule de_analysis:
         all_counts=rules.merge_counts.output,
     output:
         dispersion_graph=report(
-            "de_analysis/dispersion_graph.svg",
+            f"de_analysis/dispersion_graph.{config['deseq2']['figtype']}",
             category="Results",
             caption="../report/dispersion_graph.rst",
             labels={
                 "figure": "Dispersion graph",
             },
         ),
         ma_graph=report(
-            "de_analysis/ma_graph.svg",
+            f"de_analysis/ma_graph.{config['deseq2']['figtype']}",
             category="Results",
             caption="../report/ma_graph.rst",
             labels={
                 "figure": "MA plot",
             },
         ),
         de_heatmap=report(
-            "de_analysis/heatmap.svg",
+            f"de_analysis/heatmap.{config['deseq2']['figtype']}",
             category="Results",
             caption="../report/heatmap.rst",
             labels={
                 "figure": "Gene heatmap",
             },
         ),
         correlation_matrix=report(
-            "de_analysis/correlation_matrix.svg",
+            f"de_analysis/correlation_matrix.{config['deseq2']['figtype']}",
             category="Results",
             caption="../report/correlation_matrix.rst",
             labels={
                 "figure": "Correlation matrix",
             },
         ),
-        normalized_counts="de_analysis/normalized_counts.csv",
+        normalized_counts=report("de_analysis/normalized_counts.csv"),
         de_top_heatmap=report(
-            "de_analysis/heatmap_top.svg",
+            f"de_analysis/heatmap_top.{config['deseq2']['figtype']}",
             category="Results",
             caption="../report/heatmap_top.rst",
             labels={
                 "figure": "Top gene heatmap",
             },
         ),
+        sorted_normalized_counts=report("de_analysis/sorted_normalized_counts.csv"),
         lfc_analysis="de_analysis/lfc_analysis.csv",
         volcano_plot=report(
-            "de_analysis/volcano_plot.svg",
+            f"de_analysis/volcano_plot.{config['deseq2']['figtype']}",
             category="Results",
             caption="../report/volcano_plot.rst",
             labels={
@@ -56,7 +57,7 @@ rule de_analysis:
         samples=samples,
     log:
         "logs/de_analysis.log",
-    threads: 4
+    threads: 8
     conda:
         "../envs/pydeseq2.yml"
     script: