From 6c1e48df21c82dbbff6586bde86936c594730a00 Mon Sep 17 00:00:00 2001 From: "Kamil S. Jaron" Date: Mon, 4 Nov 2024 02:47:10 -0800 Subject: [PATCH] Sploidyplot (#169) * resolved speckliness with a new alt plot, its not yet the default, but it works, can be run with --alt_plot flag * fixing #153 - suprise suprise, smudgeplot package already has a function for weighted quantiles :-) * keeping track of all the testing and proof of concept functions, most notably, local agregation for the alt plot * quantile filtering is now applied after coverage filter * adding the first draft of peak fitting using centrality * adding code for popart plot * added peak agregation algorithm in the main smudgeplot source code * adding python funciton that generates the smudge container given coverges * the new version is nearly working the only missing bit is the smudgeplot itself * the over-very-lnog time playiung around stuff * histogramless, but working version of smudgeplot ORIEL * updating readme with the new imlpemenation * giant cleanup - removing the R package and simplifying the installation * fixing / simplifying installation instructions * streamlining interface and allowing for plotting parameters to be passed to all subprocess * making title work * fixing interface problem to call hetmer module; adding kmer numbers to the log; adding exploration of 1/2 of the final coverage, but it still does not solve the problem of autotetraploids * adding parametrised coverage range to explore * adding error filtering step that stops inference of coverage for datasets with very high proportion of errors (>0.8) * adding small explorations on alternative centralities * making fix * small fixes --- .gitignore | 2 + DESCRIPTION | 14 - Makefile | 17 +- NAMESPACE | 32 - R/annotate_peaks.R | 16 - R/annotate_summits.R | 19 - R/bw.nrdW.R | 18 - R/coverage_histogram.R | 52 - R/estimate_1n_coverage_1d_subsets.R | 65 -- R/estimate_1n_coverage_highest_peak.R | 27 - R/filter_peak_sizes.R | 10 - R/filter_peaks.R | 14 - R/generate_summary.R | 40 - R/get_1d_peaks.R | 24 - R/get_col_ramp.R | 19 - R/get_peak_sizes.R | 14 - R/get_peak_summary.R | 22 - R/get_smudge_container.R | 49 - R/get_trinoploid_1n_est.R | 20 - R/guess_genome_structure.R | 14 - R/peak_agregation.R | 48 - R/plot_expected_haplotype_structure.R | 21 - R/plot_histograms.R | 68 -- R/plot_legend.R | 26 - R/plot_seq_error_line.R | 13 - R/plot_smudgeplot.R | 36 - R/reduce_structure_representation.R | 19 - R/round_minor_variant_cov.R | 13 - R/rounding.R | 16 - R/smudge_summary.R | 11 - R/smudge_warn.R | 12 - R/transform_minor_variant_cov.R | 13 - R/transform_pair_cov.R | 11 - R/wtd.quantile.R | 25 - README.md | 39 +- exec/centrality_plot.R | 11 + exec/smudgeplot.py | 348 +++++- exec/smudgeplot_plot.R | 423 +++---- install.R | 5 - man/annotate_peaks.Rd | 11 - man/annotate_summits.Rd | 17 - man/bw.nrdW.Rd | 12 - man/coverage_histogram.Rd | 11 - man/estimate_1n_coverage_1d_subsets.Rd | 11 - man/estimate_1n_coverage_highest_peak.Rd | 11 - man/filter_peak_sizes.Rd | 11 - man/filter_peaks.Rd | 11 - man/generate_summary.Rd | 11 - man/get_1d_peaks.Rd | 17 - man/get_col_ramp.Rd | 14 - man/get_peak_sizes.Rd | 11 - man/get_peak_summary.Rd | 11 - man/get_smudge_container.Rd | 11 - man/get_trinoploid_1n_est.Rd | 11 - man/guess_genome_structure.Rd | 11 - man/peak_agregation.Rd | 16 - man/plot_expected_haplotype_structure.Rd | 17 - man/plot_histograms.Rd | 19 - man/plot_legend.Rd | 11 - man/plot_seq_error_line.Rd | 11 - man/plot_smudgeplot.Rd | 24 - man/reduce_structure_representation.Rd | 14 - man/round_minor_variant_cov.Rd | 11 - man/rounding.Rd | 11 - man/smudge_summary.Rd | 12 - man/smudge_warn.Rd | 12 - man/transform_minor_variant_cov.Rd | 12 - man/transform_pair_cov.Rd | 12 - man/wtd.quantile.Rd | 14 - playground/alternative_fitting/README.md | 250 +++++ .../alternative_plot_covA_covB.R | 49 + .../alternative_plotting.R | 48 + .../alternative_plotting_functions.R | 119 ++ .../alternative_plotting_testing.R | 90 ++ .../alternative_fitting/pair_clustering.py | 84 ++ .../fooling_around_impossible_locations.R | 113 -- ...wberry_full_kmer_families_fooling_around.R | 154 +-- playground/popart.R | 56 + tests/README.md | 55 +- tests/testthat.R | 4 - tests/testthat/fmlo_smaple_cov_file.tsv | 1000 ----------------- .../test-estimate_1n_coverage_1d_subsets.R | 18 - tests/testthat/test-fitting.R | 46 - tests/testthat/test-get_smudge_container.R | 22 - tests/testthat/test-get_summary.R | 49 - 85 files changed, 1298 insertions(+), 2893 deletions(-) delete mode 100644 DESCRIPTION delete mode 100644 NAMESPACE delete mode 100644 R/annotate_peaks.R delete mode 100644 R/annotate_summits.R delete mode 100644 R/bw.nrdW.R delete mode 100644 R/coverage_histogram.R delete mode 100644 R/estimate_1n_coverage_1d_subsets.R delete mode 100644 R/estimate_1n_coverage_highest_peak.R delete mode 100644 R/filter_peak_sizes.R delete mode 100644 R/filter_peaks.R delete mode 100644 R/generate_summary.R delete mode 100644 R/get_1d_peaks.R delete mode 100644 R/get_col_ramp.R delete mode 100644 R/get_peak_sizes.R delete mode 100644 R/get_peak_summary.R delete mode 100644 R/get_smudge_container.R delete mode 100644 R/get_trinoploid_1n_est.R delete mode 100644 R/guess_genome_structure.R delete mode 100644 R/peak_agregation.R delete mode 100644 R/plot_expected_haplotype_structure.R delete mode 100644 R/plot_histograms.R delete mode 100644 R/plot_legend.R delete mode 100644 R/plot_seq_error_line.R delete mode 100644 R/plot_smudgeplot.R delete mode 100644 R/reduce_structure_representation.R delete mode 100644 R/round_minor_variant_cov.R delete mode 100644 R/rounding.R delete mode 100644 R/smudge_summary.R delete mode 100644 R/smudge_warn.R delete mode 100644 R/transform_minor_variant_cov.R delete mode 100644 R/transform_pair_cov.R delete mode 100644 R/wtd.quantile.R create mode 100644 exec/centrality_plot.R delete mode 100644 install.R delete mode 100644 man/annotate_peaks.Rd delete mode 100644 man/annotate_summits.Rd delete mode 100644 man/bw.nrdW.Rd delete mode 100644 man/coverage_histogram.Rd delete mode 100644 man/estimate_1n_coverage_1d_subsets.Rd delete mode 100644 man/estimate_1n_coverage_highest_peak.Rd delete mode 100644 man/filter_peak_sizes.Rd delete mode 100644 man/filter_peaks.Rd delete mode 100644 man/generate_summary.Rd delete mode 100644 man/get_1d_peaks.Rd delete mode 100644 man/get_col_ramp.Rd delete mode 100644 man/get_peak_sizes.Rd delete mode 100644 man/get_peak_summary.Rd delete mode 100644 man/get_smudge_container.Rd delete mode 100644 man/get_trinoploid_1n_est.Rd delete mode 100644 man/guess_genome_structure.Rd delete mode 100644 man/peak_agregation.Rd delete mode 100644 man/plot_expected_haplotype_structure.Rd delete mode 100644 man/plot_histograms.Rd delete mode 100644 man/plot_legend.Rd delete mode 100644 man/plot_seq_error_line.Rd delete mode 100644 man/plot_smudgeplot.Rd delete mode 100644 man/reduce_structure_representation.Rd delete mode 100644 man/round_minor_variant_cov.Rd delete mode 100644 man/rounding.Rd delete mode 100644 man/smudge_summary.Rd delete mode 100644 man/smudge_warn.Rd delete mode 100644 man/transform_minor_variant_cov.Rd delete mode 100644 man/transform_pair_cov.Rd delete mode 100644 man/wtd.quantile.Rd create mode 100644 playground/alternative_fitting/README.md create mode 100644 playground/alternative_fitting/alternative_plot_covA_covB.R create mode 100644 playground/alternative_fitting/alternative_plotting.R create mode 100644 playground/alternative_fitting/alternative_plotting_functions.R create mode 100644 playground/alternative_fitting/alternative_plotting_testing.R create mode 100644 playground/alternative_fitting/pair_clustering.py delete mode 100644 playground/fooling_around_impossible_locations.R create mode 100644 playground/popart.R delete mode 100644 tests/testthat.R delete mode 100644 tests/testthat/fmlo_smaple_cov_file.tsv delete mode 100644 tests/testthat/test-estimate_1n_coverage_1d_subsets.R delete mode 100644 tests/testthat/test-fitting.R delete mode 100644 tests/testthat/test-get_smudge_container.R delete mode 100644 tests/testthat/test-get_summary.R diff --git a/.gitignore b/.gitignore index 7df2771..5a8d476 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,7 @@ figures playground docs exec/PloidyPlot +exec/hetmers *.o .DS_Store +smu2text_smu diff --git a/DESCRIPTION b/DESCRIPTION deleted file mode 100644 index 0e64686..0000000 --- a/DESCRIPTION +++ /dev/null @@ -1,14 +0,0 @@ -Package: smudgeplot -Type: Package -Title: -Version: 0.1.3 -Date: 2016-05-31 -Author: Kamil S. Jaron -Maintainer: Kamil S. Jaron -Description: The package to plot smudgeplot -Depends: R (>= 3.3.1), viridis, argparse -License: GPL-3 -LazyLoad: yes -RoxygenNote: 7.2.3 -Suggests: testthat -Encoding: UTF-8 diff --git a/Makefile b/Makefile index 83170da..ad29043 100644 --- a/Makefile +++ b/Makefile @@ -1,30 +1,23 @@ # PATH for libraries is guessed -RPACKAGE = smudgeplot -$(eval RPATH := $(shell Rscript -e "noquote(.libPaths())" | tail -1 | cut -f 2 -d ' ')) CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ifndef INSTALL_PREFIX INSTALL_PREFIX = /usr/local endif -R_INSTALLATION = $(RPATH)/$(RPACKAGE) -HET_KMERS_INST = $(INSTALL_PREFIX)/bin/smudgeplot.py $(INSTALL_PREFIX)/bin/PloidyPlot -SMUDGEPLOT_INST = $(INSTALL_PREFIX)/bin/smudgeplot_plot.R +HET_KMERS_INST = $(INSTALL_PREFIX)/bin/smudgeplot.py $(INSTALL_PREFIX)/bin/hetmers +SMUDGEPLOT_INST = $(INSTALL_PREFIX)/bin/smudgeplot_plot.R $(INSTALL_PREFIX)/bin/centrality_plot.R .PHONY : install -install : $(HET_KMERS_INST) $(SMUDGEPLOT_INST) $(CUTOFF_INST) $(R_INSTALLATION) -# - removing the R package from the list +install : $(HET_KMERS_INST) $(SMUDGEPLOT_INST) $(CUTOFF_INST) $(INSTALL_PREFIX)/bin/% : exec/% install -C $< $(INSTALL_PREFIX)/bin -$(R_INSTALLATION) : R/* - Rscript install.R - -exec/PloidyPlot: src_ploidyplot/PloidyPlot.c src_ploidyplot/libfastk.c src_ploidyplot/libfastk.h src_ploidyplot/matrix.c src_ploidyplot/matrix.h +exec/hetmers: src_ploidyplot/PloidyPlot.c src_ploidyplot/libfastk.c src_ploidyplot/libfastk.h src_ploidyplot/matrix.c src_ploidyplot/matrix.h gcc $(CFLAGS) -o $@ src_ploidyplot/PloidyPlot.c src_ploidyplot/libfastk.c src_ploidyplot/matrix.c -lpthread -lm .PHONY : clean clean : - rm -f exec/PloidyPlot + rm -f exec/hetmers diff --git a/NAMESPACE b/NAMESPACE deleted file mode 100644 index e9a49e9..0000000 --- a/NAMESPACE +++ /dev/null @@ -1,32 +0,0 @@ -# Generated by roxygen2: do not edit by hand - -export(annotate_peaks) -export(annotate_summits) -export(bw.nrdW) -export(coverage_histogram) -export(estimate_1n_coverage_1d_subsets) -export(estimate_1n_coverage_highest_peak) -export(filter_peak_sizes) -export(filter_peaks) -export(generate_summary) -export(get_1d_peaks) -export(get_col_ramp) -export(get_peak_sizes) -export(get_peak_summary) -export(get_smudge_container) -export(get_trinoploid_1n_est) -export(guess_genome_structure) -export(peak_agregation) -export(plot_expected_haplotype_structure) -export(plot_histograms) -export(plot_legend) -export(plot_seq_error_line) -export(plot_smudgeplot) -export(reduce_structure_representation) -export(round_minor_variant_cov) -export(rounding) -export(smudge_summary) -export(smudge_warn) -export(transform_minor_variant_cov) -export(transform_pair_cov) -export(wtd.quantile) diff --git a/R/annotate_peaks.R b/R/annotate_peaks.R deleted file mode 100644 index f1fdc80..0000000 --- a/R/annotate_peaks.R +++ /dev/null @@ -1,16 +0,0 @@ -#' @title annotate_peaks -#' -#' @description -#' \code{annotate_peaks} -#' -#' @export - -# annotate peaks -annotate_peaks <- function(.peak_points, .min_kmerpair_cov, .max_kmerpair_cov){ - for (summit in which(.peak_points$summit == T)){ - peak <- .peak_points$peak[summit] - to_plot <- .peak_points[.peak_points$peak == peak, c('x','y')] - to_plot <- to_plot[!is.na(to_plot$x),] - points(transform_x(to_plot$y), transform_y(to_plot$x, .min_kmerpair_cov, .max_kmerpair_cov), pch = peak) - } -} \ No newline at end of file diff --git a/R/annotate_summits.R b/R/annotate_summits.R deleted file mode 100644 index 0221c4d..0000000 --- a/R/annotate_summits.R +++ /dev/null @@ -1,19 +0,0 @@ -#' @title annotate_summits -#' -#' @description -#' \code{annotate_summits} -#' -#' @export - -annotate_summits <- function(.peak_points, .peak_sizes, .min_kmerpair_cov, .max_kmerpair_cov, col = 'red'){ - for (summit in which(.peak_points$summit == T)){ - peak <- .peak_points$peak[summit] - peak_size <- .peak_sizes$rel_size[.peak_sizes$peak == peak] - # points(transform_x(.peak_points$y[summit]), - # transform_y(.peak_points$x[summit], .min_kmerpair_cov, .max_kmerpair_cov)) - text(transform_x(.peak_points$y[summit]), - transform_y(.peak_points$x[summit], .min_kmerpair_cov, .max_kmerpair_cov), - round(peak_size, 3), - offset = 0, cex = 1.2, col = col, xpd=NA) - } -} \ No newline at end of file diff --git a/R/bw.nrdW.R b/R/bw.nrdW.R deleted file mode 100644 index 92266c3..0000000 --- a/R/bw.nrdW.R +++ /dev/null @@ -1,18 +0,0 @@ -#' @title bw.nrdW -#' -#' @description -#' \code{bw.nrdW} is a function that mimics bw.nrd0 - Silverman's ‘rule of thumb’ for estimating the kernel width, but using weighted values; -#' this can be useful if one wants to mimics kerned smoothing to a histogram using counts as weights -#' -#' @export - -bw.nrdW <- function (x, w) -{ - if (length(x) < 2L) - stop("need at least 2 data points") - Ex <- weighted.mean(x, w) - hi <- sqrt(1 / (sum(w) - 1) * sum(w * (x - Ex)^2)) #wieighted SD - if (!(lo <- min(hi, wtd.iqr(x, w)/1.34))) - (lo <- hi) || (lo <- abs(x[1L])) || (lo <- 1) - 0.9 * lo * sum(w)^(-0.2) -} diff --git a/R/coverage_histogram.R b/R/coverage_histogram.R deleted file mode 100644 index bb84cf9..0000000 --- a/R/coverage_histogram.R +++ /dev/null @@ -1,52 +0,0 @@ -#' @title coverage_histogram -#' -#' @description -#' \code{coverage_histogram} plots a histogram from a table that contains both values (what is normally x) and their respective frequencies (a column called 'freq'); -#' -#' @export - -# coverage_histogram(.cov_tab, bins = .bins, xlim = c(0, 0.5), -# column = 'minor_variant_rel_cov', horiz = FALSE, .col) -# bins <- .bins -# xlim <- c(0, 0.5) -# column <- 'minor_variant_rel_cov' -# horiz <- FALSE - -coverage_histogram <- function(.cov_tab, bins, xlim, column, horiz, .col){ - - # define breaks of the histogram - prettified_bins <- length(pretty(c(xlim[1],xlim[2]), n = bins)) - x_breaks <- seq(xlim[1], xlim[2], length = prettified_bins) - # identify buckets of each kmer pair - # Note: these are right open intervals so the last ratio bucket contain 0.5 - kmerpairs2buckets <- list(findInterval(.cov_tab[, column], x_breaks, left.open = TRUE)) - # and for each bucket aggregate their frequencies - agregated_freq <- aggregate(.cov_tab[, 'freq'], by = kmerpairs2buckets, sum) - - # from, to are the borders of the bucket, pos is for plotting, and freq is the agregated value - ratio_hist <- data.frame(from = x_breaks[1:(prettified_bins-1)], to = x_breaks[2:prettified_bins], freq = 0) - ratio_hist[, 'pos'] <- (ratio_hist[, 'from'] + ratio_hist[, 'to']) / 2 - ratio_hist[agregated_freq[, 1], 'freq'] <- agregated_freq[, 2] - - width = (ratio_hist[1, 'to'] - ratio_hist[1, 'from']) / 2 - - # horizontal plot - if (horiz){ - # c(bottom, left, top, right) - par(mar=c(3.8,0,0,1)) - plot(NULL, type="n", xlab="", ylab="", axes=FALSE, - ylim=xlim, xlim=c(0, max(ratio_hist[, 'freq']))) - for ( i in 1:prettified_bins) { - rect(0, ratio_hist[i, 'pos'] - width, ratio_hist[i, 'freq'], ratio_hist[i, 'pos'] + width, col = .col, border = FALSE) - } - } else { - # vertical plot - par(mar=c(0,3.8,1,0)) - plot(NULL, type="n", xlab="", ylab="", axes=FALSE, - ylim=c(0, max(ratio_hist[, 'freq'])), xlim=xlim) - for ( i in 1:prettified_bins) { - rect(ratio_hist[i, 'pos'] - width, 0, ratio_hist[i, 'pos'] + width, ratio_hist[i, 'freq'], col = .col, border = FALSE) - } - } -} - diff --git a/R/estimate_1n_coverage_1d_subsets.R b/R/estimate_1n_coverage_1d_subsets.R deleted file mode 100644 index 74a31b3..0000000 --- a/R/estimate_1n_coverage_1d_subsets.R +++ /dev/null @@ -1,65 +0,0 @@ -#' @title estimate_1n_coverage_1d_subsets -#' -#' @description -#' \code{estimate_1n_coverage_1d_subsets} is used as the first proxy of 1n coverage. The estimate is subsequentially refined by the brightest smudge. -#' -#' @export - -estimate_1n_coverage_1d_subsets <- function(.cov_tab){ - total_kmers <- sum(.cov_tab[, 3]) - - minor_freq_subsets <- list( - AB_subset = .cov_tab[, 'minor_variant_rel_cov'] > 0.48, - AAB_subset = .cov_tab[, 'minor_variant_rel_cov'] < 0.3433333 & .cov_tab[, 'minor_variant_rel_cov'] > 0.3233333, - AAAB_subset = .cov_tab[, 'minor_variant_rel_cov'] < 0.26 & .cov_tab[, 'minor_variant_rel_cov'] > 0.24, - AAAAB_subset = .cov_tab[, 'minor_variant_rel_cov'] < 0.21 & .cov_tab[, 'minor_variant_rel_cov'] > 0.19, - AAAAAB_subset = .cov_tab[, 'minor_variant_rel_cov'] < 0.1766667 & .cov_tab[, 'minor_variant_rel_cov'] > 0.1566667 - ) - - peak_frame_2 <- get_1d_peaks(.cov_tab[minor_freq_subsets[[1]], ], 3) - peak_frame_3 <- get_1d_peaks(.cov_tab[minor_freq_subsets[[2]], ], 2) - peak_frame_4 <- get_1d_peaks(.cov_tab[minor_freq_subsets[[3]], ], 1) - peak_frame_5 <- get_1d_peaks(.cov_tab[minor_freq_subsets[[4]], ], 1) - peak_frame_6 <- get_1d_peaks(.cov_tab[minor_freq_subsets[[5]], ], 1) - peak_frame_6_max_height = which.max(peak_frame_6$height) - - skip_peak_4 = FALSE - if ( !is.na(peak_frame_2$cov[1]) & !is.na(peak_frame_4$cov[1]) ){ - if ( round(peak_frame_4$cov[1] / peak_frame_2$cov[1]) >= 2 ) { #This is characteristic of allotetraploids which do not have a high AAAB signal. In this case, peak_frame_4 may pick up on the AAAAAABB signal especially if there is high repetitiveness - skip_peak_4 = TRUE #peak_frame_4 will not be included in the weighted mean, since it will artificially inflate the estimate - } - } - skip_peak_5 = FALSE - if ( !is.na(peak_frame_5$cov[1]) & !is.na(peak_frame_6$cov[1]) ){ - if (abs(peak_frame_6$cov[1] - peak_frame_5$cov[1]) <= 5 & peak_frame_6$height[1] > peak_frame_5$height[1]) { #This is characteristic of hexaploids in which peak_frame_5 may pick up on the AAAAAB signal - skip_peak_5 = TRUE #peak_frame_5 will not be included in the weighted mean, since it will artificially inflate the estimate - } - } - - if ( !is.na(peak_frame_2$cov[1]) ){ - peak_frame_2$cov <- peak_frame_2$cov / (2 * round(peak_frame_2$cov / peak_frame_2$cov[1])) - } - if ( !is.na(peak_frame_3$cov[1]) ){ - peak_frame_3$cov <- peak_frame_3$cov / (3 * round(peak_frame_3$cov / peak_frame_3$cov[1])) - } - if ( !is.na(peak_frame_4$cov[1]) ){ - peak_frame_4$cov <- peak_frame_4$cov / (4 * round(peak_frame_4$cov / peak_frame_4$cov[1])) - } - if ( !is.na(peak_frame_5$cov[1]) ){ - peak_frame_5$cov <- peak_frame_5$cov / (5 * round(peak_frame_5$cov / peak_frame_5$cov[1])) - } - if ( !is.na(peak_frame_6$cov[1]) ){ - peak_frame_6$cov <- peak_frame_6$cov / (6 * round(peak_frame_6$cov / peak_frame_6$cov[peak_frame_6_max_height])) - } - - if (skip_peak_4) { - peak_frame <- rbind(peak_frame_2, peak_frame_3, peak_frame_5, peak_frame_6) - } else if (skip_peak_5) { - peak_frame <- rbind(peak_frame_2, peak_frame_3, peak_frame_4, peak_frame_6) - } else { - peak_frame <- rbind(peak_frame_2, peak_frame_3, peak_frame_4, peak_frame_5, peak_frame_6) - } - - peak_frame <- peak_frame[is.finite(rowSums(peak_frame)),] - weighted.mean(peak_frame$cov, peak_frame$height, na.rm = T) -} diff --git a/R/estimate_1n_coverage_highest_peak.R b/R/estimate_1n_coverage_highest_peak.R deleted file mode 100644 index 2b22772..0000000 --- a/R/estimate_1n_coverage_highest_peak.R +++ /dev/null @@ -1,27 +0,0 @@ -#' @title estimate_1n_coverage_highest_peak -#' -#' @description -#' \code{estimate_1n_coverage_highest_peak} -#' -#' @export - -estimate_1n_coverage_highest_peak <- function(.filt_peak_sizes, .cov_tab, .draft_1n){ - # 3 indications - # AAB is presumabely highest - # AAB is has the lowest coverages that correspondents to mutliples of two - # the is not other peak around 1/6 and 1/2 - highest_peak <- which.max(.filt_peak_sizes[, 'rel_size']) - - if(is.na(.draft_1n)){ - # assuming that the greatest peak is the diploid one - draft_1n <- .filt_peak_sizes[highest_peak, 'pair_cov'] / 2 - } else { - draft_1n <- .draft_1n - } - - highest_min_cov <- .filt_peak_sizes[highest_peak, 'minor_variant_cov_rounded'] - highest_subset <- .cov_tab[, 'minor_variant_rel_cov'] < (highest_min_cov + 0.01) & .cov_tab[, 'minor_variant_rel_cov'] > (highest_min_cov - 0.01) - major_peaks <- get_1d_peaks(.cov_tab[highest_subset, ], 1, 1) - major_cov <- major_peaks$cov[which.max(major_peaks$height)] - major_cov / round(major_cov / draft_1n) -} diff --git a/R/filter_peak_sizes.R b/R/filter_peak_sizes.R deleted file mode 100644 index d224d2f..0000000 --- a/R/filter_peak_sizes.R +++ /dev/null @@ -1,10 +0,0 @@ -#' @title filter_peak_sizes -#' -#' @description -#' \code{filter_peak_sizes} -#' -#' @export - -filter_peak_sizes <- function(.peak_sizes, .threshold = 0.005){ - .peak_sizes[!.peak_sizes$rel_size < .threshold,] -} \ No newline at end of file diff --git a/R/filter_peaks.R b/R/filter_peaks.R deleted file mode 100644 index c443c88..0000000 --- a/R/filter_peaks.R +++ /dev/null @@ -1,14 +0,0 @@ -#' @title filter_peaks -#' -#' @description -#' \code{filter_peaks} -#' -#' @export - -filter_peaks <- function(.peak_points, .peak_sizes){ - to_filter <- !.peak_points$peak %in% .peak_sizes$peak - - .peak_points$peak[to_filter] <- NA - .peak_points$summit[to_filter] <- F - return(.peak_points) -} \ No newline at end of file diff --git a/R/generate_summary.R b/R/generate_summary.R deleted file mode 100644 index b0aec12..0000000 --- a/R/generate_summary.R +++ /dev/null @@ -1,40 +0,0 @@ -#' @title generate_summary -#' -#' @description -#' \code{generate_summary} -#' -#' @export - -generate_summary <- function(.args, .smudge_summary){ - smudge_summary(.args$output, "1n coverage estimates (Coverage of every haplotype; Don't confuse with genome coverage which is (ploidy * 1n coverage).)") - smudge_summary(.args$output, "* User defined 1n coverage:", .args$n_cov) - smudge_summary(.args$output, "* Subset 1n coverage estimate:", .smudge_summary$n_subset_est) - smudge_summary(.args$output, "* Highest peak 1n coverage estimate:", round(.smudge_summary$n_peak_est, 1)) - smudge_summary(.args$output, "1n coverage used in smudgeplot (one of the three above):", round(.smudge_summary$n, 1)) - - smudge_summary(.args$output, "* Proposed ploidy:", .smudge_summary$genome_ploidy) - - for_sure_heterozygous <- sapply(.smudge_summary$peak_sizes$structure, function(x){sum(unlist(strsplit(x, split = '')) == 'B') == 1}) - # THIS LINE - minimal_number_of_heterozygous_loci <- ceiling(sum(.smudge_summary$peak_sizes$abs_size[for_sure_heterozygous]) / .args$kmer_size) - - smudge_summary(.args$output, "* Minimal number of heterozygous loci:", minimal_number_of_heterozygous_loci) - smudge_summary(.args$output, "Note: This number is NOT an estimate of the total number of heterozygous loci, it's merely setting the lower boundary if the inference of heterozygosity peaks is correct.") - - smudge_summary(.args$output, "* Proportion of heterozygosity carried by pairs in different genome copies (table)") - tab_to_print <- data.frame( genome_copies = 2:max(.smudge_summary$peak_sizes$ploidy), - propotion_of_heterozygosity = round(sapply(2:max(.smudge_summary$peak_sizes$ploidy), function(x){ sum(.smudge_summary$peak_sizes$rel_size[.smudge_summary$peak_sizes$ploidy == x]) }), 2)) - smudge_summary(.args$output, paste0(capture.output(tab_to_print), collapse = "\n")) - - carried_by_paralogs <- sum(.smudge_summary$peak_sizes$rel_size[.smudge_summary$peak_sizes$ploidy > .smudge_summary$genome_ploidy]) - smudge_summary(.args$output, "* Proportion of heterozygosity carried by paralogs:", round(carried_by_paralogs, 3)) - - smudge_summary(.args$output, "* Summary of all detected peaks (table)") - tab_to_print <- .smudge_summary$peak_sizes[,c(11,2,3,8,9)] - tab_to_print[,c(2:5)] <- round(tab_to_print[,c(2:5)], 2) - colnames(tab_to_print) <- c('peak','kmers [#]', 'kmers [proportion]', 'summit B / (A + B)', 'summit A + B') - smudge_summary(.args$output, paste0(capture.output(tab_to_print), collapse = "\n")) - - write.table(tab_to_print, paste0(.args$output, '_summary_table.tsv'), - quote = F, sep = '\t', row.names = F) -} diff --git a/R/get_1d_peaks.R b/R/get_1d_peaks.R deleted file mode 100644 index 54f437c..0000000 --- a/R/get_1d_peaks.R +++ /dev/null @@ -1,24 +0,0 @@ -#' @title get_1d_peaks -#' -#' @description -#' \code{get_1d_peaks} is a legacy code -#' -#' @export - -get_1d_peaks <- function(.cov_tab, .num_of_peaks = 3, .adjust = 10, .peak_frame = data.frame(), .depth = 1){ - if (nrow(.cov_tab) < 2){ - return(data.frame(cov = NA, height = NA)) - } - if (nrow(.peak_frame) < .num_of_peaks & .depth < 11){ - kmer_pairs <- sum(.cov_tab[, 'freq']) - # used weighted version of silverman's rule of thumb - bw <- bw.nrdW(.cov_tab[, 'total_pair_cov'], .cov_tab[, 'freq']) - # density wants to have the weights relative, I will divide it and them multiply the fitted heights (alternatively, I could suppres warnings) - rel_weights <- .cov_tab[, 'freq'] / kmer_pairs - d <- density(.cov_tab[, 'total_pair_cov'], weights = rel_weights, adjust = .adjust, bw = bw) # returns the density data - selected_points <- which(diff(sign(diff(d$y))) == -2) + 1 - .peak_frame <- data.frame(cov = d$x[selected_points], height = d$y[selected_points] * kmer_pairs) - .peak_frame <- get_1d_peaks(.cov_tab, .num_of_peaks, .adjust - 1, .peak_frame, .depth + 1) - } - .peak_frame -} \ No newline at end of file diff --git a/R/get_col_ramp.R b/R/get_col_ramp.R deleted file mode 100644 index 9159c77..0000000 --- a/R/get_col_ramp.R +++ /dev/null @@ -1,19 +0,0 @@ -#' @title get_col_ramp -#' -#' @description -#' \code{get_col_ramp} returns a colour ramp used by smudgeplots; -#' args need to contain args$col_ramp and args$invert_cols features -#' the first needs to be a ramp-generating function (e.g. "viridis", "grey.colors", "magma" or "mako") -#' args$invert_cols is just TRUE or FALSE (if to revert the colours in the palete) -#' -#' @export - -get_col_ramp <- function(.args, delay = 0){ - colour_ramp <- eval(parse(text = paste0(.args$col_ramp,"(", 32 - delay, ")"))) - if (.args$invert_cols){ - colour_ramp <- rev(colour_ramp) - } - colour_ramp <- c(rep(colour_ramp[1], delay), colour_ramp) - return(colour_ramp) -} - diff --git a/R/get_peak_sizes.R b/R/get_peak_sizes.R deleted file mode 100644 index b52eeaa..0000000 --- a/R/get_peak_sizes.R +++ /dev/null @@ -1,14 +0,0 @@ -#' @title get_peak_sizes -#' -#' @description -#' \code{get_peak_sizes} -#' -#' @export - -get_peak_sizes <- function(.peak_points){ - peaks <- .peak_points[.peak_points$summit == T,'peak'] - peak_sizes <- sapply(peaks, function(x){sum(.peak_points[.peak_points$peak == x, 'vals'], na.rm = T)}) - data.frame(peak = peaks, - abs_size = peak_sizes, - rel_size = peak_sizes / sum(peak_sizes)) -} \ No newline at end of file diff --git a/R/get_peak_summary.R b/R/get_peak_summary.R deleted file mode 100644 index cbfbfeb..0000000 --- a/R/get_peak_summary.R +++ /dev/null @@ -1,22 +0,0 @@ -#' @title get_peak_summary -#' -#' @description -#' \code{get_peak_summary} -#' -#' @export - -get_peak_summary <- function(.peak_points, .smudge_container, .treshold = 0.005){ - peak_sizes <- get_peak_sizes(.peak_points) - - filt_peak_sizes <- filter_peak_sizes(peak_sizes, .treshold) - filt_peak_points <- filter_peaks(.peak_points, filt_peak_sizes) - - filt_peak_sizes <- merge(filt_peak_sizes, filt_peak_points[filt_peak_points$summit == T,]) - filt_peak_sizes$minor_variant_cov <- transform_minor_variant_cov(filt_peak_sizes$x, .smudge_container) - filt_peak_sizes$pair_cov <- transform_pair_cov(filt_peak_sizes$y, .smudge_container) - - # genome_copy_nuber <- round(filt_peak_sizes$pair_cov / (min(filt_peak_sizes$pair_cov) / 2)) - filt_peak_sizes$minor_variant_cov_rounded <- sapply(filt_peak_sizes$minor_variant_cov, round_minor_variant_cov) - - return(filt_peak_sizes) -} \ No newline at end of file diff --git a/R/get_smudge_container.R b/R/get_smudge_container.R deleted file mode 100644 index 4323312..0000000 --- a/R/get_smudge_container.R +++ /dev/null @@ -1,49 +0,0 @@ -#' @title get_smudge_container -#' -#' @description -#' \code{get_smudge_container} is inspired by https://stackoverflow.com/a/18103689/2962344 -#' -#' @export - -get_smudge_container <- function(.cov_tab, .nbins = 20, - .xlim = c(0, 0.5), .ylim = NA){ - - total_pair_cov <- .cov_tab[, 1] + .cov_tab[, 2] - minor_variant_rel_cov <- .cov_tab[, 1] / total_pair_cov - - if (any(is.na(.ylim))) { - .ylim <- c(0, max(.total_pair_cov)) - } - - smudge_container <- list() - smudge_container$x <- seq(.xlim[1], ((.nbins - 1) / .nbins) * .xlim[2], length = .nbins) - smudge_container$y <- c(seq(.ylim[1], ((.nbins - 1) / .nbins) * .ylim[2], length = .nbins), .ylim[2]) - - xcoords <- findInterval(minor_variant_rel_cov, smudge_container$x, left.open = TRUE) - ycoords <- findInterval(total_pair_cov, smudge_container$y, left.open = TRUE) # all >.ylim[2] will be in nbin+1 th bin - - if ("freq" %in% colnames(.cov_tab)) { # ploidy-plot style "cov1 cov2 density" table - group_by <- list(paste(xcoords, ycoords)) - agregated_counts <- aggregate(.cov_tab[, 3], by = group_by, sum) - agregated_coordinates <- matrix(as.numeric(unlist(strsplit(agregated_counts[, 1], " "))), ncol = 2, byrow = TRUE) - coordinates_in_range <- !agregated_coordinates[, 2] == .nbins+1 # the rows that belong to nbins+1 bin need to be removed; this is identify which to keep - agregated_counts <- agregated_counts[coordinates_in_range, 2] # remove nbin+1 in counts, and get rid keep only the counts (not cooredinates) - agregated_coordinates <- agregated_coordinates[coordinates_in_range, ] # and subset the separated coordinates too - } else { # original smudgepot style "cov1 cov2" table; - agregated_counts <- as.data.frame(table(xcoords, ycoords), stringsAsFactors = FALSE) - agregated_counts[, 1] <- as.numeric(agregated_counts[, 1]) - agregated_counts[, 2] <- as.numeric(agregated_counts[, 2]) - agregated_counts <- agregated_counts[!agregated_counts[, 2] == .nbins + 1, ] - agregated_coordinates <- cbind(agregated_counts[, 1], agregated_counts[, 2]) - agregated_counts <- agregated_counts[, 3] - } - - smudge_container$dens <- matrix(0, .nbins, .nbins) - smudge_container$dens[agregated_coordinates] <- agregated_counts - - smudge_container$y <- smudge_container$y[1:.nbins] - smudge_container$z <- log10(smudge_container$dens) - smudge_container$z[is.infinite(smudge_container$z)] <- 0 - - return(smudge_container) -} \ No newline at end of file diff --git a/R/get_trinoploid_1n_est.R b/R/get_trinoploid_1n_est.R deleted file mode 100644 index 6179936..0000000 --- a/R/get_trinoploid_1n_est.R +++ /dev/null @@ -1,20 +0,0 @@ -#' @title get_trinoploid_1n_est -#' -#' @description -#' \code{get_trinoploid_1n_est} -#' -#' @export - -get_trinoploid_1n_est <- function(.filt_peak_sizes){ - trinploids <- .filt_peak_sizes$minor_variant_cov_rounded == 0.33 - if(any(trinploids)){ - triplod_1n_guess <- min(.filt_peak_sizes$pair_cov[trinploids]) / 3 - genome_counts_by_trip_1n_est <- round(.filt_peak_sizes$pair_cov / triplod_1n_guess) - if(sum(genome_counts_by_trip_1n_est == 3) == 1){ - # min triploid peak look like triploid - return(triplod_1n_guess) - } - } - # unable to guess - return(NA) -} diff --git a/R/guess_genome_structure.R b/R/guess_genome_structure.R deleted file mode 100644 index 0ec44f6..0000000 --- a/R/guess_genome_structure.R +++ /dev/null @@ -1,14 +0,0 @@ -#' @title guess_genome_structure -#' -#' @description -#' \code{guess_genome_structure} -#' -#' @export - -guess_genome_structure <- function(.filt_peak_sizes_line, .n){ - both_count_in_genome <- round(as.numeric(.filt_peak_sizes_line['pair_cov']) / .n) - minor_count_in_genome <- round(as.numeric(.filt_peak_sizes_line['minor_variant_cov']) * both_count_in_genome) - paste0( - c(rep('A', both_count_in_genome - minor_count_in_genome), - rep('B', minor_count_in_genome)), collapse = '') -} \ No newline at end of file diff --git a/R/peak_agregation.R b/R/peak_agregation.R deleted file mode 100644 index dafa49f..0000000 --- a/R/peak_agregation.R +++ /dev/null @@ -1,48 +0,0 @@ -#' @title peak_agregation -#' -#' @description -#' \code{peak_agregation} is the function that agregates tiles from the smudge_container -#' into smudges - individual distributions -#' -#' @param .smudge_container - a smudge_container object; a list with `x`,`y` axies of the 2d histogram and `dens` matrix of 2h histogram values. -#' Usually output of function \code{get_smudge_container} -#' -#' @export - -peak_agregation <- function(.smudge_container, .L = NA){ - if ( is.na(.L) ){ .L <- min(.smudge_container$y) / 2 } # guess L if not specified - .L <- max(c(.L, 10)) - nbins <- length(.smudge_container$x) - peak_points <- data.frame(vals = as.vector(.smudge_container$dens), - x = rep(1:nbins, nbins), - y = rep(1:nbins, each = nbins)) - peak_points$peak <- NA - peak_points$summit <- NA - peak_points <- peak_points[order(peak_points$vals, decreasing = T),] - #TODO no go zone x,y <- .smudge_container - nogo_x <- findInterval(.L / .smudge_container$y, .smudge_container$x, left.open = T) - # mark the first - peak_points$peak[1] <- 1 - peak_points$summit[1] <- T - - for (point in 2:nrow(peak_points)) { - parsed_points <- peak_points[1:point-1,] - x_n <- abs(parsed_points[,'x'] - peak_points[point,'x']) <= 1 - y_n <- abs(parsed_points[,'y'] - peak_points[point,'y']) <= 1 - if (any(x_n & y_n)) { - tiles_around <- parsed_points[x_n & y_n,] - peak_points$peak[point] <- tiles_around$peak[which.max(tiles_around$vals)] - peak_points$summit[point] <- F - } else { - # filt_peak_sizes$y -> pair_coverage - # parse only point above eror line - if ( peak_points[point,'x'] > nogo_x[peak_points[point,'y']] ) { - peak_points$peak[point] <- max(parsed_points$peak, na.rm = T) + 1 - peak_points$summit[point] <- T - } else { - peak_points$summit[point] <- F - } - } - } - return(peak_points) -} diff --git a/R/plot_expected_haplotype_structure.R b/R/plot_expected_haplotype_structure.R deleted file mode 100644 index c2b7649..0000000 --- a/R/plot_expected_haplotype_structure.R +++ /dev/null @@ -1,21 +0,0 @@ -#' @title plot_expected_haplotype_structure -#' -#' @description -#' \code{plot_expected_haplotype_structure} adds genome scture labels to a smudgeplot -#' -#' @export - -plot_expected_haplotype_structure <- function(.n, .peak_sizes, - .adjust = F, .cex = 1.3, xmax = 0.49){ - borercases <- .peak_sizes$corrected_minor_variant_cov == 0.5 - structures <- reduce_structure_representation(.peak_sizes) - - for(i in 1:nrow(.peak_sizes)){ - # xmax is in the middle of the last square in the 2d histogram, - # which is too far from the edge, so I average it with 0.49 - # witch will pull the label bit to the edge - text( ifelse( borercases[i] & .adjust, (xmax + 0.49) / 2, .peak_sizes$corrected_minor_variant_cov[i]), - .peak_sizes$ploidy[i] * .n, structures[i], - offset = 0, cex = .cex, xpd = T, pos = ifelse( borercases[i] & .adjust, 2, 1)) - } -} \ No newline at end of file diff --git a/R/plot_histograms.R b/R/plot_histograms.R deleted file mode 100644 index 5fc86c5..0000000 --- a/R/plot_histograms.R +++ /dev/null @@ -1,68 +0,0 @@ -#' @title plot_histograms -#' -#' @description -#' \code{plot_histograms} makes 2 plots - histograms of the both margins of smudge plot -#' -#' @export - -# .cov_tab <- cov_tab -# .smudge_summary <- smudge_summary -# .fig_title <- NA -# .ylim <- ylim -# .bins <- 150 - -plot_histograms <- function(.cov_tab, .smudge_summary, - .fig_title = NA, .cex = 1.4, .col = NA, - .ylim = NA, .bins = 100){ - - if( is.na(.col) ){ - .col <- rgb(0.8352, 0.2431, 0.3098) - } - - # removing pairs with excess coverage - .cov_tab <- .cov_tab[.cov_tab[, 'total_pair_cov'] < .ylim[2], ] - - # minor_variant_rel_cov HISTOGRAM - top - coverage_histogram(.cov_tab, bins = .bins, xlim = c(0, 0.5), - column = 'minor_variant_rel_cov', horiz = FALSE, .col) - - if (!(is.na(.fig_title))) { - mtext(bquote(italic(.(.fig_title))), side=3, adj=0, line=-3, cex = .cex + 0.2) - } - - if (length(.smudge_summary$genome_ploidy) > 0){ - if ( .smudge_summary$genome_ploidy < 9){ - ploidytext <- switch(.smudge_summary$genome_ploidy - 1, - p2 = 'diploid', - p3 = 'triploid', - p4 = 'tetraploid', - p5 = 'pentaploid', - p6 = 'hexaploid', - p7 = 'heptaploid', - p8 = 'octoploid') - } else { - ploidytext <- paste0(.smudge_summary$genome_ploidy, '-ploid') - } - - if(!(is.na(.smudge_summary$genome_ploidy))){ - mtext(paste('proposed', ploidytext), side=3, adj=0.05, line=-5, cex = .cex - 0.2) - } - } - - - # total pair coverage HISTOGRAM - right - coverage_histogram(.cov_tab, bins = .bins, xlim = .ylim, # this is because horizonal is TRUE - column = 'total_pair_cov', horiz = TRUE, .col) - - if (.smudge_summary$n > 0){ #0 means it was not infered - legend('bottomright', bty = 'n', paste('1n = ', round(.smudge_summary$n)), cex = .cex - 0.1) - - .peak_sizes <- data.frame(peak = reduce_structure_representation(.smudge_summary$peak_sizes), - size = .smudge_summary$peak_sizes[,3]) - - if(! any(is.na(.peak_sizes))){ - legend('topleft', bty = 'n', .peak_sizes[,1], cex = .cex - 0.2) - legend('topright', bty = 'n', legend = round(.peak_sizes[,2], 2), cex = .cex - 0.2) - } - } -} diff --git a/R/plot_legend.R b/R/plot_legend.R deleted file mode 100644 index dacd44c..0000000 --- a/R/plot_legend.R +++ /dev/null @@ -1,26 +0,0 @@ -#' @title plot_legend -#' -#' @description -#' \code{plot_legend} generate topright corener in the smudgeplot -#' -#' @export - -plot_legend <- function(.k, .colour_ramp, .log_scale = T){ - par(mar=c(0,0,2,1)) - plot.new() - print_title <- ifelse(.log_scale, 'log kmers pairs', 'kmers pairs') - title(print_title) - for(i in 1:32){ - rect(0,(i - 0.01) / 33, 0.5, (i + 0.99) / 33, col = .colour_ramp[i]) - } - kmer_max <- max(smudge_container$dens) - if( .log_scale == T ){ - for(i in 0:6){ - text(0.75, i / 6, rounding(10^(log10(kmer_max) * i / 6)), offset = 0) - } - } else { - for(i in 0:6){ - text(0.75, i / 6, rounding(kmer_max * i / 6), offset = 0) - } - } -} \ No newline at end of file diff --git a/R/plot_seq_error_line.R b/R/plot_seq_error_line.R deleted file mode 100644 index 915c202..0000000 --- a/R/plot_seq_error_line.R +++ /dev/null @@ -1,13 +0,0 @@ -#' @title plot_seq_error_line -#' -#' @description -#' \code{plot_seq_error_line} add error line to plot it's L / cov -#' -#' @export - -plot_seq_error_line <- function(.cov_tab, .L = NA, .col = 'black'){ - if (is.na(.L)){ .L <- min(.cov_tab[, 'covB']) } - max_cov_pair <- max(.cov_tab[, 'total_pair_cov']) - cov_range <- seq(2 * .L, max_cov_pair, length = 500) - lines((.L - 1) / cov_range, cov_range, lwd = 2.5, lty = 2, col = .col) -} \ No newline at end of file diff --git a/R/plot_smudgeplot.R b/R/plot_smudgeplot.R deleted file mode 100644 index 173c127..0000000 --- a/R/plot_smudgeplot.R +++ /dev/null @@ -1,36 +0,0 @@ -#' @title plot_smudgeplot -#' -#' @description -#' \code{plot_smudgeplot} is the core smudgeplot function to plot the core of the smudgeplot -#' the cental 2D histogram/ landscape plot / heatmap / howeveryouwantyoucallthis plot -#' -#' @param .k - the matrix of densities or list of x,y axies and z, the matrix. Usually output of function -#' -#' @param .n - the 1n coverage, the coverage of the haploid genome (can be estimated by TODO function) -#' -#' @param .colour_ramp - the colour pallete for the heat colours -#' -#' @param .cex - the size of the axis labels [default 1.4] -#' -#' @author Kamil Jaron \email{kamiljaron at gmail.com} -#' -#' @export - -plot_smudgeplot <- function(.k, .n, .colour_ramp, .cex = 1.4){ - # margins 'c(bottom, left, top, right)' - par(mar=c(4.8,4.8,1,1)) - # 2D HISTOGRAM - image(.k, col = .colour_ramp, - xlab = 'Normalized minor kmer coverage: B / (A + B)', - ylab = 'Total coverage of the kmer pair: A + B', cex.lab = 1.4, - axes=F - ) - - if (.n == 0){ - axis(2) - } else { - axis(2, at = 2:8 * .n, labels = paste(2:8, 'n')) - } - axis(1, at = c(1/5, 1/4, 1/3, max(.k$x)), - labels = c('1/5', '1/4', '1/3', '1/2')) -} \ No newline at end of file diff --git a/R/reduce_structure_representation.R b/R/reduce_structure_representation.R deleted file mode 100644 index 2e5ac0f..0000000 --- a/R/reduce_structure_representation.R +++ /dev/null @@ -1,19 +0,0 @@ -#' @title reduce_structure_representation -#' -#' @description -#' \code{reduce_structure_representation} is the function that collapses labels loger than 4 characters into 4 character representations -#' -#' @param .peak_sizes - a table with peak sizes, the truly important column in there is just the "structure" one -#' -#' @export - -reduce_structure_representation <- function(.peak_sizes){ - structures_to_adjust <- sapply(.peak_sizes$structure, nchar) > 4 - if (any(structures_to_adjust)) { - decomposed_struct <- strsplit(.peak_sizes$structure[structures_to_adjust], '') - As <- sapply(decomposed_struct, function(x){ sum(x == 'A') } ) - Bs <- sapply(decomposed_struct, length) - As - .peak_sizes$structure[structures_to_adjust] <- paste0(As, 'A', Bs, 'B') - } - .peak_sizes$structure -} diff --git a/R/round_minor_variant_cov.R b/R/round_minor_variant_cov.R deleted file mode 100644 index 2fdf8fc..0000000 --- a/R/round_minor_variant_cov.R +++ /dev/null @@ -1,13 +0,0 @@ -#' @title round_minor_variant_cov -#' -#' @description -#' \code{round_minor_variant_cov} -#' -#' @export - -round_minor_variant_cov <- function(.min_cov){ - # 0.5 0.4 0.33 0.25 0.2 - # 1/2 2/5 1/3 1/4 1/5 - expected_freq <- c(0.5, 0.4, 0.33, 0.25, 0.2, 0.166, 0.1428, 0.125) - expected_freq[which.min(abs(expected_freq - .min_cov))] -} \ No newline at end of file diff --git a/R/rounding.R b/R/rounding.R deleted file mode 100644 index 2c9dfa7..0000000 --- a/R/rounding.R +++ /dev/null @@ -1,16 +0,0 @@ -#' @title rounding -#' -#' @description -#' \code{rounding} rounds to thousands or hundreds if the number is smaller than 1000 -#' -#' @export - -rounding <- function(number){ - if(number > 1000){ - round(number / 1000) * 1000 - } else if (number > 100){ - round(number / 100) * 100 - } else { - round(number / 10) * 10 - } -} \ No newline at end of file diff --git a/R/smudge_summary.R b/R/smudge_summary.R deleted file mode 100644 index 17de5b1..0000000 --- a/R/smudge_summary.R +++ /dev/null @@ -1,11 +0,0 @@ -#' @title smudge_summary -#' -#' @description -#' \code{smudge_summary} a function that is used to report all the summary output messages -#' like this I will be able to easily redirect the waring message to file as well -#' -#' @export - -smudge_summary <- function(.filename, ...){ - write(paste(..., sep = '\t'), paste0(.filename, "_verbose_summary.txt"), append = T) -} diff --git a/R/smudge_warn.R b/R/smudge_warn.R deleted file mode 100644 index 04512b3..0000000 --- a/R/smudge_warn.R +++ /dev/null @@ -1,12 +0,0 @@ -#' @title smudge_warn -#' -#' @description -#' \code{smudge_warn} a function that is used to report all the wanrning messages -#' like this I will be able to easily redirect the waring message to file as well -#' -#' @export - -smudge_warn <- function(.filename, ...){ - write(paste(..., sep = '\t'), stderr()) - write(paste(..., sep = '\t'), paste0(.filename, "_warnings.txt"), append = T) -} \ No newline at end of file diff --git a/R/transform_minor_variant_cov.R b/R/transform_minor_variant_cov.R deleted file mode 100644 index 02c34f7..0000000 --- a/R/transform_minor_variant_cov.R +++ /dev/null @@ -1,13 +0,0 @@ -#' @title transform_minor_variant_cov -#' -#' @description -#' \code{transform_minor_variant_cov} -#' to transform coordinates of smudgeplot to minor_variant_rel_cov -#' -#' @export - -transform_minor_variant_cov <- function(.x, .smudge_container){ - # .x corresponds to <.x, .x+1> interval - # I add 1/2 of the size of the interval to return mean interval values - .smudge_container$x[.x] + (diff(.smudge_container$x)[1] / 2) -} \ No newline at end of file diff --git a/R/transform_pair_cov.R b/R/transform_pair_cov.R deleted file mode 100644 index 27c53e9..0000000 --- a/R/transform_pair_cov.R +++ /dev/null @@ -1,11 +0,0 @@ -#' @title transform_pair_cov -#' -#' @description -#' \code{transform_pair_cov} -#' to transform coordinates of smudgeplot to total_pair_cov -#' -#' @export - -transform_pair_cov <- function(.y, .smudge_container){ - .smudge_container$y[.y] + (diff(.smudge_container$y)[1] / 2) -} \ No newline at end of file diff --git a/R/wtd.quantile.R b/R/wtd.quantile.R deleted file mode 100644 index c7a73e9..0000000 --- a/R/wtd.quantile.R +++ /dev/null @@ -1,25 +0,0 @@ -#' @title wtd.quantile & wtd.iqr -#' -#' @description -#' \code{wtd.quantile} -#' calculates a quantile using "wighted data", specifically designed for "histogram-like" data -#' \code{wtd.iqr} -#' 0.75 - 0.25 weighted quantile -#' -#' @export - -wtd.quantile <- function(x, q=0.25, weight=NULL) { - o <- order(x) - n <- sum(weight) - order <- 1 + (n - 1) * q - low <- pmax(floor(order), 1) - high <- pmin(ceiling(order), n) - low_contribution <- high - order - allq <- approx(x=cumsum(weight[o])/sum(weight), y=x[o], xout = c(low, high)/n, method = "constant", - f = 1, rule = 2)$y - low_contribution * allq[1] + (1 - low_contribution) * allq[2] -} - -wtd.iqr <- function(x, w=NULL) { - wtd.quantile(x, q=0.75, weight=w) - wtd.quantile(x, q=0.25, weight=w) -} diff --git a/README.md b/README.md index 98705cd..e0d75c9 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,14 @@ The big changes are + the backend by Gene is paralelized and massively faster + the intermediate file will be a flat file with the 2d histogram with cov1, cov2, freq columns (as opposed to list of coverages of pairs cov1 cov2); + at least for now WE LOSE the ability to extract sequences of the kmers in the pair; this functionality will hopefully restore at some point together with functionality to assess the quality of assembly. - + the smudge detection algorithm is under revision and a **new version will be released on 18th of October 2024** + + we added "run all" functionality for people that want "FastK database -> plot" type of solution. + + completelly revamped plot showing how all individual kmer pairs insead of agregating them into squares + + new smudge detection algorithm based on grid projection on the smudge plane (working, but under revisions at the moment) + + R package smudgeplot was retired and is no longer used + +We keep the same pythonic interface, the interface of older smudgeplot and this version are very similar and largely compatible. + +Current state: RUNNING; beta-testing; ### Install the whole thing @@ -39,21 +46,21 @@ will install smudgeplot to `~/bin/`. #### Manual installation -Installing the `R` package: - -```bash -# cd smudgeplot -Rscript -e 'install.packages(".", repos = NULL, type="source")' # this will install smudgeplot R package; -``` - Compiling the `C` executable ``` -make exec/PloidyPlot # this will compile PloidyPlot backend +make exec/hetmers # this will compile hetmers (kmer pair searching engine of PloidyPlot) backend ``` Now you can move all three files from the `exec` directory somewhere your system will see it (or alternativelly, you can add that directory to `$PATH` variable). +``` +install -C exec/smudgeplot.py /usr/local/bin +install -C exec/hetmers /usr/local/bin +install -C exec/smudgeplot_plot.R /usr/local/bin +install -C exec/centrality_plot.R /usr/local/bin +``` + ### Runing this version on Sacharomyces data Requires ~2.1GB of space and `FastK` and `smudgeplot` installed. @@ -69,18 +76,24 @@ mv *fastq.gz data/Scer/ # run FastK to create a k-mer database FastK -v -t4 -k31 -M16 -T4 data/Scer/SRR3265401_[12].fastq.gz -Ndata/Scer/FastK_Table -# Run PloidyPlot to find all k-mer pairs in the dataset -PloidyPlot -e12 -k -v -T4 -odata/Scer/kmerpairs data/Scer/FastK_Table +# Find all k-mer pairs in the dataset using hetmer module +smudgeplot.py hetmers -L 12 -t 4 -o data/Scer/kmerpairs --verbose data/Scer/FastK_Table # this now generated `data/Scer/kmerpairs_text.smu` file; # it's a flat file with three columns; covB, covA and freq (the number of k-mer pairs with these respective coverages) # use the .smu file to infer ploidy and create smudgeplot -smudgeplot.py plot -n 15 -t Sacharomyces -o data/Scer/trial_run data/Scer/kmerpairs_text.smu +smudgeplot.py all -o data/Scer/trial_run data/Scer/kmerpairs_text.smu -# check that 5 files are generated (2 pdfs; a summary tsv table, and two txt logs) +# check that bunch files are generated (3 pdfs; some summary tables and logs) ls data/Scer/trial_run_* ``` +The y-axis scaling is by default 100, one can spcify argument `ylim` to scale it differently + +```bash +smudgeplot.py all -o data/Scer/trial_run_ylim70 data/Scer/kmerpairs_text.smu -ylim 70 +``` + And that's it for now! I will be streamlining this over the next few days so hopefully it will all work with a single command; ### How smudgeplot works diff --git a/exec/centrality_plot.R b/exec/centrality_plot.R new file mode 100644 index 0000000..b73a85e --- /dev/null +++ b/exec/centrality_plot.R @@ -0,0 +1,11 @@ +#!/usr/bin/env Rscript +args = commandArgs(trailingOnly=TRUE) +input_name <- args[1] +# input_name = '~/test/daAchMill1_centralities.txt' + +output_name <- gsub('.txt', '.pdf', input_name) +tested_covs <- read.table(input_name, col.names = c('cov', 'centrality')) + +pdf(output_name) + plot(tested_covs[, 'cov'], tested_covs[, 'centrality'], xlab = 'Coverage', ylab = 'Centrality [(theoretical_center - actual_center) / coverage ]', pch = 20) +dev.off() \ No newline at end of file diff --git a/exec/smudgeplot.py b/exec/smudgeplot.py index 89a2ea6..4a3d6e4 100755 --- a/exec/smudgeplot.py +++ b/exec/smudgeplot.py @@ -2,14 +2,22 @@ import argparse import sys +import numpy as np +from pandas import read_csv # type: ignore +from pandas import DataFrame # type: ignore +from numpy import arange +from numpy import argmin +from numpy import concatenate from os import system from math import log from math import ceil -# hetkmers dependencies +from statistics import fmean from collections import defaultdict -from itertools import combinations +# import matplotlib as mpl +# import matplotlib.pyplot as plt +# from matplotlib.pyplot import plot -version = '0.3.0 oriel' +version = '0.4.0dev' ############################ # processing of user input # @@ -20,9 +28,11 @@ def __init__(self): argparser = argparse.ArgumentParser( # description='Inference of ploidy and heterozygosity structure using whole genome sequencing data', usage='''smudgeplot [options] \n -tasks: cutoff Calculate meaningful values for lower kmer histogram cutoff. - hetmers Calculate unique kmer pairs from a FastK k-mer database. - plot Generate 2d histogram; infere ploidy and plot a smudgeplot.\n\n''') +tasks: cutoff Calculate meaningful values for lower kmer histogram cutoff. + hetmers Calculate unique kmer pairs from a FastK k-mer database. + peak_agregation Agregates smudges using local agregation algorithm. + plot Generate 2d histogram; infere ploidy and plot a smudgeplot. + all Runs all the steps (with default options)\n\n''') # removing this for now; # extract Extract kmer pairs within specified coverage sum and minor covrage ratio ranges argparser.add_argument('task', help='Task to execute; for task specific options execute smudgeplot -h') @@ -64,26 +74,13 @@ def plot(self): Generate 2d histogram; infer ploidy and plot a smudgeplot. ''' argparser = argparse.ArgumentParser(prog = 'smudgeplot plot', description='Generate 2d histogram for smudgeplot') - argparser.add_argument('infile', nargs='?', help='name of the input tsv file with covarages and frequencies (default \"coverages_2.smu\")."') + argparser.add_argument('infile', help='name of the input tsv file with covarages and frequencies') + argparser.add_argument('smudgefile', help='name of the input tsv file with sizes of individual smudges') + argparser.add_argument('n', help='The expected haploid coverage.', type=float) argparser.add_argument('-o', help='The pattern used to name the output (smudgeplot).', default='smudgeplot') - argparser.add_argument('-q', help='Remove kmer pairs with coverage over the specified quantile; (default none).', type=float, default=1) - argparser.add_argument('-L', help='The lower boundary used when dumping kmers (default min(total_pair_cov) / 2).', type=int, default=0) - argparser.add_argument('-c', '-cov_filter', help='Filter pairs with one of them having coverage bellow specified threshold (default 0; disables parameter L)', type=int, default=0) - argparser.add_argument('-n', help='The expected haploid coverage (default estimated from data).', type=float, default=0) - argparser.add_argument('-t', '--title', help='name printed at the top of the smudgeplot (default none).', default='') - argparser.add_argument('-ylim', help='The upper limit for the coverage sum (the y axis)', type = int, default=0) - # argparser.add_argument('-m', '-method', help='The algorithm for annotation of smudges (default \'local_aggregation\')', default='local_aggregation') - argparser.add_argument('-nbins', help='The number of nbins used for smudgeplot matrix (nbins x nbins) (default autodetection).', type=int, default=0) - # argparser.add_argument('-kmer_file', help='Name of the input files containing kmer seuqences (assuming the same order as in the coverage file)', default = "") - argparser.add_argument('--homozygous', action="store_true", default = False, help="Assume no heterozygosity in the genome - plotting a paralog structure; (default False).") - argparser.add_argument('--just_plot', action="store_true", default = False, help="Turns off the inference of coverage and annotation of smudges; simply generates smudgeplot. (default False)") - - - # plotting arugments - argparser.add_argument('-col_ramp', help='An R palette used for the plot (default "viridis", other sensible options are "magma", "mako" or "grey.colors" - recommended in combination with --invert_cols).', default='viridis') - argparser.add_argument('--invert_cols', action="store_true", default = False, help="Revert the colour palette (default False).") - argparser.add_argument('--plot_err_line', action="store_true", default = False, help="Add a line to the plot denoting where the error k-mers and genomic k-mers will likely pair up the most (default False).") + argparser = self.add_plotting_arguments(argparser) + self.arguments = argparser.parse_args(sys.argv[2:]) def cutoff(self): @@ -95,19 +92,51 @@ def cutoff(self): argparser.add_argument('boundary', help='Which bounary to compute L (lower) or U (upper)') self.arguments = argparser.parse_args(sys.argv[2:]) - def extract(self): + def peak_agregation(self): ''' Extract kmer pairs within specified coverage sum and minor covrage ratio ranges. ''' - argparser = argparse.ArgumentParser(prog = 'smudgeplot extract', description='Extract kmer pairs within specified coverage sum and minor covrage ratio ranges.') - argparser.add_argument("-cov", "--coverageFile",required=True, help="coverage file for the kmer pairs") - argparser.add_argument("-seq", "--seqFile",required=True, help="sequences of the kmer pairs") - argparser.add_argument("-minc", "--countMin",required=True, help="lower bound of the summed coverage", type=int) - argparser.add_argument("-maxc", "--countMax",required=True, help="upper bound of the summed coverage", type=int) - argparser.add_argument("-minr", "--ratioMin",required=True, help="lower bound of minor allele ratio", type=float) - argparser.add_argument("-maxr", "--ratioMax",required=True, help="upper bound of minor allele ratio", type=float) + argparser = argparse.ArgumentParser() + argparser.add_argument('infile', nargs='?', help='name of the input tsv file with covarages and frequencies.') + argparser.add_argument('-nf', '-noise_filter', help='Do not agregate into smudge k-mer pairs with frequency lower than this parameter', type=int, default=50) + argparser.add_argument('-d', '-distance', help='Manthattan distance of k-mer pairs that are considered neioboring for the local agregation purposes.', type=int, default=5) + argparser.add_argument('--mask_errors', help='instead of reporting assignments to individual smudges, just remove all monotonically decreasing points from the error line', action="store_true", default = False) self.arguments = argparser.parse_args(sys.argv[2:]) + def all(self): + argparser = argparse.ArgumentParser() + argparser.add_argument('infile', nargs='?', help='name of the input tsv file with covarages and frequencies.') + argparser.add_argument('-o', help='The pattern used to name the output (smudgeplot).', default='smudgeplot') + argparser.add_argument('-cov_min', help='Minimal coverage to explore (default 6)', default=6, type = int) + argparser.add_argument('-cov_max', help='Maximal coverage to explore (default 50)', default=60, type = int) + argparser.add_argument('-cov', help='Define coverage instead of infering it. Disables cov_min and cov_max.', default=0, type=int) + + argparser = self.add_plotting_arguments(argparser) + + self.arguments = argparser.parse_args(sys.argv[2:]) + + def add_plotting_arguments(self, argparser): + argparser.add_argument('-c', '-cov_filter', help='Filter pairs with one of them having coverage bellow specified threshold (default 0; disables parameter L)', type=int, default=0) + argparser.add_argument('-t', '--title', help='name printed at the top of the smudgeplot (default none).', default='') + argparser.add_argument('-ylim', help='The upper limit for the coverage sum (the y axis)', type = int, default=0) + argparser.add_argument('-col_ramp', help='An R palette used for the plot (default "viridis", other sensible options are "magma", "mako" or "grey.colors" - recommended in combination with --invert_cols).', default='viridis') + argparser.add_argument('--invert_cols', action="store_true", default = False, help="Revert the colour palette (default False).") + return(argparser) + + def format_aguments_for_R_plotting(self): + plot_args = "" + if self.arguments.c != 0: + plot_args += " -c " + str(self.arguments.c) + if self.arguments.title: + plot_args += " -t \"" + self.arguments.title + "\"" + if self.arguments.ylim != 0: + plot_args += " -ylim " + str(self.arguments.ylim) + if self.arguments.col_ramp: + plot_args += " -col_ramp \"" + self.arguments.col_ramp + "\"" + if self.arguments.invert_cols: + plot_args += " --invert_cols" + return(plot_args) + ############### # task cutoff # ############### @@ -144,6 +173,160 @@ def cutoff(args): sys.stdout.write(str(U)) sys.stdout.flush() +######################## +# task peak_agregation # +######################## + +def load_hetmers(smufile): + cov_tab = read_csv(smufile, names = ['covB', 'covA', 'freq'], sep='\t') + cov_tab = cov_tab.sort_values('freq', ascending = False) + return(cov_tab) + +def local_agregation(cov_tab, distance, noise_filter, mask_errors): + # generate a dictionary that gives us for each combination of coverages a frequency + cov2freq = defaultdict(int) + cov2peak = defaultdict(int) + + L = min(cov_tab['covB']) # important only when --mask_errors is on + + ### clustering + next_peak = 1 + for idx, covB, covA, freq in cov_tab.itertuples(): + cov2freq[(covA, covB)] = freq # a make a frequency dictionary on the fly, because I don't need any value that was not processed yet + if freq < noise_filter: + break + highest_neigbour_coords = (0, 0) + highest_neigbour_freq = 0 + # for each kmer pair I will retrieve all neibours (Manhattan distance) + for xA in range(covA - distance,covA + distance + 1): + # for explored A coverage in neiborhood, we explore all possible B coordinates + distanceB = distance - abs(covA - xA) + for xB in range(covB - distanceB,covB + distanceB + 1): + xB, xA = sorted([xA, xB]) # this is to make sure xB is smaller than xA + # iterating only though those that were assigned already + # and recroding only the one with highest frequency + if cov2peak[(xA, xB)] and cov2freq[(xA, xB)] > highest_neigbour_freq: + highest_neigbour_coords = (xA, xB) + highest_neigbour_freq = cov2freq[(xA, xB)] + if highest_neigbour_freq > 0: + cov2peak[(covA, covB)] = cov2peak[(highest_neigbour_coords)] + else: + # print("new peak:", (covA, covB)) + if mask_errors: + if covB < L + distance: + cov2peak[(covA, covB)] = 1 # error line + else: + cov2peak[(covA, covB)] = 0 # central smudges + else: + cov2peak[(covA, covB)] = next_peak # if I want to keep info about all locally agregated smudges + next_peak += 1 + return(cov2peak) + +def peak_agregation(args): + ### load data + cov_tab = load_hetmers(args.infile) + + cov2peak = local_agregation(cov_tab, args.d, args.nf, mask_errors = False) + + cov_tab = cov_tab.sort_values(['covA', 'covB'], ascending = True) + for idx, covB, covA, freq in cov_tab.itertuples(): + peak = cov2peak[(covA, covB)] + sys.stdout.write(f"{covB}\t{covA}\t{freq}\t{peak}\n") + sys.stdout.flush() + +def get_smudge_container(cov_tab, cov, smudge_filter): + smudge_container = dict() + genomic_cov_tab = cov_tab[cov_tab['peak'] == 0] # this removed all the marked errors + total_kmer_pairs = sum(genomic_cov_tab['freq']) + + for Bs in range(1,9): + min_cov = 0 if Bs == 1 else cov * (Bs - 0.5) + max_cov = cov * (Bs + 0.5) + cov_tab_isoB = genomic_cov_tab.loc[(genomic_cov_tab["covB"] > min_cov) & (genomic_cov_tab["covB"] < max_cov)] # + + for As in range(Bs,(17 - Bs)): + min_cov = 0 if As == 1 else cov * (As - 0.5) + max_cov = cov * (As + 0.5) + cov_tab_iso_smudge = cov_tab_isoB.loc[(cov_tab_isoB["covA"] > min_cov) & (cov_tab_isoB["covA"] < max_cov)] + if sum(cov_tab_iso_smudge['freq']) / total_kmer_pairs > smudge_filter: + # sys.stderr.write(f"{As}A{Bs}B: {sum(cov_tab_iso_smudge['freq']) / total_kmer_pairs}\n") + smudge_container["A" * As + "B" * Bs] = cov_tab_iso_smudge + return(smudge_container) + +def get_centrality(smudge_container, cov): + centralities = list() + freqs = list() + for smudge in smudge_container.keys(): + As = smudge.count('A') + Bs = smudge.count('B') + smudge_tab = smudge_container[smudge] + kmer_in_the_smudge = sum(smudge_tab['freq']) + freqs.append(kmer_in_the_smudge) + # center as a a mean + # center_A = sum((smudge_tab['freq'] * smudge_tab['covA'])) / kmer_in_the_smudge + # center_B = sum((smudge_tab['freq'] * smudge_tab['covB'])) / kmer_in_the_smudge + # center as a mode + center = smudge_tab.loc[smudge_tab['freq'].idxmax()] + center_A = center['covA'] + center_B = center['covB'] + ## emprical to edge + # distA = min([abs(smudge_tab['covA'].max() - center['covA']), abs(center['covA'] - smudge_tab['covA'].min())]) + # distB = min([abs(smudge_tab['covB'].max() - center['covB']), abs(center['covB'] - smudge_tab['covB'].min())]) + ## theoretical to edge + # distA = min(abs(center['covA'] - (cov * (As - 0.5))), abs((cov * (As + 0.5)) - center['covA'])) + # distB = min(abs(center['covB'] - (cov * (Bs - 0.5))), abs((cov * (Bs + 0.5)) - center['covB'])) + ## theoretical relative distance to the center + distA = abs((center_A - (cov * As)) / cov) + distB = abs((center_B - (cov * Bs)) / cov) + + # sys.stderr.write(f"Processing: {As}A{Bs}B; with center: {distA}, {distB}\n") + centrality = distA + distB + centralities.append(centrality) + + if len(centralities) == 0: + return(1) + return(fmean(centralities, weights=freqs)) + +def test_coverage_range(cov_tab, min_c, max_c, smudge_size_cutoff = 0.02): + # covs_to_test = range(min_c, max_c) + covs_to_test = arange(min_c + 0.05, max_c + 0.05, 2) + cov_centralities = list() + for cov in covs_to_test: + smudge_container = get_smudge_container(cov_tab, cov, smudge_size_cutoff) + cov_centralities.append(get_centrality(smudge_container, cov)) + + best_coverage = covs_to_test[argmin(cov_centralities)] + + tenths_to_test = arange(best_coverage - 1.9, best_coverage + 1.9, 0.2) + tenths_centralities = list() + for cov in tenths_to_test: + smudge_container = get_smudge_container(cov_tab, cov, smudge_size_cutoff) + tenths_centralities.append(get_centrality(smudge_container, cov)) + + best_tenth = tenths_to_test[argmin(tenths_centralities)] + sys.stderr.write(f"Best coverage to precsion of one tenth: {round(best_tenth, 2)}\n") + + hundredths_to_test = list(arange(best_tenth - 0.19, best_tenth + 0.19, 0.01)) + hundredths_centralities = list() + for cov in hundredths_to_test: + smudge_container = get_smudge_container(cov_tab, cov, smudge_size_cutoff) + hundredths_centralities.append(get_centrality(smudge_container, cov)) + + final_cov = hundredths_to_test[argmin(hundredths_centralities)] + just_to_be_sure_cov = final_cov/2 + + hundredths_to_test.append(just_to_be_sure_cov) + smudge_container = get_smudge_container(cov_tab, just_to_be_sure_cov, smudge_size_cutoff) + hundredths_centralities.append(get_centrality(smudge_container, just_to_be_sure_cov)) + + final_cov = hundredths_to_test[argmin(hundredths_centralities)] + sys.stderr.write(f"Best coverage to precision of one hundreth: {round(final_cov, 3)}\n") + + all_coverages = concatenate((covs_to_test, tenths_to_test, hundredths_to_test)) + all_centralities = concatenate((cov_centralities, tenths_centralities, hundredths_centralities)) + + return(DataFrame({'coverage': all_coverages, 'centrality': all_centralities})) + ##################### # the script itself # ##################### @@ -172,44 +355,87 @@ def main(): plot_args += " -P" + args.tmp plot_args += " " + args.infile - sys.stderr.write("Calling: PloidyPlot " + plot_args + "\n") - system("PloidyPlot " + plot_args) + sys.stderr.write("Calling: hetmers (PloidyPlot kmer pair search) " + plot_args + "\n") + system("hetmers " + plot_args) if _parser.task == "plot": # the plotting script is expected ot be installed in the system as well as the R library supporting it args = _parser.arguments - plot_args = "-i \"" + args.infile + "\" -o \"" + args.o + "\"" - if args.q != 1: - plot_args += " -q " + str(args.q) - if args.L != 0: - plot_args += " -L " + str(args.L) - if args.c != 0: - plot_args += " -c " + str(args.c) - if args.n != 0: - plot_args += " -n " + str(args.n) - if args.title: - plot_args += " -t \"" + args.title + "\"" - if args.ylim != 0: - plot_args += " -ylim " + str(args.ylim) - if args.col_ramp: - plot_args += " -col_ramp \"" + args.col_ramp + "\"" - if args.nbins != 0: - plot_args += " -nbins " + str(args.nbins) - if args.homozygous: - plot_args += " --homozygous" - if args.invert_cols: - plot_args += " --invert_cols" - if args.plot_err_line: - plot_args += " --plot_err_line" - if args.just_plot: - plot_args += " --just_plot" + + plot_args = f'-i "{args.infile}" -s "{args.smudgefile}" -n {args.n} -o "{args.o}" ' + _parser.format_aguments_for_R_plotting() sys.stderr.write("Calling: smudgeplot_plot.R " + plot_args + "\n") system("smudgeplot_plot.R " + plot_args) - if _parser.task == "extract": - extract_kmer_pairs(_parser.arguments) + if _parser.task == "peak_agregation": + peak_agregation(_parser.arguments) + + if _parser.task == "all": + args = _parser.arguments + + sys.stderr.write("\nLoading data\n") + cov_tab = load_hetmers(args.infile) + # cov_tab = load_hetmers("data/dicots/smu_files/daAchMill1.k31_ploidy.smu.txt") + + sys.stderr.write("\nMasking errors using local agregation algorithm\n") + cov2peak = local_agregation(cov_tab, distance = 1, noise_filter = 1000, mask_errors = True) + cov_tab['peak'] = [cov2peak[(covA, covB)] for idx, covB, covA, freq in cov_tab.itertuples()] + cov_tab = cov_tab.sort_values(['covA', 'covB'], ascending = True) + total_kmers = sum(cov_tab['freq']) + genomic_kmers = sum(cov_tab[cov_tab['peak'] == 0]['freq']) + total_error_kmers = sum(cov_tab[cov_tab['peak'] == 1]['freq']) + error_fraction = total_error_kmers / total_kmers + sys.stderr.write(f"Total kmers: {total_kmers}\n\tGenomic kmers: {genomic_kmers}\n\tSequencing errors: {total_error_kmers}\n\tFraction or errors: {round(total_error_kmers/total_kmers, 3)}") + + with open(args.o + "_masked_errors_smu.txt", 'w') as error_annotated_smu: + error_annotated_smu.write("covB\tcovA\tfreq\tis_error\n") + for idx, covB, covA, freq, is_error in cov_tab.itertuples(): + error_annotated_smu.write(f"{covB}\t{covA}\t{freq}\t{is_error}\n") # might not be needed + + sys.stderr.write("\nInfering 1n coverage using grid algorihm\n") + + smudge_size_cutoff = 0.001 # this is % of all k-mer pairs smudge needs to have to be considered a valid smudge + + if args.cov == 0: # not specified user coverage + centralities = test_coverage_range(cov_tab, args.cov_min, args.cov_max, smudge_size_cutoff) + np.savetxt(args.o + "_centralities.txt", np.around(centralities, decimals=6), fmt="%.4f", delimiter = '\t') + # plot(centralities['coverage'], centralities['coverage']) + + if error_fraction < 0.75: + cov = centralities['coverage'][argmin(centralities['centrality'])] + else: + sys.stderr.write(f"Too many errors observed: {error_fraction}, not trusting coverage inference\n") + cov = 0 + + sys.stderr.write(f"\nInferred coverage: {cov}\n") + else: + cov = args.cov + + final_smudges = get_smudge_container(cov_tab, cov, 0.03) + # sys.stderr.write(str(final_smudges) + '\n') + + annotated_smudges = list(final_smudges.keys()) + smudge_sizes = [round(sum(final_smudges[smudge]['freq']) / genomic_kmers, 4) for smudge in annotated_smudges] + + sys.stderr.write(f'Detected smudges / sizes ({args.o} + "_smudge_sizes.txt):"\n') + sys.stderr.write('\t' + str(annotated_smudges) + '\n') + sys.stderr.write('\t' + str(smudge_sizes) + '\n') + + # saving smudge sizes + smudge_table = DataFrame({'smudge': annotated_smudges, 'size': smudge_sizes}) + np.savetxt(args.o + "_smudge_sizes.txt", smudge_table, fmt='%s', delimiter = '\t') + + sys.stderr.write("\nPlotting\n") + + system("centrality_plot.R " + args.o + "_centralities.txt") + # Rscript playground/alternative_fitting/alternative_plotting_testing.R -i data/dicots/peak_agregation/$ToLID.cov_tab_peaks -o data/dicots/peak_agregation/$ToLID + args = _parser.arguments + + plot_args = f'-i "{args.o}_masked_errors_smu.txt" -s "{args.o}_smudge_sizes.txt" -n {round(cov, 3)} -o "{args.o}" ' + _parser.format_aguments_for_R_plotting() + + sys.stderr.write("Calling: smudgeplot_plot.R " + plot_args + "\n") + system("smudgeplot_plot.R " + plot_args) sys.stderr.write("\nDone!\n") exit(0) diff --git a/exec/smudgeplot_plot.R b/exec/smudgeplot_plot.R index 0b9d8f1..0cc1069 100755 --- a/exec/smudgeplot_plot.R +++ b/exec/smudgeplot_plot.R @@ -2,7 +2,157 @@ suppressPackageStartupMessages(library("methods")) suppressPackageStartupMessages(library("argparse")) -suppressPackageStartupMessages(library("smudgeplot")) +suppressPackageStartupMessages(library("viridis")) + +# suppressPackageStartupMessages(library("smudgeplot")) + +################# +### funcitons ### +################# +# retirying the smudgeplot R package +get_col_ramp <- function(.args, delay = 0){ + colour_ramp <- eval(parse(text = paste0(.args$col_ramp,"(", 32 - delay, ")"))) + if (.args$invert_cols){ + colour_ramp <- rev(colour_ramp) + } + colour_ramp <- c(rep(colour_ramp[1], delay), colour_ramp) + return(colour_ramp) +} + +wtd.quantile <- function(x, q=0.25, weight=NULL) { + o <- order(x) + n <- sum(weight) + order <- 1 + (n - 1) * q + low <- pmax(floor(order), 1) + high <- pmin(ceiling(order), n) + low_contribution <- high - order + allq <- approx(x=cumsum(weight[o])/sum(weight), y=x[o], xout = c(low, high)/n, method = "constant", + f = 1, rule = 2)$y + low_contribution * allq[1] + (1 - low_contribution) * allq[2] +} + +wtd.iqr <- function(x, w=NULL) { + wtd.quantile(x, q=0.75, weight=w) - wtd.quantile(x, q=0.25, weight=w) +} + +plot_alt <- function(cov_tab, ylim, colour_ramp, log = F){ + A_equals_B <- cov_tab[, 'covA'] == cov_tab[, 'covB'] + cov_tab[A_equals_B, 'freq'] <- cov_tab[A_equals_B, 'freq'] * 2 + if (log){ + cov_tab[, 'freq'] <- log10(cov_tab[, 'freq']) + } + cov_tab$col <- colour_ramp[1 + round(31 * cov_tab[, 'freq'] / max(cov_tab[, 'freq']))] + + # c(bottom, left, top, right) + par(mar=c(4.8,4.8,1,1)) + plot(NULL, xlim = c(0, 0.5), ylim = ylim, + xlab = 'Normalized minor kmer coverage: B / (A + B)', + ylab = 'Total coverage of the kmer pair: A + B', cex.lab = 1.4, bty = 'n') + min_cov_to_plot <- max(ylim[1],min(cov_tab[, 'total_pair_cov'])) + nothing <- sapply(min_cov_to_plot:ylim[2], plot_one_coverage, cov_tab) + return(0) +} + +plot_one_coverage <- function(cov, cov_tab){ + cov_row_to_plot <- cov_tab[cov_tab[, 'total_pair_cov'] == cov, ] + width <- 1 / (2 * cov) + cov_row_to_plot$left <- cov_row_to_plot[, 'minor_variant_rel_cov'] - width + cov_row_to_plot$right <- sapply(cov_row_to_plot[, 'minor_variant_rel_cov'], function(x){ min(0.5, x + width)}) + apply(cov_row_to_plot, 1, plot_one_box, cov) +} + +plot_one_box <- function(one_box_row, cov){ + left <- as.numeric(one_box_row['left']) + right <- as.numeric(one_box_row['right']) + rect(left, cov - 0.5, right, cov + 0.5, col = one_box_row['col'], border = NA) +} + +plot_isoA_line <- function (.covA, .L, .col = "black", .ymax = 250, .lwd, .lty) { + min_covB <- .L # min(.cov_tab[, 'covB']) # should be L really + max_covB <- .covA + B_covs <- seq(min_covB, max_covB, length = 500) + isoline_x <- B_covs/ (B_covs + .covA) + isoline_y <- B_covs + .covA + lines(isoline_x[isoline_y < .ymax], isoline_y[isoline_y < .ymax], lwd = .lwd, lty = .lty, col = .col) +} + +plot_isoB_line <- function (.covB, .ymax, .col = "black", .lwd, .lty) { + cov_range <- seq((2 * .covB) - 2, .ymax, length = 500) + lines((.covB)/cov_range, cov_range, lwd = .lwd, lty = .lty, col = .col) +} + +plot_iso_grid <- function(.cov, .L, .ymax, .col = 'black', .lwd = 2, .lty = 2){ + for (i in 0:15){ + cov <- (i + 0.5) * .cov + plot_isoA_line(cov, .L = .L, .ymax = .ymax, .col, .lwd = .lwd, .lty = .lty) + if (i < 8){ + plot_isoB_line(cov, .ymax, .col, .lwd = .lwd, .lty = .lty) + } + } +} + +plot_expected_haplotype_structure <- function(.n, .peak_sizes, + .adjust = F, .cex = 1.3, xmax = 0.49){ + .peak_sizes <- .peak_sizes[.peak_sizes[, 'size'] > 0.05, ] + .peak_sizes[, 'ploidy'] <- nchar(.peak_sizes[, 'structure']) + + decomposed_struct <- strsplit(.peak_sizes[, 'structure'], '') + .peak_sizes[, 'corrected_minor_variant_cov'] <- sapply(decomposed_struct, function(x){ sum(x == 'B') } ) / .peak_sizes[, 'ploidy'] + .peak_sizes[, 'label'] <- reduce_structure_representation(.peak_sizes[, 'structure']) + + borercases <- .peak_sizes$corrected_minor_variant_cov == 0.5 + + for(i in 1:nrow(.peak_sizes)){ + # xmax is in the middle of the last square in the 2d histogram, + # which is too far from the edge, so I average it with 0.49 + # witch will pull the label bit to the edge + text( ifelse( borercases[i] & .adjust, (xmax + 0.49) / 2, .peak_sizes$corrected_minor_variant_cov[i]), + .peak_sizes$ploidy[i] * .n, .peak_sizes[i, 'label'], + offset = 0, cex = .cex, xpd = T, pos = ifelse( borercases[i] & .adjust, 2, 1)) + } +} + +reduce_structure_representation <- function(smudge_labels){ + structures_to_adjust <- (sapply(smudge_labels, nchar) > 4) + + if ( any(structures_to_adjust) ) { + decomposed_struct <- strsplit(smudge_labels[structures_to_adjust], '') + As <- sapply(decomposed_struct, function(x){ sum(x == 'A') } ) + Bs <- sapply(decomposed_struct, length) - As + smudge_labels[structures_to_adjust] <- paste0(As, 'A', Bs, 'B') + } + return(smudge_labels) +} + +plot_legend <- function(kmer_max, .colour_ramp, .log_scale = T){ + par(mar=c(0,0,2,1)) + plot.new() + print_title <- ifelse(.log_scale, 'log kmers pairs', 'kmers pairs') + title(print_title) + for(i in 1:32){ + rect(0,(i - 0.01) / 33, 0.5, (i + 0.99) / 33, col = .colour_ramp[i]) + } + # kmer_max <- max(smudge_container$dens) + if( .log_scale == T ){ + for(i in 0:6){ + text(0.75, i / 6, rounding(10^(log10(kmer_max) * i / 6)), offset = 0) + } + } else { + for(i in 0:6){ + text(0.75, i / 6, rounding(kmer_max * i / 6), offset = 0) + } + } +} + +rounding <- function(number){ + if(number > 1000){ + round(number / 1000) * 1000 + } else if (number > 100){ + round(number / 100) * 100 + } else { + round(number / 10) * 10 + } +} ############# ## SETTING ## @@ -12,7 +162,9 @@ parser <- ArgumentParser() parser$add_argument("--homozygous", action="store_true", default = F, help="Assume no heterozygosity in the genome - plotting a paralog structure; [default FALSE]") parser$add_argument("-i", "--input", default = "*_smu.txt", - help="name of the input tsv file with covarages [default \"coverages_2.tsv\"]") + help="name of the input tsv file with covarages [default \"*_smu.txt\"]") +parser$add_argument("-s", "--smudges", default = "not_specified", + help="name of the input tsv file with annotated smudges and their respective sizes") parser$add_argument("-o", "--output", default = "smudgeplot", help="name pattern used for the output files (OUTPUT_smudgeplot.png, OUTPUT_summary.txt, OUTPUT_warrnings.txt) [default \"smudgeplot\"]") parser$add_argument("-t", "--title", @@ -21,248 +173,139 @@ parser$add_argument("-q", "--quantile_filt", type = "double", help="Remove kmer pairs with coverage over the specified quantile; [default none]") parser$add_argument("-n", "--n_cov", type = "double", help="the haploid coverage of the sequencing data [default inference from data]") -parser$add_argument("-L", "--low_cutoff", type = "integer", - help="the lower boundary used when dumping kmers [default min(total_pair_cov) / 2]") parser$add_argument("-c", "-cov_filter", type = "integer", - help="Filter pairs with one of them having coverage bellow specified threshold [default 0; disables parameter L]") + help="Filter pairs with one of them having coverage bellow specified threshold [default 0]") parser$add_argument("-ylim", type = "integer", help="The upper limit for the coverage sum (the y axis)") -parser$add_argument("-nbins", type = "integer", - help="the number of nbins used for smudgeplot matrix (nbins x nbins) [default autodetection]") parser$add_argument("-col_ramp", default = "viridis", help="A colour ramp available in your R session [viridis]") parser$add_argument("--invert_cols", action="store_true", default = F, help="Set this flag to invert colorus of Smudgeplot (dark for high, light for low densities)") -parser$add_argument("--plot_err_line", action="store_true", default = F, - help="Set this flag to add a line of theh higher expected occurance of errors paired with genomic k-mers") -parser$add_argument("--just_plot", action="store_true", default = F, - help="Turns off the inference of coverage and annotation of smudges; simply generates smudgeplot. (default False)") - - + args <- parser$parse_args() colour_ramp_log <- get_col_ramp(args, 16) # create palette for the log plots colour_ramp <- get_col_ramp(args) # create palette for the linear plots -iterative_nbins <- F -if( is.null(args$nbins) ){ - args$nbins <- 40 - iterative_nbins <- T -} - -# for easier manipulation I store estimated things in a list -smudge_summary <- list() - -smudge_warn(args$output, "\n######################") -smudge_warn(args$output, "## INPUT PROCESSING ##") -smudge_warn(args$output, "######################") - if ( !file.exists(args$input) ) { stop("The input file not found. Please use --help to get help", call.=FALSE) } -cov_tab <- read.table(args$input, col.names = c('covB', 'covA', 'freq')) +cov_tab <- read.table(args$input, header = T) # col.names = c('covB', 'covA', 'freq', 'is_error'), +smudge_tab <- read.table(args$smudges, col.names = c('structure', 'size')) # total covarate of the kmer pair cov_tab[, 'total_pair_cov'] <- cov_tab[, 'covA'] + cov_tab[, 'covB'] # calcualte relative coverage of the minor allele cov_tab[, 'minor_variant_rel_cov'] <- cov_tab[, 'covB'] / cov_tab[, 'total_pair_cov'] -if ( !is.null(args$q) ){ - # quantile filtering (remove top q%, it's not really informative) - threshold <- quantile(cov_tab[, 'total_pair_cov'], args$q) - high_cov_filt <- cov_tab[, 'total_pair_cov'] < threshold - smudge_warn(args$output, "Removing", sum(cov_tab[!high_cov_filt, 'freq']), - "kmer pairs with coverage higher than", - threshold, paste0("(", args$q, " quantile)")) - cov_tab <- cov_tab[high_cov_filt, ] -} +##### coverage filtering if ( !is.null(args$c) ){ threshold <- args$c low_cov_filt <- cov_tab[, 'covA'] < threshold | cov_tab[, 'covB'] < threshold - smudge_warn(args$output, "Removing", sum(cov_tab[low_cov_filt, 'freq']), - "kmer pairs for which one of the pair had coverage below", - threshold, paste0("(Specified by argument -c ", args$c, ")")) + # smudge_warn(args$output, "Removing", sum(cov_tab[low_cov_filt, 'freq']), + # "kmer pairs for which one of the pair had coverage below", + # threshold, paste0("(Specified by argument -c ", args$c, ")")) cov_tab <- cov_tab[!low_cov_filt, ] - smudge_warn(args$output, "Processing", sum(cov_tab[, 'freq']), "kmer pairs") - L <- min(cov_tab[, 'covB']) -} else { - L <- ifelse(length(args$L) == 0, min(cov_tab[, 'covB']), args$L) + # smudge_warn(args$output, "Processing", sum(cov_tab[, 'freq']), "kmer pairs") } -smudge_summary$n_subset_est <- round(estimate_1n_coverage_1d_subsets(cov_tab), 1) -if (!args$just_plot){ - draft_n <- ifelse(length(args$n_cov) == 0, smudge_summary$n_subset_est, args$n_cov) - ylim <- c(min(cov_tab[, 'total_pair_cov']) - 1, # or 0? - min(10*draft_n, max(cov_tab[, 'total_pair_cov']))) +##### quantile filtering +if ( !is.null(args$q) ){ + # quantile filtering (remove top q%, it's not really informative) + threshold <- wtd.quantile(cov_tab[, 'total_pair_cov'], args$q, cov_tab[, 'freq']) + high_cov_filt <- cov_tab[, 'total_pair_cov'] < threshold + # smudge_warn(args$output, "Removing", sum(cov_tab[!high_cov_filt, 'freq']), + # "kmer pairs with coverage higher than", + # threshold, paste0("(", args$q, " quantile)")) + cov_tab <- cov_tab[high_cov_filt, ] +} + +cov <- args$n_cov +if (cov == wtd.quantile(cov_tab[, 'total_pair_cov'], 0.95, cov_tab[, 'freq'])){ + ylim <- c(min(cov_tab[, 'total_pair_cov']), max(cov_tab[, 'total_pair_cov'])) } else { - ylim <- c(0,max(cov_tab[, 'total_pair_cov'])) # if there is no inference, the default coverage is max cov + ylim <- c(min(cov_tab[, 'total_pair_cov']) - 1, # or 0? + min(max(100, 10*cov), max(cov_tab[, 'total_pair_cov']))) } xlim <- c(0, 0.5) +error_fraction <- sum(cov_tab[, 'is_error'] * cov_tab[, 'freq']) / sum(cov_tab[, 'freq']) * 100 +error_string <- paste("err =", round(error_fraction, 1), "%") +cov_string <- paste0("1n = ", cov) -if (!is.null(args$ylim)){ # if ylim is specified, set the bounday by the argument instead +if (!is.null(args$ylim)){ # if ylim is specified, set the boundary by the argument instead ylim[2] <- args$ylim } +fig_title <- ifelse(length(args$title) == 0, NA, args$title[1]) +# histogram_bins = max(30, args$nbins) -smudge_warn(args$output, "\n#############") -smudge_warn(args$output, "## SUMMARY ##") -smudge_warn(args$output, "#############") - -dulpicit_structures <- T -repeat { - smudge_container <- get_smudge_container(cov_tab, .nbins = args$nbins, .xlim = xlim, .ylim = ylim) - - peak_points <- peak_agregation(smudge_container) - if (args$just_plot){ - break - } - peak_sizes <- get_peak_summary(peak_points, smudge_container, 0.02) - - the_smallest_n <- min(get_trinoploid_1n_est(peak_sizes), draft_n) - smudge_summary$n_peak_est <- estimate_1n_coverage_highest_peak(peak_sizes, cov_tab, the_smallest_n) - if (length(args$n_cov) == 0) { - if( (abs(log2(smudge_summary$n_subset_est / smudge_summary$n_peak_est)) > .95 | smudge_summary$n_peak_est < smudge_summary$n_subset_est) & !args$homozygous){ - smudge_summary$n <- smudge_summary$n_subset_est - } else { - smudge_summary$n <- smudge_summary$n_peak_est - } - } else { - smudge_summary$n <- args$n_cov - } - - # if the organism is completely homozygous, all the detected kmer pairs are corresponding to paralogs - # therefore ther inference will confuse AB peak to AABB peak etc. - # that is recolvable just telling to guess half of the coverage instead - if( args$homozygous ){ - smudge_summary$n <- smudge_summary$n / 2 - smudge_summary$n_subset_est <- smudge_summary$n_subset_est / 2 - } - - peak_sizes$structure <- apply(peak_sizes, 1, - function(x){ guess_genome_structure(x, smudge_summary$n)}) +########## +# LINEAR # +########## +pdf(paste0(args$output,'_smudgeplot.pdf')) - dulpicit_structures <- any(table(peak_sizes$structure) > 1) - # if there are more smudges on the same location & if user have not specified nbins - if(dulpicit_structures & iterative_nbins){ - if(args$nbins > 20){ - args$nbins <- args$nbins - 5 - } else { - args$nbins <- args$nbins - 2 - } - smudge_warn(args$output, "detecting two smudges at the same positions, not enough data for this number of bins lowering number of bins to ", args$nbins) - } else { - break - } +# layout(matrix(c(2,4,1,3), 2, 2, byrow=T), c(3,1), c(1,3)) +layout(matrix(c(4,2,1,3), 2, 2, byrow=T), c(3,1), c(1,3)) +# 1 smudge plot +plot_alt(cov_tab, ylim, colour_ramp_log) +if (cov > 0){ + plot_expected_haplotype_structure(cov, smudge_tab, T, xmax = 0.49) } -if (!args$just_plot){ - # checks / warnings for potential inference problems - if( abs(log2(smudge_summary$n_subset_est / smudge_summary$n_peak_est)) > 1 & !args$homozygous){ - smudge_warn(args$output, "!! Warning, the two types of estimates of 1n coverage differ a lot (", - smudge_summary$n_subset_est, "and", smudge_summary$n_peak_est, ")") - smudge_warn(args$output, "i.e. at least of one of the smudgeplot methods to estimate the haploid coverage got it wrong") - smudge_warn(args$output, "Using subset estimate instead of highest peak estimate (less precise but also less often completely wrong)") - smudge_warn(args$output, "Does the smudgeplot look sane? Is at least one of the 1n estimates close a GenomeScope estimate?") - smudge_warn(args$output, "You can help us imrove this software by sharing this strange smudgeplot on https://github.com/KamilSJaron/smudgeplot/issues.") - } - if( L > (smudge_summary$n / 2) & !args$homozygous ){ - smudge_warn(args$output, "!! Warning, your coverage filter on the lower end (L = ", L, - ") is higher than half of the 1n coverage estimate ( 1n / 2 = ", round(smudge_summary$n / 2, 2)) - smudge_warn(args$output, "If the real 1n coverage is half of your estimate you would not picked it up due to the filtering.") - smudge_warn(args$output, "If you have sufficient coverage, consider reruning the analysis with lower L (something like (1n / 2) - 5)") - smudge_warn(args$output, "One good way for verificaiton would be to compare it to GenomeScope estimate of haploid coverage") - } - - peak_sizes$corrected_minor_variant_cov <- sapply(peak_sizes$structure, function(x){round(mean(unlist(strsplit(x, split = '')) == 'B'), 2)}) - peak_sizes$ploidy <- sapply(peak_sizes$structure, nchar) - - to_filter <- peak_sizes$ploidy <= 1 - if( any(to_filter) ){ - smudge_warn(args$output, paste(sum(to_filter), "peaks of kmer pairs detected with coverage < (1n_coverage * 2) =", round(smudge_summary$n * 2, 1))) - tab_to_print <- peak_sizes[to_filter,c(2,3,8,9)] - tab_to_print <- round(tab_to_print, 2) - colnames(tab_to_print) <- c('kmers_in_peak[#]', 'kmers_in_peak[proportion]', 'summit B / (A + B)', 'summit A + B') - smudge_warn(args$output, paste0(capture.output(tab_to_print), collapse = "\n")) - peak_sizes <- peak_sizes[!to_filter,] - smudge_warn(args$output, "This might be due to kmers with sequencing errors present in the kmer dump.") - smudge_warn(args$output, "Increasing L might help remove erroneous kmers.") - } - - peak_sizes$rel_size <- peak_sizes$rel_size / sum(peak_sizes$rel_size) - peak_sizes <- peak_sizes[order(peak_sizes$rel_size, decreasing = T),] - smudge_summary$peak_sizes <- peak_sizes - - # genome ploidy is the ploidy with highest number of corresponding kmer pairs regardless of topology - considered_ploidies <- unique(peak_sizes$ploidy) - ploidy_with_most_smudges <- which.max(sapply(considered_ploidies, function(x){ sum(peak_sizes[peak_sizes$ploidy == x,'rel_size']) }) ) - smudge_summary$genome_ploidy <- considered_ploidies[ploidy_with_most_smudges] - # smudge_summary$genome_ploidy <- peak_sizes$ploidy[which.max(peak_sizes$rel_size)] - - # this will be probably diploid, - # but theoretically one can imagine a species that si completely homozygous tetraploid - if( args$homozygous ){ - smudge_summary$genome_ploidy <- smudge_summary$genome_ploidy / 2 - if(!smudge_summary$genome_ploidy %in% c(2,4,6,8)){ - smudge_warn(args$output, "Guessing really strange ploidy", smudge_summary$genome_ploidy, "perhaps there is not enough coverage for a good inference.") - smudge_warn(args$output, "You can trust the plot, but guessed ploidy or peak detection might be completely off.") - } - } - generate_summary(args, smudge_summary) +# 4 legend +plot_legend(max(cov_tab[, 'freq']), colour_ramp, F) + +### add annotation +# print smudge sizes +plot.new() +if (cov > 0){ + legend('topleft', bty = 'n', reduce_structure_representation(smudge_tab[,'structure']), cex = 1.1) + legend('top', bty = 'n', legend = round(smudge_tab[,2], 2), cex = 1.1) + legend('bottomleft', bty = 'n', legend = c(cov_string, error_string), cex = 1.1) } else { - smudge_summary$n <- 0 # this is interpreted as "no infered" + legend('bottomleft', bty = 'n', legend = error_string, cex = 1.1) } -smudge_warn(args$output, "\n##########") -smudge_warn(args$output, "## PLOT ##") -smudge_warn(args$output, "##########") - -fig_title <- ifelse(length(args$title) == 0, NA, args$title[1]) +plot.new() +mtext(bquote(italic(.(fig_title))), side=3, adj=0.1, line=-3, cex = 1.6) -pdf(paste0(args$output,'_smudgeplot_log10.pdf')) - -layout(matrix(c(2,4,1,3), 2, 2, byrow=T), c(3,1), c(1,3)) -# 1 smudge plot -plot_smudgeplot(smudge_container, smudge_summary$n, colour_ramp_log) -if (!args$just_plot){ -plot_expected_haplotype_structure(smudge_summary$n, peak_sizes, T, xmax = max(smudge_container$x)) -} -if (args$plot_err_line){ - plot_seq_error_line(cov_tab) -} - -histogram_bins = max(30, args$nbins) -# 2,3 hist -plot_histograms(cov_tab, smudge_summary, fig_title, .ylim = ylim, .bins = histogram_bins) # I am testing here setting the number of bars to the same number as the number of squares -# 4 legend -plot_legend(smudge_container, colour_ramp_log) dev.off() -# replace the log transformed values by non-transformed -smudge_container$z <- smudge_container$dens +############ +# log plot # +############ -pdf(paste0(args$output,'_smudgeplot.pdf')) +pdf(paste0(args$output,'_smudgeplot_log10.pdf')) -layout(matrix(c(2,4,1,3), 2, 2, byrow=T), c(3,1), c(1,3)) +layout(matrix(c(4,2,1,3), 2, 2, byrow=T), c(3,1), c(1,3)) +# cov_tab[, 'freq'] <- log10(cov_tab[, 'freq']) # 1 smudge plot -plot_smudgeplot(smudge_container, smudge_summary$n, colour_ramp) -if (!args$just_plot){ - plot_expected_haplotype_structure(smudge_summary$n, peak_sizes, T, xmax = max(smudge_container$x)) +plot_alt(cov_tab, ylim, colour_ramp_log, log = T) + +if (cov > 0){ + plot_expected_haplotype_structure(cov, smudge_tab, T, xmax = 0.49) } -# plot error line L - 1 / cov ~ cov -if (args$plot_err_line){ - plot_seq_error_line(cov_tab) + +# 4 legend +plot_legend(max(cov_tab[, 'freq']), colour_ramp_log, T) + +# print smudge sizes +plot.new() +if (cov > 0){ + legend('topleft', bty = 'n', reduce_structure_representation(smudge_tab[,'structure']), cex = 1.1) + legend('top', bty = 'n', legend = round(smudge_tab[,2], 2), cex = 1.1) + legend('bottomleft', bty = 'n', legend = c(cov_string, error_string), cex = 1.1) +} else { + legend('bottomleft', bty = 'n', legend = error_string, cex = 1.1) } -# 2,3 hist -plot_histograms(cov_tab, smudge_summary, fig_title, - .ylim = ylim, .bins = histogram_bins) -# 4 legend -plot_legend(smudge_container, colour_ramp, F) +plot.new() +mtext(bquote(italic(.(fig_title))), side=3, adj=0.1, line=-3, cex = 1.6) -dev.off() +dev.off() \ No newline at end of file diff --git a/install.R b/install.R deleted file mode 100644 index f3647de..0000000 --- a/install.R +++ /dev/null @@ -1,5 +0,0 @@ -require(devtools) - -document() -# test() #- the tests are broken for the moment -install.packages(".", repos = NULL, type="source") diff --git a/man/annotate_peaks.Rd b/man/annotate_peaks.Rd deleted file mode 100644 index 2cc3014..0000000 --- a/man/annotate_peaks.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/annotate_peaks.R -\name{annotate_peaks} -\alias{annotate_peaks} -\title{annotate_peaks} -\usage{ -annotate_peaks(.peak_points, .min_kmerpair_cov, .max_kmerpair_cov) -} -\description{ -\code{annotate_peaks} -} diff --git a/man/annotate_summits.Rd b/man/annotate_summits.Rd deleted file mode 100644 index 6e7167a..0000000 --- a/man/annotate_summits.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/annotate_summits.R -\name{annotate_summits} -\alias{annotate_summits} -\title{annotate_summits} -\usage{ -annotate_summits( - .peak_points, - .peak_sizes, - .min_kmerpair_cov, - .max_kmerpair_cov, - col = "red" -) -} -\description{ -\code{annotate_summits} -} diff --git a/man/bw.nrdW.Rd b/man/bw.nrdW.Rd deleted file mode 100644 index 2244d7a..0000000 --- a/man/bw.nrdW.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/bw.nrdW.R -\name{bw.nrdW} -\alias{bw.nrdW} -\title{bw.nrdW} -\usage{ -bw.nrdW(x, w) -} -\description{ -\code{bw.nrdW} is a function that mimics bw.nrd0 - Silverman's ‘rule of thumb’ for estimating the kernel width, but using weighted values; -this can be useful if one wants to mimics kerned smoothing to a histogram using counts as weights -} diff --git a/man/coverage_histogram.Rd b/man/coverage_histogram.Rd deleted file mode 100644 index f6b2eda..0000000 --- a/man/coverage_histogram.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/coverage_histogram.R -\name{coverage_histogram} -\alias{coverage_histogram} -\title{coverage_histogram} -\usage{ -coverage_histogram(.cov_tab, bins, xlim, column, horiz, .col) -} -\description{ -\code{coverage_histogram} plots a histogram from a table that contains both values (what is normally x) and their respective frequencies (a column called 'freq'); -} diff --git a/man/estimate_1n_coverage_1d_subsets.Rd b/man/estimate_1n_coverage_1d_subsets.Rd deleted file mode 100644 index c975e91..0000000 --- a/man/estimate_1n_coverage_1d_subsets.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/estimate_1n_coverage_1d_subsets.R -\name{estimate_1n_coverage_1d_subsets} -\alias{estimate_1n_coverage_1d_subsets} -\title{estimate_1n_coverage_1d_subsets} -\usage{ -estimate_1n_coverage_1d_subsets(.cov_tab) -} -\description{ -\code{estimate_1n_coverage_1d_subsets} is used as the first proxy of 1n coverage. The estimate is subsequentially refined by the brightest smudge. -} diff --git a/man/estimate_1n_coverage_highest_peak.Rd b/man/estimate_1n_coverage_highest_peak.Rd deleted file mode 100644 index 2ec0176..0000000 --- a/man/estimate_1n_coverage_highest_peak.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/estimate_1n_coverage_highest_peak.R -\name{estimate_1n_coverage_highest_peak} -\alias{estimate_1n_coverage_highest_peak} -\title{estimate_1n_coverage_highest_peak} -\usage{ -estimate_1n_coverage_highest_peak(.filt_peak_sizes, .cov_tab, .draft_1n) -} -\description{ -\code{estimate_1n_coverage_highest_peak} -} diff --git a/man/filter_peak_sizes.Rd b/man/filter_peak_sizes.Rd deleted file mode 100644 index 2eb3dc0..0000000 --- a/man/filter_peak_sizes.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter_peak_sizes.R -\name{filter_peak_sizes} -\alias{filter_peak_sizes} -\title{filter_peak_sizes} -\usage{ -filter_peak_sizes(.peak_sizes, .threshold = 0.005) -} -\description{ -\code{filter_peak_sizes} -} diff --git a/man/filter_peaks.Rd b/man/filter_peaks.Rd deleted file mode 100644 index 31f1333..0000000 --- a/man/filter_peaks.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter_peaks.R -\name{filter_peaks} -\alias{filter_peaks} -\title{filter_peaks} -\usage{ -filter_peaks(.peak_points, .peak_sizes) -} -\description{ -\code{filter_peaks} -} diff --git a/man/generate_summary.Rd b/man/generate_summary.Rd deleted file mode 100644 index 50000a0..0000000 --- a/man/generate_summary.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/generate_summary.R -\name{generate_summary} -\alias{generate_summary} -\title{generate_summary} -\usage{ -generate_summary(.args, .smudge_summary) -} -\description{ -\code{generate_summary} -} diff --git a/man/get_1d_peaks.Rd b/man/get_1d_peaks.Rd deleted file mode 100644 index 4627f34..0000000 --- a/man/get_1d_peaks.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_1d_peaks.R -\name{get_1d_peaks} -\alias{get_1d_peaks} -\title{get_1d_peaks} -\usage{ -get_1d_peaks( - .cov_tab, - .num_of_peaks = 3, - .adjust = 10, - .peak_frame = data.frame(), - .depth = 1 -) -} -\description{ -\code{get_1d_peaks} is a legacy code -} diff --git a/man/get_col_ramp.Rd b/man/get_col_ramp.Rd deleted file mode 100644 index b30d8e9..0000000 --- a/man/get_col_ramp.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_col_ramp.R -\name{get_col_ramp} -\alias{get_col_ramp} -\title{get_col_ramp} -\usage{ -get_col_ramp(.args, delay = 0) -} -\description{ -\code{get_col_ramp} returns a colour ramp used by smudgeplots; -args need to contain args$col_ramp and args$invert_cols features -the first needs to be a ramp-generating function (e.g. "viridis", "grey.colors", "magma" or "mako") -args$invert_cols is just TRUE or FALSE (if to revert the colours in the palete) -} diff --git a/man/get_peak_sizes.Rd b/man/get_peak_sizes.Rd deleted file mode 100644 index b1a1e4b..0000000 --- a/man/get_peak_sizes.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_peak_sizes.R -\name{get_peak_sizes} -\alias{get_peak_sizes} -\title{get_peak_sizes} -\usage{ -get_peak_sizes(.peak_points) -} -\description{ -\code{get_peak_sizes} -} diff --git a/man/get_peak_summary.Rd b/man/get_peak_summary.Rd deleted file mode 100644 index f84d502..0000000 --- a/man/get_peak_summary.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_peak_summary.R -\name{get_peak_summary} -\alias{get_peak_summary} -\title{get_peak_summary} -\usage{ -get_peak_summary(.peak_points, .smudge_container, .treshold = 0.005) -} -\description{ -\code{get_peak_summary} -} diff --git a/man/get_smudge_container.Rd b/man/get_smudge_container.Rd deleted file mode 100644 index c72ca48..0000000 --- a/man/get_smudge_container.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_smudge_container.R -\name{get_smudge_container} -\alias{get_smudge_container} -\title{get_smudge_container} -\usage{ -get_smudge_container(.cov_tab, .nbins = 20, .xlim = c(0, 0.5), .ylim = NA) -} -\description{ -\code{get_smudge_container} is inspired by https://stackoverflow.com/a/18103689/2962344 -} diff --git a/man/get_trinoploid_1n_est.Rd b/man/get_trinoploid_1n_est.Rd deleted file mode 100644 index 1e7b16c..0000000 --- a/man/get_trinoploid_1n_est.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_trinoploid_1n_est.R -\name{get_trinoploid_1n_est} -\alias{get_trinoploid_1n_est} -\title{get_trinoploid_1n_est} -\usage{ -get_trinoploid_1n_est(.filt_peak_sizes) -} -\description{ -\code{get_trinoploid_1n_est} -} diff --git a/man/guess_genome_structure.Rd b/man/guess_genome_structure.Rd deleted file mode 100644 index 1142907..0000000 --- a/man/guess_genome_structure.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/guess_genome_structure.R -\name{guess_genome_structure} -\alias{guess_genome_structure} -\title{guess_genome_structure} -\usage{ -guess_genome_structure(.filt_peak_sizes_line, .n) -} -\description{ -\code{guess_genome_structure} -} diff --git a/man/peak_agregation.Rd b/man/peak_agregation.Rd deleted file mode 100644 index a0a55d1..0000000 --- a/man/peak_agregation.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/peak_agregation.R -\name{peak_agregation} -\alias{peak_agregation} -\title{peak_agregation} -\usage{ -peak_agregation(.smudge_container, .L = NA) -} -\arguments{ -\item{.smudge_container}{- a smudge_container object; a list with `x`,`y` axies of the 2d histogram and `dens` matrix of 2h histogram values. -Usually output of function \code{get_smudge_container}} -} -\description{ -\code{peak_agregation} is the function that agregates tiles from the smudge_container -into smudges - individual distributions -} diff --git a/man/plot_expected_haplotype_structure.Rd b/man/plot_expected_haplotype_structure.Rd deleted file mode 100644 index 56d0ccf..0000000 --- a/man/plot_expected_haplotype_structure.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_expected_haplotype_structure.R -\name{plot_expected_haplotype_structure} -\alias{plot_expected_haplotype_structure} -\title{plot_expected_haplotype_structure} -\usage{ -plot_expected_haplotype_structure( - .n, - .peak_sizes, - .adjust = F, - .cex = 1.3, - xmax = 0.49 -) -} -\description{ -\code{plot_expected_haplotype_structure} adds genome scture labels to a smudgeplot -} diff --git a/man/plot_histograms.Rd b/man/plot_histograms.Rd deleted file mode 100644 index 03833d1..0000000 --- a/man/plot_histograms.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_histograms.R -\name{plot_histograms} -\alias{plot_histograms} -\title{plot_histograms} -\usage{ -plot_histograms( - .cov_tab, - .smudge_summary, - .fig_title = NA, - .cex = 1.4, - .col = NA, - .ylim = NA, - .bins = 100 -) -} -\description{ -\code{plot_histograms} makes 2 plots - histograms of the both margins of smudge plot -} diff --git a/man/plot_legend.Rd b/man/plot_legend.Rd deleted file mode 100644 index e80d46a..0000000 --- a/man/plot_legend.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_legend.R -\name{plot_legend} -\alias{plot_legend} -\title{plot_legend} -\usage{ -plot_legend(.k, .colour_ramp, .log_scale = T) -} -\description{ -\code{plot_legend} generate topright corener in the smudgeplot -} diff --git a/man/plot_seq_error_line.Rd b/man/plot_seq_error_line.Rd deleted file mode 100644 index c89b94a..0000000 --- a/man/plot_seq_error_line.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_seq_error_line.R -\name{plot_seq_error_line} -\alias{plot_seq_error_line} -\title{plot_seq_error_line} -\usage{ -plot_seq_error_line(.cov_tab, .L = NA, .col = "black") -} -\description{ -\code{plot_seq_error_line} add error line to plot it's L / cov -} diff --git a/man/plot_smudgeplot.Rd b/man/plot_smudgeplot.Rd deleted file mode 100644 index 1787439..0000000 --- a/man/plot_smudgeplot.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot_smudgeplot.R -\name{plot_smudgeplot} -\alias{plot_smudgeplot} -\title{plot_smudgeplot} -\usage{ -plot_smudgeplot(.k, .n, .colour_ramp, .cex = 1.4) -} -\arguments{ -\item{.k}{- the matrix of densities or list of x,y axies and z, the matrix. Usually output of function} - -\item{.n}{- the 1n coverage, the coverage of the haploid genome (can be estimated by TODO function)} - -\item{.colour_ramp}{- the colour pallete for the heat colours} - -\item{.cex}{- the size of the axis labels [default 1.4]} -} -\description{ -\code{plot_smudgeplot} is the core smudgeplot function to plot the core of the smudgeplot -the cental 2D histogram/ landscape plot / heatmap / howeveryouwantyoucallthis plot -} -\author{ -Kamil Jaron \email{kamiljaron at gmail.com} -} diff --git a/man/reduce_structure_representation.Rd b/man/reduce_structure_representation.Rd deleted file mode 100644 index a04e55e..0000000 --- a/man/reduce_structure_representation.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/reduce_structure_representation.R -\name{reduce_structure_representation} -\alias{reduce_structure_representation} -\title{reduce_structure_representation} -\usage{ -reduce_structure_representation(.peak_sizes) -} -\arguments{ -\item{.peak_sizes}{- a table with peak sizes, the truly important column in there is just the "structure" one} -} -\description{ -\code{reduce_structure_representation} is the function that collapses labels loger than 4 characters into 4 character representations -} diff --git a/man/round_minor_variant_cov.Rd b/man/round_minor_variant_cov.Rd deleted file mode 100644 index ce32d1e..0000000 --- a/man/round_minor_variant_cov.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/round_minor_variant_cov.R -\name{round_minor_variant_cov} -\alias{round_minor_variant_cov} -\title{round_minor_variant_cov} -\usage{ -round_minor_variant_cov(.min_cov) -} -\description{ -\code{round_minor_variant_cov} -} diff --git a/man/rounding.Rd b/man/rounding.Rd deleted file mode 100644 index 28b63f7..0000000 --- a/man/rounding.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rounding.R -\name{rounding} -\alias{rounding} -\title{rounding} -\usage{ -rounding(number) -} -\description{ -\code{rounding} rounds to thousands or hundreds if the number is smaller than 1000 -} diff --git a/man/smudge_summary.Rd b/man/smudge_summary.Rd deleted file mode 100644 index e9babc8..0000000 --- a/man/smudge_summary.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smudge_summary.R -\name{smudge_summary} -\alias{smudge_summary} -\title{smudge_summary} -\usage{ -smudge_summary(.filename, ...) -} -\description{ -\code{smudge_summary} a function that is used to report all the summary output messages -like this I will be able to easily redirect the waring message to file as well -} diff --git a/man/smudge_warn.Rd b/man/smudge_warn.Rd deleted file mode 100644 index a5907f0..0000000 --- a/man/smudge_warn.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/smudge_warn.R -\name{smudge_warn} -\alias{smudge_warn} -\title{smudge_warn} -\usage{ -smudge_warn(.filename, ...) -} -\description{ -\code{smudge_warn} a function that is used to report all the wanrning messages -like this I will be able to easily redirect the waring message to file as well -} diff --git a/man/transform_minor_variant_cov.Rd b/man/transform_minor_variant_cov.Rd deleted file mode 100644 index af2760a..0000000 --- a/man/transform_minor_variant_cov.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transform_minor_variant_cov.R -\name{transform_minor_variant_cov} -\alias{transform_minor_variant_cov} -\title{transform_minor_variant_cov} -\usage{ -transform_minor_variant_cov(.x, .smudge_container) -} -\description{ -\code{transform_minor_variant_cov} -to transform coordinates of smudgeplot to minor_variant_rel_cov -} diff --git a/man/transform_pair_cov.Rd b/man/transform_pair_cov.Rd deleted file mode 100644 index a9e5b7e..0000000 --- a/man/transform_pair_cov.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transform_pair_cov.R -\name{transform_pair_cov} -\alias{transform_pair_cov} -\title{transform_pair_cov} -\usage{ -transform_pair_cov(.y, .smudge_container) -} -\description{ -\code{transform_pair_cov} -to transform coordinates of smudgeplot to total_pair_cov -} diff --git a/man/wtd.quantile.Rd b/man/wtd.quantile.Rd deleted file mode 100644 index c43a0fb..0000000 --- a/man/wtd.quantile.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/wtd.quantile.R -\name{wtd.quantile} -\alias{wtd.quantile} -\title{wtd.quantile & wtd.iqr} -\usage{ -wtd.quantile(x, q = 0.25, weight = NULL) -} -\description{ -\code{wtd.quantile} -calculates a quantile using "wighted data", specifically designed for "histogram-like" data -\code{wtd.iqr} -0.75 - 0.25 weighted quantile -} diff --git a/playground/alternative_fitting/README.md b/playground/alternative_fitting/README.md new file mode 100644 index 0000000..75a9f0a --- /dev/null +++ b/playground/alternative_fitting/README.md @@ -0,0 +1,250 @@ +# Sploidyplot + +The goal is to have a smudge inference based on an explicit model. I hoped for a model that would make a lot of sense - based on negative binomials. + +Gene - made his own EM algorithm, I think. I could not decipher it +Richard and Tianyi - made me an EM algorithm that work on simply coverage A and B; also using normal distributions + + +## alternative plotting + +A minimalist attempt + +```R +minidata <- daArtCamp1[daArtCamp1[, 'total_pair_cov'] < 20, ] +coverages_to_plot <- unique(minidata[, 'total_pair_cov']) +number_of_coverages_to_plot <- length(coverages_to_plot) +mini_ylim <- c(5, 20) +L <- 4 +cols <- c(rgb(1,0,0, 0.5), rgb(1,1,0, 0.5), rgb(0,1,1, 0.5), rgb(1,0,1, 0.5), rgb(0,1,0, 0.5), rgb(0,0,1, 0.5)) +# plot(1:6, pch = 20, cex = 5, col = cols) + +plot_dot_smudgeplot(minidata, rep('black', 32), xlim, mini_ylim, cex = 3) +points((L - 1) / coverages_to_plot, coverages_to_plot, cex = 3, pch = 20, col = 'blue') + +for( cov in 8:19){ + rect(0, cov - 0.5, 0.5, cov + 0.5, col = NA, border = 'black') + width <- 1 / (2 * cov) + min_ratio <- L / cov + rect(min_ratio - width, cov - 0.5, min(0.5, min_ratio + width), cov + 0.5, col = sample(cols, 1)) +} + +text(rep(0.05, number_of_coverages_to_plot), 8:19, 8:19) +``` + +This is more serious attempt that does not really work. + +Alternative local aggregation + +```bash +for ToLID in daAchMill1 daAchPtar1 daAdoMosc1 daAjuCham1 daAjuRept1 daArcMinu1 daArtCamp1 daArtMari1 daArtVulg1 daAtrBela1; do + python3 playground/alternative_fitting/pair_clustering.py data/dicots/smu_files/$ToLID.k31_ploidy.smu.txt --mask_errors > data/dicots/peak_agregation/$ToLID.cov_tab_peaks + Rscript playground/alternative_fitting/alternative_plotting_testing.R -i data/dicots/peak_agregation/$ToLID.cov_tab_peaks -o data/dicots/peak_agregation/$ToLID +done +``` + +This worked well. The agregation produced beautiful blocks, and vastly of the same shape as noticed by Richard. He suggested we should fix their shape and fit only a single parameter - coverage. + +```R +smudge_tab <- read.table('data/dicots/peak_agregation/daArtMari1.cov_tab_peaks', col.names = c('covB', 'covA', 'freq', 'smudge')) + +all_smudges <- unique(smudge_tab[, 'smudge']) +all_smudge_sizes <- sapply(all_smudges, function(x){ sum(smudge_tab[smudge_tab[, 'smudge'] == x, 'freq']) }) + +# plot(sort(all_smudge_sizes, decreasing = T) / sum(all_smudge_sizes), ylim = c(0, 1)) +# sort(all_smudge_sizes, decreasing = T) / sum(all_smudge_sizes) > 0.02 +# 2% of data soiunds reasonable + +smudges <- all_smudges[all_smudge_sizes / sum(all_smudge_sizes) > 0.02 & all_smudges != 0] +smudge_sizes <- all_smudge_sizes[all_smudge_sizes / sum(all_smudge_sizes) > 0.02 & all_smudges != 0] + +smudge_tab[, 'total_pair_cov'] <- smudge_tab[, 1] + smudge_tab[, 2] +smudge_tab[, 'minor_variant_rel_cov'] <- smudge_tab[, 1] / smudge_tab[, 'total_pair_cov'] + + +smudge_tab_reduced <- smudge_tab[smudge_tab[, 'smudge'] %in% smudges, ] +source('playground/alternative_fitting/alternative_plotting_functions.R') + +per_smudge_cov_list <- lapply(smudges, function(x){ smudge_tab_reduced[smudge_tab_reduced$smudge == x, ] }) +names(per_smudge_cov_list) <- smudges + +cov_sum_summary <- sapply(per_smudge_cov_list, function(x){ summary(x[, 'total_pair_cov']) } ) +rel_cov_summary <- sapply(per_smudge_cov_list, function(x){ summary(x[, 'minor_variant_rel_cov']) } ) + +colnames(cov_sum_summary) <- colnames(rel_cov_summary) <- smudges + +data.frame(smudges = smudges, total_pair_cov = round(cov_sum_summary[4, ], 1), minor_variant_rel_cov = round(rel_cov_summary[4, ], 3)) + +head(per_smudge_cov_list[['2']]) + +one_smudge <- per_smudge_cov_list[['2']] +one_smudge[one_smudge[, 'minor_variant_rel_cov'] == 0.5, ] + +table(one_smudge[, 'covB']) +(one_smudge[, 'minor_variant_rel_cov']) + + + + # cov_range <- seq((2 * .L) - 2, max_cov_pair, length = 500) + # lines((.L - 1)/cov_range, cov_range, lwd = 2.5, lty = 2, + +plot_peakmap(smudge_tab_reduced, xlim = c(0, 0.5), ylim = c(0, 300)) + +plot_seq_error_line(smudge_tab, 4) +plot_seq_error_line(smudge_tab, 13) +plot_seq_error_line(smudge_tab, 48) +plot_seq_error_line(smudge_tab, 80) + +one_smudge <- per_smudge_cov_list[['4']] +min(one_smudge[ ,'total_pair_cov']) + +one_smudge[one_smudge[ ,'total_pair_cov'] == 61, ] +one_smudge <- one_smudge[order(one_smudge[, 'minor_variant_rel_cov']), ] + +right_part_of_the_smudge <- one_smudge[one_smudge[ ,'minor_variant_rel_cov'] > 0.2131147, ] + +all_minor_var_rel_covs <- sort(unique(round(right_part_of_the_smudge[, 'minor_variant_rel_cov'], 2))) +corresponding_min_cov_sums <- sapply(all_minor_var_rel_covs, function(x){ min(right_part_of_the_smudge[round(right_part_of_the_smudge[, 'minor_variant_rel_cov'], 2) == x, 'total_pair_cov']) } ) + +lines(all_minor_var_rel_covs, corresponding_min_cov_sums, lwd = 3, lty = 3, col = 'red') + +subtract_line <- function(rel_cov, cov_tab){ + approx_rel_cov = round(rel_cov, 2) + band_covs = round(cov_tab[, 'minor_variant_rel_cov'], 2) == approx_rel_cov + cov_tab[band_covs, ][which.min(cov_tab[band_covs, 'total_pair_cov']), ] +} + +edge_points <- t(sapply(all_minor_var_rel_covs, subtract_line, right_part_of_the_smudge)) +total_pair_cov <- sapply(1:29, function(x){edge_points[[x,5]]}) +minor_variant_rel_cov <- sapply(1:29, function(x){edge_points[[x,6]]}) +lm(total_pair_cov ~ minor_variant_rel_cov + I(minor_variant_rel_cov^2)) + +plot_isoA_line <- function (.covA, .cov_tab, .col = "black") { + min_covB <- min(.cov_tab[, 'covB']) # should be L really + max_covB <- .covA + B_covs <- seq(min_covB, max_covB, length = 500) + lines(B_covs/ (B_covs + .covA), B_covs + .covA, lwd = 2.5, lty = 2, + col = .col) +} + +plot_isoA_line(48, smudge_tab, 'blue') +plot_isoA_line(79, smudge_tab, 'blue') +plot_isoA_line(110, smudge_tab, 'blue') +plot_isoA_line(141, smudge_tab, 'blue') +plot_isoA_line(172, smudge_tab, 'blue') + +``` + +HA, looks great! Let's plot it on the background... + +```R +library(smudgeplot) +source('playground/alternative_fitting/alternative_plotting_functions.R') +colour_ramp <- viridis(32) + + +smudge_tab <- read.table('data/dicots/peak_agregation/daAchMill1.cov_tab_errors', col.names = c('covB', 'covA', 'freq', 'is_error')) +# smudge_tab <- read.table('data/dicots/peak_agregation/daArtMari1.cov_tab_peaks', col.names = c('covB', 'covA', 'freq', 'smudge')) +smudge_tab[, 'total_pair_cov'] <- smudge_tab[, 1] + smudge_tab[, 2] +smudge_tab[, 'minor_variant_rel_cov'] <- smudge_tab[, 1] / smudge_tab[, 'total_pair_cov'] +cov = 31.1 # this is from GenomeScope this time + +plot_alt(smudge_tab[smudge_tab[, 'is_error'] != 0, ], c(0, 100), colour_ramp, T) +plot_alt(smudge_tab[smudge_tab[, 'is_error'] != 1, ], c(0, 100), colour_ramp, T) +plot_alt(smudge_tab, c(0, 100), colour_ramp, T) +plot_iso_grid(31.1, 4, 100) +plot_smudge_labels(18.1, 100) +# .peak_points, .peak_sizes, .min_kmerpair_cov, .max_kmerpair_cov, col = "red" +dev.off() + +plot_iso_grid() + +plot_smudge_labels(cov, 240) +text(0.49, cov / 2, "2err", offset = 0, cex = 1.3, xpd = T, pos = 2) +``` + +Say we will test ploidy up to 16 (capturing up to octoploid paralogs). That makes + +```R +smudge_tab_with_err <- read.table('data/dicots/peak_agregation/daAchMill1.cov_tab_errors', col.names = c('covB', 'covA', 'freq', 'is_error')) + +smudge_filtering_threshold <- 0.01 # at least 1% of genomic kmers +colour_ramp <- viridis(32) + +# # error band, done on non filtered data +# smudge_tab[, 'edgepoint'] <- F +# smudge_tab[smudge_tab[, 'covB'] < L + 3, 'edgepoint'] <- T +# plot_alt(smudge_tab[smudge_tab[, 'edgepoint'], ], c(0, 500), colour_ramp, T) + +cov <- 19.55 # this will be unknown +L <- min(smudge_tab_with_err[, 'covB']) +smudge_tab <- smudge_tab_with_err[smudge_tab_with_err[, 'is_error'] == 0, ] +genomic_kmer_pairs <- sum(smudge_tab[ ,'freq']) + +plot_alt(smudge_tab, c(0, 300), colour_ramp, T) + +smudge_tab[, 'total_pair_cov'] <- smudge_tab[, 1] + smudge_tab[, 2] +smudge_tab[, 'minor_variant_rel_cov'] <- smudge_tab[, 1] / smudge_tab[, 'total_pair_cov'] + +plot_alt(smudge_tab, c(0, 300), colour_ramp) +plot_all_smudge_labels(cov, 300) +dev.off() + +#### isolating all smudges given cov + +# total_genomic_kmer_pairs <- sum(smudge_tab[, 'freq']) + +# plot_alt(smudge_container[[1]], c(0, 300), colour_ramp, T) +# looks good! + +# two functions need to be sources from the smudgeplot package here +covs_to_test <- seq(10.05, 60.05, by = 0.1) +centrality_grid <- sapply(covs_to_test, run_replicate, smudge_tab, smudge_filtering_threshold) +covs_to_test[which.max(centrality_grid)] + +sapply(c(21.71, 21.72, 21.73), run_replicate, smudge_tab, smudge_filtering_threshold) +# 21.72 is our winner! + +tested_covs <- test_grid_of_coverages(smudge_tab, smudge_filtering_threshold, min_to_explore, max_to_explore) +plot(tested_covs[, 'cov'], tested_covs[, 'centrality']) + +``` + +Fixing the main package + +```bash +for smu_file in data/dicots/smu_files/*.k31_ploidy.smu.txt; do + ToLID=$(basename $smu_file .k31_ploidy.smu.txt); + smudgeplot.py all $smu_file -o data/dicots/grid_fits/$ToLID +done + + +``` + + +## Homopolymer compressed testing + +Datasets with lots of errors. Sacharomyces will do. + +``` +FastK -v -c -t4 -k31 -M16 -T4 data/Scer/SRR3265401_[12].fastq.gz -Ndata/Scer/FastK_Table_hc +hetmers -e4 -k -v -T4 -odata/Scer/kmerpairs_hc data/Scer/FastK_Table_hc + +smudgeplot.py hetmers -L 4 -t 4 -o data/Scer/kmerpairs_default_e --verbose data/Scer/FastK_Table + +smudgeplot.py all -o data/Scer/homopolymer_e4_wo data/Scer/kmerpairs_default_e_text.smu + +smudgeplot.py all -o data/Scer/homopolymer_e4_with data/Scer/kmerpairs_hc_text.smu +``` + +## Other + + +### .smu to smu.txt + +For the legacy `.smu` files, we have a convertor for to flat files. + +```bash +gcc src_ploidyplot/smu2text_smu.c -o exec/smu2text_smu +exec/smu2text_smu data/ddSalArbu1/ddSalArbu1.k31_ploidy.smu | less +``` \ No newline at end of file diff --git a/playground/alternative_fitting/alternative_plot_covA_covB.R b/playground/alternative_fitting/alternative_plot_covA_covB.R new file mode 100644 index 0000000..be7bd1c --- /dev/null +++ b/playground/alternative_fitting/alternative_plot_covA_covB.R @@ -0,0 +1,49 @@ +library(ggplot2) + +plot_unsquared_smudgeplot <- function(cov_tab, colour_ramp, xlim, ylim){ + # this is the adjustment for plotting + # cov_tab[cov_tab$covA == cov_tab$covB, 'freq'] <- cov_tab[cov_tab$covA == cov_tab$covB, 'freq'] * 2 + cov_tab$col = colour_ramp[1 + round(31 * cov_tab$freq / max(cov_tab$freq))] + + plot(NULL, xlim = xlim, ylim = ylim, + xlab = 'covA', + ylab = 'covB', cex.lab = 1.4) + + ## This might bite me in the a.., instead of taking L as an argument, I guess it from the data + # L = floor(min(cov_tab_daAchMill1[, 'total_pair_cov']) / 2) + ggplot(cov_tab, aes(x=covA, y=covB, weight = weight)) + + geom_bin2d() + + theme_bw() + +} + +real_clean <- read.table('data/Fiin/kmerpairs_k51_text.smu', col.names = c('covA', 'covB', 'freq')) +real_clean$weight <- real_clean$freq / sum(real_clean$freq) + +# plot(real_clean[, 'covA'], real_clean[, 'covB']) + +xlim <- range(real_clean[, 'covA']) +ylim <- range(real_clean[, 'covB']) + +library(smudgeplot) +args <- list() +args$col_ramp <- 'viridis' +args$invert_cols <- F +colour_ramp <- get_col_ramp(args) +real_clean$col <- colour_ramp[1 + round(31 * real_clean$freq / max(real_clean$freq))] + +plot(NULL, xlim = xlim, ylim = ylim, + xlab = 'covA', + ylab = 'covB', cex.lab = 1.4) + +ggplot(real_clean, aes(x=covA, y=covB, weight = weight)) + + geom_bin2d() + + theme_bw() + +head(real_clean) + + +# plotSquare <- function(row){ +# rect(as.numeric(row['covA']) - 0.5, as.numeric(row['covB']) - 0.5, as.numeric(row['covA']) + 0.5, as.numeric(row['covB']) + 0.5, col = row['col'], border = NA) +# } +# apply(real_clean, 1, plotSquare) diff --git a/playground/alternative_fitting/alternative_plotting.R b/playground/alternative_fitting/alternative_plotting.R new file mode 100644 index 0000000..512b1a6 --- /dev/null +++ b/playground/alternative_fitting/alternative_plotting.R @@ -0,0 +1,48 @@ +# HM Revenue and custom Tax Return form; +# Needs to be 2023 Jan to 2024 Jan + + + +library(smudgeplot) +source('playground/alternative_plotting_functions.R') + +cov_tab_daAchMill1 <- read.table('data/dicots/smu_files/daAchMill1.k31_ploidy.smu.txt', col.names = c('covB', 'covA', 'freq')) +ylim = c(0, 250) + +xlim = c(0, 0.5) + + + +cov_tab_daAchMill1[, 'total_pair_cov'] <- cov_tab_daAchMill1[, 1] + cov_tab_daAchMill1[, 2] +cov_tab_daAchMill1[, 'minor_variant_rel_cov'] <- cov_tab_daAchMill1[, 1] / cov_tab_daAchMill1[, 'total_pair_cov'] + +args <- list() +args$col_ramp <- 'viridis' +args$invert_cols <- F +colour_ramp <- get_col_ramp(args) +cols = colour_ramp[1 + round(31 * cov_tab_daAchMill1$freq / max(cov_tab_daAchMill1$freq))] + +# solving this "density" problem; cov1 cov1; have twice lower probability than cov1 cov2; we need to multiply these points, but it needs to be somehow corrected for the fit / summaries + +plot_dot_smudgeplot(cov_tab_daAchMill1, colour_ramp, xlim, ylim) + +plot_unsquared_smudgeplot(cov_tab_daAchMill1, colour_ramp, xlim, ylim) + +# plots the line where there will be nothing +plot_seq_error_line(cov_tab_daAchMill1, .col = 'red') + +head(cov_tab_daAchMill1[order(cov_tab_daAchMill1[,'total_pair_cov']), ], 12) +colour_ramp +3 / 8:13 + +#### + +cov_tab_Fiin_ideal <- read.table('data/Fiin/idealised/kmerpairs_idealised_with_transformations.tsv', header = T) +head(cov_tab_Fiin_ideal) + +xlim = c(0, 0.5) +ylim = c(0, max(cov_tab_Fiin_ideal[, 'total_pair_cov'])) + +plot_dot_smudgeplot(cov_tab_Fiin_ideal, colour_ramp, xlim, ylim) + +plot_unsquared_smudgeplot(cov_tab_Fiin_ideal, colour_ramp, xlim, ylim) diff --git a/playground/alternative_fitting/alternative_plotting_functions.R b/playground/alternative_fitting/alternative_plotting_functions.R new file mode 100644 index 0000000..ccd0572 --- /dev/null +++ b/playground/alternative_fitting/alternative_plotting_functions.R @@ -0,0 +1,119 @@ +plot_alt <- function(cov_tab, ylim, colour_ramp, logscale = F){ + A_equals_B <- cov_tab[, 'covA'] == cov_tab[, 'covB'] + cov_tab[A_equals_B, 'freq'] <- cov_tab[A_equals_B, 'freq'] * 2 + if (logscale){ + cov_tab[, 'freq'] <- log10(cov_tab[, 'freq']) + } + cov_tab$col <- colour_ramp[1 + round(31 * cov_tab[, 'freq'] / max(cov_tab[, 'freq']))] + + plot(NULL, xlim = c(0, 0.5), ylim = ylim, + xlab = 'Normalized minor kmer coverage: B / (A + B)', + ylab = 'Total coverage of the kmer pair: A + B', cex.lab = 1.4) + min_cov_to_plot <- max(ylim[1],min(cov_tab[, 'total_pair_cov'])) + nothing <- sapply(min_cov_to_plot:ylim[2], plot_one_coverage, cov_tab) + return(0) +} + +plot_one_coverage <- function(cov, cov_tab){ + cov_row_to_plot <- cov_tab[cov_tab[, 'total_pair_cov'] == cov, ] + width <- 1 / (2 * cov) + cov_row_to_plot$left <- cov_row_to_plot[, 'minor_variant_rel_cov'] - width + cov_row_to_plot$right <- sapply(cov_row_to_plot[, 'minor_variant_rel_cov'], function(x){ min(0.5, x + width)}) + apply(cov_row_to_plot, 1, plot_one_box, cov) +} + +plot_one_box <- function(one_box_row, cov){ + left <- as.numeric(one_box_row['left']) + right <- as.numeric(one_box_row['right']) + rect(left, cov - 0.5, right, cov + 0.5, col = one_box_row['col'], border = NA) +} + +plot_dot_smudgeplot <- function(cov_tab, colour_ramp, xlim, ylim, background_col = 'grey', cex = 0.4){ + # this is the adjustment for plotting + cov_tab[cov_tab$covA == cov_tab$covB, 'freq'] <- cov_tab[cov_tab$covA == cov_tab$covB, 'freq'] * 2 + cov_tab$col = colour_ramp[1 + round(31 * cov_tab$freq / max(cov_tab$freq))] + + plot(NULL, xlim = xlim, ylim = ylim, xlab = 'Normalized minor kmer coverage: B / (A + B)', + ylab = 'Total coverage of the kmer pair: A + B') + rect(xlim[1], ylim[1], xlim[2], ylim[2], col = background_col, border = NA) + points(cov_tab[, 'minor_variant_rel_cov'], cov_tab[, 'total_pair_cov'], col = cov_tab$col, pch = 20, cex = cex) +} + +plot_peakmap <- function(cov_tab, xlim, ylim, background_col = 'grey', cex = 2){ + # this is the adjustment for plotting + plot(NULL, xlim = xlim, ylim = ylim, xlab = 'Normalized minor kmer coverage: B / (A + B)', + ylab = 'Total coverage of the kmer pair: A + B') + points(cov_tab[, 'minor_variant_rel_cov'], cov_tab[, 'total_pair_cov'], col = cov_tab$smudge, pch = 20, cex = cex) + legend('bottomleft', col = 1:8, pch = 20, title = 'smudge', legend = 1:8) +} + +plot_seq_error_line <- function (.cov_tab, .L = NA, .col = "black") { + if (is.na(.L)) { + .L <- min(.cov_tab[, "covB"]) + } + max_cov_pair <- max(.cov_tab[, "total_pair_cov"]) + cov_range <- seq((2 * .L) - 2, max_cov_pair, length = 500) + lines((.L - 1)/cov_range, cov_range, lwd = 2.5, lty = 2, + col = .col) +} + +plot_isoA_line <- function (.covA, .L, .col = "black", .ymax = 250, .lwd, .lty) { + min_covB <- .L # min(.cov_tab[, 'covB']) # should be L really + max_covB <- .covA + B_covs <- seq(min_covB, max_covB, length = 500) + isoline_x <- B_covs/ (B_covs + .covA) + isoline_y <- B_covs + .covA + lines(isoline_x[isoline_y < .ymax], isoline_y[isoline_y < .ymax], lwd = .lwd, lty = .lty, col = .col) +} + +plot_isoB_line <- function (.covB, .ymax, .col = "black", .lwd, .lty) { + cov_range <- seq((2 * .covB) - 2, .ymax, length = 500) + lines((.covB)/cov_range, cov_range, lwd = .lwd, lty = .lty, col = .col) +} + +plot_iso_grid <- function(.cov, .L, .ymax, .col = 'black', .lwd = 2, .lty = 2){ + for (i in 0:15){ + cov <- (i + 0.5) * .cov + plot_isoA_line(cov, .L = .L, .ymax = .ymax, .col, .lwd = .lwd, .lty = .lty) + if (i < 8){ + plot_isoB_line(cov, .ymax, .col, .lwd = .lwd, .lty = .lty) + } + } +} + +plot_smudge_labels <- function(cov_est, ymax, xmax = 0.49, .cex = 1.3, .L = 4){ + for (As in 1:(floor(ymax / cov_est) - 1)){ + label <- paste0(As, "Aerr") + text(.L / (As * cov_est), (As * cov_est) + .L, label, + offset = 0, cex = .cex, xpd = T, pos = ifelse(As == 1, 3, 4)) + } + for (ploidy in 2:floor(ymax / cov_est)){ + for (Bs in 1:floor(ploidy / 2)){ + As = ploidy - Bs + label <- paste0(As, "A", Bs, "B") + text(ifelse(As == Bs, (xmax + 0.49)/2, Bs / ploidy), ploidy * cov_est, label, + offset = 0, cex = .cex, xpd = T, + pos = ifelse(As == Bs, 2, 1)) + } + } +} + +create_smudge_container <- function(cov, cov_tab, smudge_filtering_threshold){ + smudge_container <- list() + total_genomic_kmers <- sum(cov_tab[ , 'freq']) + + for (Bs in 1:8){ + cov_tab_isoB <- cov_tab[cov_tab[ , 'covB'] > cov * ifelse(Bs == 1, 0, Bs - 0.5) & cov_tab[ , 'covB'] < cov * (Bs + 0.5), ] + # cov_tab_isoB[, 'Bs'] <- Bs + cov_tab_isoB[, 'As'] <- round(cov_tab_isoB[, 'covA'] / cov) #these are be individual smudges cutouts given coverages + cov_tab_isoB[cov_tab_isoB[, 'As'] == 0, 'As'] = 1 + for( As in Bs:(16 - Bs)){ + cov_tab_one_smudge <- cov_tab_isoB[cov_tab_isoB[, 'As'] == As, ] + if (sum(cov_tab_one_smudge[, 'freq']) / total_genomic_kmers > smudge_filtering_threshold){ + label <- paste0(As, "A", Bs, "B") + smudge_container[[label]] <- cov_tab_one_smudge[,-which(names(cov_tab_one_smudge) %in% c('is_error', 'As'))] + } + } + } + return(smudge_container) +} \ No newline at end of file diff --git a/playground/alternative_fitting/alternative_plotting_testing.R b/playground/alternative_fitting/alternative_plotting_testing.R new file mode 100644 index 0000000..598cfae --- /dev/null +++ b/playground/alternative_fitting/alternative_plotting_testing.R @@ -0,0 +1,90 @@ +library(smudgeplot) +library(argparse) +source('playground/alternative_fitting/alternative_plotting_functions.R') + +parser <- ArgumentParser() +parser$add_argument("-i", "-infile", help="Input file") +parser$add_argument("-o", "-outfile", help="Output file") +args <- parser$parse_args() + +args$col_ramp <- 'viridis' +args$invert_cols <- F + +# cov_tab_daAchMill1 <- read.table('data/dicots/smu_files/daAchMill1.k31_ploidy.smu.txt', col.names = c('covB', 'covA', 'freq')) +# cov_tab <- read.table(args$file, col.names = c('covB', 'covA', 'freq')) +# args <- list() +# args$i <- 'data/ddSalArbu1/ddSalArbu1.k31_ploidy_converted.smu_with_peaks.txt' +# args$o <- 'data/ddSalArbu1/smudge_with_peaks' + +cov_tab <- read.table(args$i, col.names = c('covB', 'covA', 'freq','peak')) + +xlim = c(0, 0.5) +ylim = c(0, 300) + + +cov_tab[, 'total_pair_cov'] <- cov_tab[, 1] + cov_tab[, 2] +cov_tab[, 'minor_variant_rel_cov'] <- cov_tab[, 1] / cov_tab[, 'total_pair_cov'] + +colour_ramp <- viridis(32) +colour_ramp_log <- get_col_ramp(args, 16) +# cols = colour_ramp[1 + round(31 * cov_tab$freq / max(cov_tab$freq))] + +# solving this "density" problem; cov1 cov1; have twice lower probability than cov1 cov2; we need to multiply these points, but it needs to be somehow corrected for the fit / summaries + +plot_dot_smudgeplot(cov_tab, colour_ramp, xlim, ylim) + +pdf(paste0(args$o, '_background.pdf')) + plot_alt(cov_tab, ylim, colour_ramp, F) +dev.off() + +pdf(paste0(args$o, '_log_background.pdf')) + plot_alt(cov_tab, ylim, colour_ramp, T) +dev.off() + +pdf(paste0(args$o, '_peaks.pdf')) + plot_peakmap(cov_tab, xlim = xlim, ylim = ylim) +dev.off() + +# plots the line where there will be nothing +# plot_seq_error_line(cov_tab, .col = 'red') + +# head(cov_tab[order(cov_tab[,'total_pair_cov']), ], 12) +# colour_ramp +# 3 / 8:13 + +#### + +# cov_tab_Fiin_ideal <- read.table('data/Fiin/idealised/kmerpairs_idealised_with_transformations.tsv', header = T) +# head(cov_tab_Fiin_ideal) + +# xlim = c(0, 0.5) +# ylim = c(0, max(cov_tab_Fiin_ideal[, 'total_pair_cov'])) + +# pdf('data/Fiin/idealised/straw_plot_test3.pdf') +# plot_dot_smudgeplot(cov_tab_Fiin_ideal, colour_ramp, xlim, ylim) +# dev.off() + +# pdf('data/Fiin/idealised/straw_plot_test2.pdf') +# plot_unsquared_smudgeplot(cov_tab_Fiin_ideal, colour_ramp, xlim, ylim) +# dev.off() + +# # testing the packaged version + +# library(smudgeplot) +# args <- list() +# args$col_ramp <- 'viridis' +# args$invert_cols <- F +# colour_ramp <- get_col_ramp(args) + +# cov_tab_Fiin_ideal <- read.table('data/Fiin/idealised/kmerpairs_idealised_with_transformations.tsv', header = T) + +# pdf('data/Fiin/idealised/straw_plot_test.pdf') +# plot_alt(cov_tab_Fiin_ideal, c(50, 700), colour_ramp) +# dev.off() + +# source('playground/alternative_fitting/alternative_plotting_functions.R') + +# pdf('data/Fiin/idealised/straw_plot_test2.pdf') +# plot_unsquared_smudgeplot(cov_tab_Fiin_ideal, colour_ramp, c(0, 0.5), c(50, 700)) +# dev.off() + diff --git a/playground/alternative_fitting/pair_clustering.py b/playground/alternative_fitting/pair_clustering.py new file mode 100644 index 0000000..1cd3eb3 --- /dev/null +++ b/playground/alternative_fitting/pair_clustering.py @@ -0,0 +1,84 @@ + +# cov2freq = defualtdict(covA, covB) -> freq +# cov2peak = dict(covA, covB) -> peak +# dict(peak) -> summit (if relevant) +# import numpy as np + +import argparse +from pandas import read_csv # type: ignore +from collections import defaultdict +from statistics import mean +# import matplotlib.pyplot as plt + +#### + +parser = argparse.ArgumentParser() +parser.add_argument('infile', nargs='?', help='name of the input tsv file with covarages and frequencies.') +parser.add_argument('-nf', '-noise_filter', help='Do not agregate into smudge k-mer pairs with frequency lower than this parameter', type=int, default=50) +parser.add_argument('-d', '-distance', help='Manthattan distance of k-mer pairs that are considered neioboring for the local agregation purposes.', type=int, default=5) +parser.add_argument('--mask_errors', help='instead of reporting assignments to individual smudges, just remove all monotonically decreasing points from the error line', action="store_true", default = False) +args = parser.parse_args() + +### what should be aguments at some point +# smu_file = 'data/ddSalArbu1/ddSalArbu1.k31_ploidy_converted.smu.txt' +# distance = 5 +# noise_filter = 100 + +smu_file = args.infile +distance = args.d +noise_filter = args.nf + +### load data +# cov_tab = np.loadtxt(smu_file, dtype=int) +cov_tab = read_csv(smu_file, names = ['covB', 'covA', 'freq'], sep='\t') +cov_tab = cov_tab.sort_values('freq', ascending = False) +L = min(cov_tab['covB']) # important only when --mask_errors is on + +# generate a dictionary that gives us for each combination of coverages a frequency +cov2freq = defaultdict(int) +cov2peak = defaultdict(int) +# for idx, covB, covA, freq in cov_tab.itertuples(): +# cov2freq[(covA, covB)] = freq +# I can create this one when I iterate though the data though + +# plt.hist(means) +# plt.hist([x for x in means if x < 100 and x > -100]) +# plt.show() + +### clustering +next_peak = 1 +for idx, covB, covA, freq in cov_tab.itertuples(): + cov2freq[(covA, covB)] = freq # with this I can get rid of lines 23 24 that pre-makes this table + if freq < noise_filter: + break + highest_neigbour_coords = (0, 0) + highest_neigbour_freq = 0 + # for each kmer pair I will retrieve all neibours (Manhattan distance) + for xA in range(covA - distance,covA + distance + 1): + # for explored A coverage in neiborhood, we explore all possible B coordinates + distanceB = distance - abs(covA - xA) + for xB in range(covB - distanceB,covB + distanceB + 1): + xB, xA = sorted([xA, xB]) # this is to make sure xB is smaller than xA + # iterating only though those that were assigned already + # and recroding only the one with highest frequency + if cov2peak[(xA, xB)] and cov2freq[(xA, xB)] > highest_neigbour_freq: + highest_neigbour_coords = (xA, xB) + highest_neigbour_freq = cov2freq[(xA, xB)] + if highest_neigbour_freq > 0: + cov2peak[(covA, covB)] = cov2peak[(highest_neigbour_coords)] + else: + # print("new peak:", (covA, covB)) + if args.mask_errors: + if covB < L + args.d: + cov2peak[(covA, covB)] = 1 # error line + else: + cov2peak[(covA, covB)] = 0 # central smudges + else: + cov2peak[(covA, covB)] = next_peak # if I want to keep info about all locally agregated smudges + next_peak += 1 + +cov_tab = cov_tab.sort_values(['covA', 'covB'], ascending = True) +for idx, covB, covA, freq in cov_tab.itertuples(): + print(covB, covA, freq, cov2peak[(covA, covB)]) + # if idx > 20: + # break diff --git a/playground/fooling_around_impossible_locations.R b/playground/fooling_around_impossible_locations.R deleted file mode 100644 index f27b829..0000000 --- a/playground/fooling_around_impossible_locations.R +++ /dev/null @@ -1,113 +0,0 @@ -####### SAMPLE - -library(smudgeplot) - -cov_tab_sample <- read.table('data/testing_Scer.tsv', col.names = c('covB', 'covA', 'freq')) -cov_tab_sample[, 'total_pair_cov'] <- cov_tab_sample[, 1] + cov_tab_sample[, 2] -cov_tab_sample[, 'minor_variant_rel_cov'] <- cov_tab_sample[, 1] / cov_tab_sample[, 'total_pair_cov'] - -xlim <- c(0, 0.5) -ylim <- c(0, 150) -nbins <- 40 - -smudge_container_sample <- get_smudge_container(cov_tab_sample, nbins, xlim, ylim) - -args <- list() -args$col_ramp <- "viridis" -args$invert_cols <- FALSE -colour_ramp <- get_col_ramp(args) # get the default colour ramp (Spectral, 11) - -plot_smudgeplot(smudge_container_sample, 15.5, colour_ramp) - -######## SCER - -cov_tab <- read.table("data/Scer/PloidyPlot3_text.smu", col.names = c('covB', 'covA', 'freq')) #nolint -cov_tab[, 'total_pair_cov'] <- cov_tab[, 1] + cov_tab[, 2] -cov_tab[, 'minor_variant_rel_cov'] <- cov_tab[, 1] / cov_tab[, 'total_pair_cov'] - -smudge_container <- get_smudge_container(cov_tab, nbins, xlim, ylim) -plot_smudgeplot(smudge_container, 15.5, colour_ramp) -rect(0.4375, smudge_container$y[6], 0.4500, smudge_container$y[7], col = 'white') -rect(0, 15.5*2, 0.500, 15.5*2, col = 'grey') -rect(0.49, 0, 0.49, 475, col = 'grey') -rect(0.4935, 0, 0.49, 475, col = 'grey') - -######## - -L <- 12 -y_min <- 22.50 -y_max <- 26.25 - - -bracket_size <- diff(smudge_container$y)[1] - -allowed_ranges <- lapply(smudge_container$y, function(ymin){if(ymin + bracket_size <= 2*L){ return(c()) }; seq(max(ceiling(ymin), 2*L), ymin + bracket_size, by = 1)}) - -# this will generate a table of all possible k-mer combinations of covA and covB -impossible_ratios <- function(allowed_range, L, x_brackets){ - impossible <- rep(TRUE, length(smudge_container$x)) - if (is.null(allowed_range)) { - return (impossible) - } - minicov_tab <- merge(L:floor(max(allowed_range) / 2), (ceiling(max(allowed_range) / 2):(max(allowed_range) - L))) - colnames(minicov_tab) <- c("covB", "covA") - minicov_tab <- minicov_tab[minicov_tab[, 'covB'] <= minicov_tab[, 'covA'], ] - minicov_tab[, 'cov_ratio'] <- minicov_tab[, 'covB'] / (minicov_tab[, 'covB'] + minicov_tab[, 'covA']) - minicov_tab[, 'bracket'] <- findInterval(minicov_tab[, 'cov_ratio'], x_brackets, left.open = TRUE) - xcoords <- unique(minicov_tab[, 'bracket']) - impossible[xcoords] <- FALSE - return (impossible) -} - -rect(0.4375, smudge_container$y[6], 0.4500, smudge_container$y[7], col = 'white') - -rect(0, 0, 0.5, 10, col = 'white') -rect(0, 0, 0.05, 250, col = 'white') - -allowed_range <- allowed_ranges[[8]] -impossible_ratios(allowed_ranges[[7]], L, smudge_container$x) -impossible_ratios(allowed_ranges[[8]], L, smudge_container$x) -matrix_of_impossibles <- sapply(allowed_ranges, impossible_ratios, L, smudge_container$x) - -smudge_container <- get_smudge_container(cov_tab, nbins, xlim, ylim) -smudge_container$z[matrix_of_impossibles] <- NA -plot_smudgeplot(smudge_container, 15.5, colour_ramp) - - -L <- 12 -cov_tab_mini <- merge(c(L:20), c(L:20)) -colnames(cov_tab_mini) <- c("covA", "covB") -cov_tab_mini <- cov_tab_mini[cov_tab_mini$covB <= cov_tab_mini$covA, ] -# now cov_tab_mini is a realistic cov table - -cov_tab_mini$cov_sum <- cov_tab_mini$covB + cov_tab_mini$covA -cov_tab_mini$cov_ratio <- cov_tab_mini$covB / cov_tab_mini$cov_sum - -cov_tab_mini_first_covsum_bracket <- cov_tab_mini[cov_tab_mini$cov_sum %in% allowed_range, ] -table(cov_tab_mini_first_covsum_bracket$cov_ratio) - - -library(smudgeplot) - -args <- ArgumentParser()$parse_args() -args$output <- "data/Scer/sploidyplot_test" -args$nbins <- 40 -args$kmer_size <- 21 -args$homozygous <- FALSE -args$L <- c() -args$col_ramp <- 'viridis' -args$invert_cols <- TRUE - - - L <- 12 - smudge_container$x - smudge_container$y - is_possible <- function(x_min, x_max, y_min, y_max, L){ - y_min <- 22.50 - y_max <- 26.25 - seq(ceiling(y_min), y_max, by = 1) - x_min <- 0.4500 - x_max <- 0.4625 - if (x_max < L / y_max) - } - diff --git a/playground/interactive_plot_strawberry_full_kmer_families_fooling_around.R b/playground/interactive_plot_strawberry_full_kmer_families_fooling_around.R index 0525d32..6dc3211 100644 --- a/playground/interactive_plot_strawberry_full_kmer_families_fooling_around.R +++ b/playground/interactive_plot_strawberry_full_kmer_families_fooling_around.R @@ -1,7 +1,7 @@ library("methods") library("argparse") library("smudgeplot") -library("hexbin") +# library("hexbin") # preprocessing # to get simply number of memebers / family (exploration) @@ -13,159 +13,11 @@ library("hexbin") args <- ArgumentParser()$parse_args() args$homozygous <- F -args$input <- './data/strawberry_iinumae/kmer_counts_L109_U.tsv' -args$output = './data/strawberry_iinumae/straw' +args$input <- 'data/Fiin/kmerpairs_k51_text.smu' +args$output = './data/Fiin/testrun' args$title = 'F. iinumae' args$nbins <- 40 args$L <- NULL args$n_cov <- NULL args$k <- 21 -family_members <- table(read.table('data/strawberry_iinumae/kmer_counts_L109_U_family_members.tsv')$V1) -family_members / sum(family_members) - -# 2 3 4 5 6 7 8 -# 0.60463486 0.18252481 0.08881352 0.05260407 0.03399928 0.02262155 0.01480190 - -tranf_cov <- read.table('data/strawberry_iinumae/kmer_counts_L109_U_sums_min_max.tsv', col.names = c('sum', 'min', 'max')) - -iterative_nbins <- T - -smudge_summary <- list() - -pairs <- c(tranf_cov$min + tranf_cov$max == tranf_cov$sum) -cov <- tranf_cov[pairs,c(2,3)] -minor_variant_rel_cov <- cov$min / (cov$min + cov$max) -total_pair_cov <- cov$min + cov$max - -plot_smudge <- function(minor_variant_rel_cov, total_pair_cov, add_to_title = ''){ - L <- ifelse( length(args$L) == 0, min(total_pair_cov) / 2, args$L) - smudge_summary$n_subset_est <- 144 - draft_n <- ifelse(length(args$n_cov) == 0, smudge_summary$n_subset_est, args$n_cov) - - ymax <- min(10*draft_n, max(total_pair_cov)) - ymin <- min(total_pair_cov) - 1 - - dulpicit_structures <- T - repeat { - smudge_container <- get_smudge_container(minor_variant_rel_cov, total_pair_cov, .nbins = args$nbins, .ylim = c(ymin, ymax)) - peak_points <- peak_agregation(smudge_container) - peak_sizes <- get_peak_summary(peak_points, smudge_container, 0.02) - - the_smallest_n <- min(get_trinoploid_1n_est(peak_sizes), draft_n) - smudge_summary$n_peak_est <- estimate_1n_coverage_highest_peak(peak_sizes, minor_variant_rel_cov, total_pair_cov, the_smallest_n) - - smudge_summary$n <- ifelse(length(args$n_cov) == 0, smudge_summary$n_peak_est, args$n_cov) - - peak_sizes$structure <- apply(peak_sizes, 1, function(x){ guess_genome_structure(x, smudge_summary$n)}) - - dulpicit_structures <- any(table(peak_sizes$structure) > 1) - if(dulpicit_structures & iterative_nbins){ - if(args$nbins > 20){ - args$nbins <- args$nbins - 5 - } else { - args$nbins <- args$nbins - 2 - } - smudge_warn(args$output, "detecting two smudges at the same positions, not enough data for this number of bins lowering number of bins to ", args$nbins) - } else { - break - } - } - - peak_sizes$corrected_minor_variant_cov <- sapply(peak_sizes$structure, function(x){round(mean(unlist(strsplit(x, split = '')) == 'B'), 2)}) - peak_sizes$ploidy <- sapply(peak_sizes$structure, nchar) - - - peak_sizes$rel_size <- peak_sizes$rel_size / sum(peak_sizes$rel_size) - peak_sizes <- peak_sizes[order(peak_sizes$rel_size, decreasing = T),] - smudge_summary$peak_sizes <- peak_sizes - - considered_ploidies <- unique(peak_sizes$ploidy) - ploidy_with_most_smudges <- which.max(sapply(considered_ploidies, function(x){ sum(peak_sizes[peak_sizes$ploidy == x,'rel_size']) }) ) - smudge_summary$genome_ploidy <- considered_ploidies[ploidy_with_most_smudges] - - generate_summary(args, smudge_summary) - fig_title <- paste(args$title, add_to_title) - - layout(matrix(c(2,4,1,3), 2, 2, byrow=T), c(3,1), c(1,3)) - # 1 smudge plot - plot_smudgeplot(smudge_container, smudge_summary$n, colour_ramp) - plot_expected_haplotype_structure(smudge_summary$n, peak_sizes, T, xmax = max(smudge_container$x)) - # 2,3 hist - plot_histograms(minor_variant_rel_cov, total_pair_cov, - ymax, smudge_summary, args$nbins, fig_title) - # 4 legend - plot_legend(smudge_container, colour_ramp) -} - -plot_smudge(minor_variant_rel_cov, total_pair_cov, 'just pairs') -## OK, the C++ software kind of works, -# but the smudgeplot of the families of two DOES NOT look exactly the same as when produced by python kmer pairs script -# it's hard to say from here where is the difference comes from, but... it remains true that the smudgeplots are different - - -# Now, let look at the non diploid ones -cov <- tranf_cov[!pairs,] -minor_variant_rel_cov <- cov$min / cov$sum -total_pair_cov <- cov$sum - -pdf('figures/multi_family_covsum_hist.pdf') - plot_hist <- hist(total_pair_cov, breaks = 2000, xlim = c(0,5000)) - for(i in c(4, 6,8,10, 16,32) * 144){ - lines(c(i,i), c(0, 100000), col = 'red') - text(i, max(plot_hist$counts), labels = i / 144, pos = 4) - } -dev.off() - -pdf('figures/multi_family_smudgeplot.pdf') - plot_smudge(minor_variant_rel_cov, total_pair_cov, 'only_multi_families') -dev.off() - -minor_variant_rel_cov <- cov$max / cov$sum -total_pair_cov <- cov$sum - -low_cov_data <- c(total_pair_cov < 1300) -minor_variant_rel_cov <- minor_variant_rel_cov[low_cov_data] -total_pair_cov <- total_pair_cov[low_cov_data] - -# Create hexbin object and plot -h <- hexbin(data.frame(max_rel_cov = minor_variant_rel_cov,total_pair_cov)) - -pdf('figures/multi_family_max_rel_cov.pdf') - plot(h, colramp=viridis) -dev.off() -#plot_smudge(minor_variant_rel_cov, total_pair_cov, 'only_multi_families') - -tranf_cov$memebers <- read.table('data/strawberry_iinumae/kmer_counts_L109_U_family_members.tsv')$V1 -cov <- tranf_cov[tranf_cov$memebers == 3,] - -cov$mid <- cov$sum - (cov$min + cov$max) - -cov$min <- round(cov$min / 144) -cov$mid <- round(cov$mid / 144) -cov$max <- round(cov$max / 144) - -tri_family_tab <- table(paste(cov$max, cov$mid, cov$min)) - -tri_represented <- tri_family_tab[tri_family_tab > 5000] -tri_family_overview <- data.frame(name = names(tri_represented), count = as.numeric(tri_represented), stringsAsFactors = F) - -tri_family_overview$As <- sapply(strsplit(tri_family_overview$name, ' '), function(x)( as.numeric(x[1]) ) ) -tri_family_overview$Bs <- sapply(strsplit(tri_family_overview$name, ' '), function(x)( as.numeric(x[2]) ) ) -tri_family_overview$Cs <- sapply(strsplit(tri_family_overview$name, ' '), function(x)( as.numeric(x[3]) ) ) - -tri_family_overview$ploidy <- rowSums(tri_family_overview[,c('As', 'Bs', 'Cs')]) -tri_family_overview$smudge <- paste0(sapply(tri_family_overview$As, function(x) { paste0(rep('A', x), collapse = '') }), - sapply(tri_family_overview$Bs, function(x) { paste0(rep('B', x), collapse = '') }), - sapply(tri_family_overview$Cs, function(x) { paste0(rep('C', x), collapse = '') })) - -tri_family_overview <- tri_family_overview[order(tri_family_overview$ploidy),] -tri_barplot <- tri_family_overview$count -tri_spaces <- c(0.5, ifelse(tri_family_overview$ploidy[-1] == tri_family_overview$ploidy[-length(tri_barplot)], 0.2, 0.5)) -names(tri_barplot) <- tri_family_overview$ploidy -annot <- tri_family_overview$count > 5e4 - -pdf('figures/triallelic_smudge_sizes.pdf') - bar_pos <- barplot(tri_barplot, space = tri_spaces) - text(bar_pos[annot], tri_family_overview$count[annot], tri_family_overview$smudge[annot]) -dev.off() \ No newline at end of file diff --git a/playground/popart.R b/playground/popart.R new file mode 100644 index 0000000..dbd52db --- /dev/null +++ b/playground/popart.R @@ -0,0 +1,56 @@ +library(smudgeplot) + +args <- ArgumentParser()$parse_args() +args$output <- "data/Scer/sploidyplot_test" +args$nbins <- 40 +args$kmer_size <- 21 +args$homozygous <- FALSE +args$L <- c() +args$col_ramp <- 'viridis' +args$invert_cols <- TRUE + +cov_tab <- read.table("data/Scer/PloidyPlot3_text.smu", col.names = c('covB', 'covA', 'freq'), skip = 2) #nolint +cov_tab[, 'total_pair_cov'] <- cov_tab[, 1] + cov_tab[, 2] +cov_tab[, 'minor_variant_rel_cov'] <- cov_tab[, 1] / cov_tab[, 'total_pair_cov'] + +cov_tab_1n_est <- round(estimate_1n_coverage_1d_subsets(cov_tab), 1) + +xlim <- c(0, 0.5) +# max(total_pair_cov); 10*draft_n +ylim <- c(0, 150) +nbins <- 40 + +smudge_container <- get_smudge_container(cov_tab, nbins, xlim, ylim) +smudge_container$z <- smudge_container$dens + +plot_popart <- function(cov_tab, ylim, colour_ramp){ + A_equals_B <- cov_tab[, 'covA'] == cov_tab[, 'covB'] + cov_tab[A_equals_B, 'freq'] <- cov_tab[A_equals_B, 'freq'] * 2 + cov_tab$col <- colour_ramp[1 + round(31 * cov_tab[, 'freq'] / max(cov_tab[, 'freq']))] + + plot(NULL, xlim = c(0.1, 0.5), ylim = ylim, xaxt = "n", yaxt = "n", xlab = '', ylab = '', bty = 'n') + min_cov_to_plot <- max(ylim[1],min(cov_tab[, 'total_pair_cov'])) + nothing <- sapply(min_cov_to_plot:ylim[2], plot_one_coverage, cov_tab) + return(0) +} + +par(mfrow = c(2, 5)) + +args$col_ramp <- "viridis" +args$invert_cols <- FALSE +colour_ramp <- get_col_ramp(args) # get the default colour ramp (Spectral, 11) +# plot_smudgeplot(smudge_container, 15.5, colour_ramp) +plot_popart(cov_tab, c(20, 120), colour_ramp) + +args$invert_cols <- TRUE +colour_ramp <- get_col_ramp(args) # get the default colour ramp (Spectral, 11) +# plot_smudgeplot(smudge_container, 15.5, colour_ramp) +plot_popart(cov_tab, c(20, 120), colour_ramp) + + +for (ramp in c("grey.colors", "magma", "plasma", "mako", "inferno", "rocket", "heat.colors", "cm.colors")){ + args$col_ramp <- ramp + colour_ramp <- get_col_ramp(args) # get the default colour ramp (Spectral, 11) + plot_popart(cov_tab, c(20, 120), colour_ramp) +} + diff --git a/tests/README.md b/tests/README.md index 8880ac0..445b730 100644 --- a/tests/README.md +++ b/tests/README.md @@ -4,46 +4,61 @@ This is a place for manual tests that have not been automated (yet?). Don't forget to re-install the package/script before execution. Somehting like ``` -Rscript install.R -install -C exec/smudgeplot /usr/local/bin -install -C exec/smudgeplot_plot.R /usr/local/bin +make install INSTALL_PREFIX=~ ``` should do the job. #### interface tests -##### smudgeplot plot - -``` -./exec/smudgeplot plot tests/data/toy_middle_coverages.tsv -o tests/data/toy_middle_R -n 100 -nbins 20 -``` - -generates a smudgeplot from the toy data. +##### data prep -##### smudgeplot hetkmers +Download `SRR3265401` - nice teteraploid Sacharomyces run I use often for testing. -two different methods to extract homologous kmers +##### smudgeplot plot -###### All +Defaults: ``` -./exec/smudgeplot hetkmers -o tests/data/toy_all tests/data/toy_kmer_k21.dump +smudgeplot.py all data/Scer/kmerpairs_default_e2_text.smu -o data/Scer/240918_trial ``` -###### Middle +Testing parameters: ``` -./exec/smudgeplot hetkmers -o tests/data/toy_middle tests/data/toy_kmer_k21.dump --middle +smudgeplot.py all data/Scer/kmerpairs_default_e2_text.smu -o data/Scer/240918_trial_params -t "Species 1" -c 20 -ylim 80 -col_ramp magma --invert_cols ``` -generates two files `tests/data/toy_middle_coverages.tsv` and `tests/data/toy_middle_sequences.tsv` with coverages and sequences of kmer pairs. +##### smudgeplot hetkmers + +two different methods to extract homologous kmers + +TODO +##### Dicots -##### smudgeplot kmer extraction +This is a large dataset of the first 540 dicot genomes sequenced by the Tree of Life. Some of them are completed, some of them are with insufficient coverage or otherwise qc failed data. The idea here is to be able to tell those apart, get reasonable defaults so the generated plot is meaningful for a reasonable number (i.e. nearly all) of them. ``` -exec/smudgeplot.py extract -cov tests/data/toy_all_coverages.tsv -seq tests/data/toy_all_sequences.tsv -minc 400 -maxc 410 -minr 0.49 -maxr 0.5 | head +time ./exec/smudgeplot.py plot data/dicots/smu_files/daAchMill1.k31_ploidy.smu.txt -o data/dicots/alt_plots/daAchMill1 --alt_plot -q 0.9 ``` -would extract 80 lines made of 20 kmers pairs, but showing only the first ten. \ No newline at end of file +```bash +for smu in data/dicots/smu_files/*.smu.txt; do + species=$(basename $smu) + echo $species $smu + time ./exec/smudgeplot.py plot $smu -o data/dicots/alt_plots/$species --alt_plot -q 0.9 +done + +for smu in data/dicots/smu_files/*.smu.txt; do + species=$(basename $smu .smu.txt) + echo $species $smu + time ./exec/smudgeplot.py plot $smu -c 10 -o data/dicots/alt_plots_c10/$species --alt_plot -q 0.9 +done + +for smu in $(ls data/dicots/smu_files/*.smu.txt | head -20); do + species=$(basename $smu .smu.txt) + echo $species $smu + smudgeplot.py all $smu -t $species -o data/dicots/automated_smudgeplots/$species +done +``` \ No newline at end of file diff --git a/tests/testthat.R b/tests/testthat.R deleted file mode 100644 index c687ba7..0000000 --- a/tests/testthat.R +++ /dev/null @@ -1,4 +0,0 @@ -library(testthat) -library(smudgeplot) - -test_package("smudgeplot") \ No newline at end of file diff --git a/tests/testthat/fmlo_smaple_cov_file.tsv b/tests/testthat/fmlo_smaple_cov_file.tsv deleted file mode 100644 index 68f82c0..0000000 --- a/tests/testthat/fmlo_smaple_cov_file.tsv +++ /dev/null @@ -1,1000 +0,0 @@ -203 452 -199 439 -390 484 -235 482 -229 334 -220 407 -197 394 -183 192 -195 211 -633 728 -193 552 -188 188 -197 414 -177 409 -217 455 -197 402 -181 359 -218 396 -244 249 -185 225 -217 409 -197 431 -231 242 -167 408 -215 379 -206 396 -198 637 -195 468 -398 635 -191 465 -178 193 -247 440 -203 424 -196 367 -185 434 -222 1409 -234 1083 -202 406 -212 433 -203 409 -412 526 -558 720 -195 412 -216 454 -204 411 -251 689 -242 475 -178 181 -178 580 -438 686 -166 212 -191 412 -208 452 -174 355 -221 634 -494 508 -202 351 -162 377 -229 445 -517 611 -183 384 -252 437 -199 397 -452 693 -172 202 -640 647 -223 379 -234 539 -421 454 -202 387 -222 785 -422 458 -213 233 -404 604 -264 465 -562 873 -201 472 -608 1263 -232 496 -256 358 -244 406 -222 436 -220 393 -178 447 -447 521 -198 609 -506 905 -204 205 -457 615 -209 402 -171 307 -218 429 -223 230 -349 416 -242 818 -211 414 -197 394 -180 383 -200 372 -151 348 -220 413 -622 642 -155 403 -193 475 -238 422 -171 365 -137 208 -200 374 -234 435 -201 366 -229 404 -200 404 -206 452 -177 407 -227 306 -213 481 -219 449 -193 301 -134 624 -183 560 -228 487 -190 399 -194 704 -183 388 -168 209 -202 415 -571 662 -175 373 -225 417 -203 367 -199 249 -206 426 -240 818 -145 282 -219 432 -165 204 -235 545 -195 240 -172 395 -151 722 -270 399 -226 242 -161 324 -217 438 -234 988 -216 640 -182 432 -224 396 -707 1331 -198 223 -214 404 -219 265 -213 407 -402 674 -178 215 -193 705 -190 262 -199 424 -178 195 -218 439 -232 526 -177 404 -207 441 -382 690 -221 227 -215 462 -140 285 -225 590 -215 402 -218 438 -257 435 -210 260 -202 443 -243 392 -202 405 -195 452 -192 361 -187 199 -231 450 -211 444 -225 471 -234 411 -216 580 -211 701 -158 353 -208 377 -138 235 -174 492 -222 419 -234 414 -397 540 -160 381 -214 420 -197 381 -199 399 -221 575 -577 704 -210 249 -166 355 -204 1379 -224 431 -174 398 -176 1405 -268 489 -377 621 -195 603 -226 1373 -209 604 -211 415 -222 495 -129 417 -190 397 -195 896 -228 604 -693 1236 -204 229 -195 617 -234 428 -245 402 -190 426 -146 484 -196 390 -187 440 -174 372 -225 478 -175 367 -565 594 -381 473 -210 455 -209 453 -1108 1452 -196 414 -207 369 -472 918 -220 470 -192 394 -239 446 -200 1252 -211 216 -187 470 -179 460 -164 370 -222 397 -194 396 -180 361 -409 753 -630 706 -407 671 -180 400 -579 649 -216 224 -235 253 -180 421 -638 641 -212 217 -227 400 -209 416 -234 393 -202 385 -244 451 -220 459 -202 424 -177 416 -191 371 -195 205 -216 435 -215 457 -343 747 -233 468 -198 464 -193 203 -220 479 -217 246 -219 505 -203 219 -257 749 -219 627 -187 388 -228 498 -177 650 -691 755 -1057 1075 -600 645 -188 336 -208 397 -198 395 -234 421 -209 447 -258 451 -171 359 -208 446 -207 381 -245 442 -189 331 -243 678 -203 402 -217 419 -179 295 -177 371 -188 424 -203 358 -192 413 -230 454 -237 641 -474 496 -204 428 -170 392 -207 865 -225 662 -420 625 -196 367 -267 454 -440 1428 -167 362 -193 1169 -208 447 -224 451 -249 468 -233 556 -235 1044 -278 445 -192 405 -592 639 -178 354 -409 582 -170 464 -427 833 -375 660 -171 383 -193 195 -443 618 -209 234 -190 205 -202 425 -537 584 -190 370 -219 424 -187 493 -195 196 -575 692 -247 359 -187 574 -228 382 -462 1467 -177 402 -248 390 -435 874 -182 379 -340 547 -215 417 -363 598 -171 704 -225 456 -198 415 -201 537 -203 404 -155 307 -211 381 -169 519 -392 478 -205 635 -257 477 -205 216 -178 388 -214 376 -204 425 -306 457 -191 327 -215 431 -575 1101 -587 601 -162 374 -285 459 -231 472 -200 676 -203 816 -609 623 -177 372 -214 428 -335 613 -189 710 -370 963 -221 237 -164 184 -220 410 -405 578 -203 403 -243 509 -195 347 -208 451 -193 433 -208 398 -197 424 -208 401 -224 434 -156 363 -175 441 -176 183 -253 420 -203 637 -191 416 -207 413 -228 362 -203 408 -226 683 -141 387 -160 377 -194 420 -229 440 -181 388 -226 419 -195 536 -206 579 -217 587 -201 361 -498 631 -170 181 -424 673 -204 431 -229 459 -185 416 -216 445 -192 381 -164 246 -232 446 -174 189 -195 440 -211 379 -202 369 -205 376 -456 1056 -481 553 -219 375 -372 588 -206 255 -139 197 -276 402 -1115 1486 -191 437 -178 580 -220 626 -247 433 -209 212 -392 431 -172 389 -163 557 -210 390 -398 944 -187 352 -258 446 -203 204 -202 378 -166 210 -405 499 -242 416 -179 377 -217 626 -223 510 -165 344 -664 735 -231 652 -219 438 -232 498 -170 184 -234 387 -743 750 -190 473 -213 1248 -230 486 -163 434 -608 621 -215 365 -365 659 -195 409 -187 430 -596 670 -247 454 -224 386 -199 408 -235 473 -220 476 -299 1189 -207 473 -215 432 -279 466 -177 191 -331 624 -372 416 -441 862 -191 699 -213 388 -225 306 -188 380 -180 356 -197 222 -194 230 -218 446 -562 722 -218 470 -240 424 -626 649 -228 536 -199 390 -226 589 -254 451 -200 220 -237 418 -190 195 -212 480 -228 399 -198 648 -222 508 -217 244 -202 364 -198 428 -236 485 -402 442 -207 762 -248 435 -241 434 -398 670 -396 450 -216 412 -201 463 -185 406 -234 472 -170 359 -196 395 -245 446 -221 424 -214 508 -159 179 -133 278 -212 240 -240 700 -197 429 -238 430 -231 432 -229 429 -198 422 -195 442 -198 443 -188 209 -410 685 -234 457 -250 466 -220 518 -167 544 -170 203 -476 494 -208 405 -280 316 -199 818 -207 468 -232 499 -219 442 -185 441 -419 825 -210 652 -186 375 -194 433 -202 416 -180 386 -177 403 -236 460 -186 394 -697 698 -218 657 -189 406 -220 440 -208 648 -631 642 -198 316 -241 483 -181 397 -201 650 -188 375 -191 201 -178 430 -241 456 -217 397 -223 445 -197 513 -176 574 -523 602 -190 472 -171 1047 -178 1136 -181 430 -221 437 -191 812 -454 772 -201 416 -206 382 -369 609 -247 484 -204 440 -197 377 -572 624 -208 220 -216 431 -218 432 -213 464 -340 1307 -200 448 -215 676 -188 359 -194 201 -181 390 -195 434 -196 407 -192 400 -208 373 -185 340 -214 425 -182 437 -187 627 -207 459 -163 278 -190 565 -188 401 -172 371 -219 469 -645 863 -397 808 -711 846 -208 441 -191 411 -197 373 -197 378 -170 265 -194 423 -207 436 -197 431 -447 776 -228 660 -551 613 -210 423 -590 726 -190 190 -193 405 -182 362 -226 399 -219 471 -228 237 -674 704 -214 1364 -491 495 -424 701 -199 225 -473 927 -211 392 -202 782 -206 387 -233 689 -448 730 -153 297 -140 349 -192 388 -212 327 -218 612 -254 422 -194 386 -202 438 -199 419 -233 350 -169 386 -185 512 -178 433 -442 618 -383 703 -359 600 -206 351 -209 251 -195 401 -395 627 -177 331 -261 594 -179 577 -315 420 -208 455 -205 462 -168 770 -169 424 -185 848 -423 627 -181 206 -217 429 -183 533 -226 390 -204 386 -183 300 -196 416 -192 220 -186 935 -164 369 -211 365 -697 878 -263 1129 -200 368 -627 909 -194 592 -191 339 -197 344 -209 453 -180 408 -247 723 -222 438 -514 563 -251 668 -236 599 -231 428 -218 432 -213 422 -208 764 -418 638 -175 398 -194 268 -210 433 -238 480 -190 369 -526 568 -147 238 -224 233 -220 439 -190 513 -194 206 -220 374 -214 426 -301 389 -194 217 -186 192 -160 507 -225 428 -183 375 -198 386 -224 466 -312 637 -165 196 -205 415 -423 512 -241 446 -181 226 -176 430 -218 427 -397 1136 -261 386 -213 469 -217 257 -648 684 -213 381 -207 360 -210 439 -238 454 -166 823 -203 238 -206 385 -183 484 -194 408 -236 466 -427 841 -171 424 -203 901 -211 425 -190 217 -192 497 -227 439 -228 617 -215 458 -579 654 -404 1274 -203 398 -195 375 -224 419 -230 518 -204 398 -268 895 -190 375 -203 598 -187 356 -227 454 -228 391 -229 409 -391 862 -542 604 -389 496 -166 184 -196 472 -239 250 -235 464 -613 1203 -206 987 -231 427 -464 555 -232 454 -399 678 -186 395 -177 327 -193 216 -179 416 -628 1070 -161 364 -182 437 -141 497 -195 410 -177 378 -560 605 -248 470 -405 670 -192 409 -594 656 -223 403 -206 437 -178 564 -216 401 -245 362 -205 463 -145 211 -216 255 -211 340 -210 418 -207 449 -211 389 -591 1231 -223 382 -213 215 -126 175 -381 456 -218 1003 -209 412 -231 444 -223 425 -131 227 -495 628 -187 190 -188 349 -180 483 -200 375 -179 383 -355 938 -203 387 -219 380 -178 433 -415 437 -200 233 -197 442 -218 405 -217 494 -218 229 -374 790 -188 421 -201 218 -628 682 -530 567 -215 442 -195 404 -241 452 -461 614 -217 361 -183 206 -185 401 -182 183 -234 408 -218 431 -404 1193 -576 621 -226 405 -218 471 -179 383 -169 377 -187 448 -189 387 -238 376 -220 423 -178 445 -183 417 -191 383 -232 442 -575 1143 -212 238 -414 640 -247 248 -230 323 -123 409 -199 951 -228 401 -166 432 -196 973 -227 386 -146 398 -214 610 -213 432 -343 399 -188 611 -238 690 -191 369 -169 215 -208 379 -168 369 -189 463 -190 434 -264 974 -191 649 -185 359 -167 205 -186 414 -184 653 -227 267 -430 626 -227 429 -201 398 -175 359 -191 203 -205 384 -257 467 -210 393 -260 520 -483 499 -251 478 -240 474 -202 410 -207 367 -222 425 -661 892 -192 1031 -231 507 -461 623 -262 412 -206 413 -190 470 -224 448 -196 211 -167 779 -191 405 -221 381 -383 570 -160 180 -173 455 -413 557 -229 423 -619 709 -206 383 -205 469 -218 634 -200 373 -180 388 -189 432 -201 221 -279 492 -830 833 -227 240 -208 441 -236 374 -224 448 -241 605 -165 414 -186 352 -394 887 -192 544 -181 189 -232 538 -216 404 -188 466 -191 1109 -660 668 -180 424 -215 463 -204 400 -429 864 -365 623 -276 537 -194 348 -174 379 -185 368 -540 592 -201 417 -196 204 -178 189 -208 459 -155 376 -224 234 -630 636 -173 373 -231 376 -193 346 -202 389 -220 226 -609 902 -190 204 -155 192 -197 645 -371 794 -210 461 -255 471 -243 403 -223 389 -192 407 -174 401 -400 789 -544 602 diff --git a/tests/testthat/test-estimate_1n_coverage_1d_subsets.R b/tests/testthat/test-estimate_1n_coverage_1d_subsets.R deleted file mode 100644 index dda6eb8..0000000 --- a/tests/testthat/test-estimate_1n_coverage_1d_subsets.R +++ /dev/null @@ -1,18 +0,0 @@ -context("1d subsets fitting") - -cov <- read.table('fmlo_smaple_cov_file.tsv') - -minor_variant_rel_cov <- cov$V1 / (cov$V1 + cov$V2) -total_pair_cov <- cov$V1 + cov$V2 - -high_cov_filt <- quantile(total_pair_cov, 0.99) > total_pair_cov -minor_variant_rel_cov <- minor_variant_rel_cov[high_cov_filt] -total_pair_cov <- total_pair_cov[high_cov_filt] - -draft_n <- round(estimate_1n_coverage_1d_subsets(total_pair_cov, minor_variant_rel_cov), 1) -expected_n <- 207 # from genomescope - -test_that("approximate coverage estimation", { - expect_true(abs(draft_n - expected_n) < 10) - } -) diff --git a/tests/testthat/test-fitting.R b/tests/testthat/test-fitting.R deleted file mode 100644 index 708f5cd..0000000 --- a/tests/testthat/test-fitting.R +++ /dev/null @@ -1,46 +0,0 @@ -context("integration") - -cov <- read.table('fmlo_smaple_cov_file.tsv') - -minor_variant_rel_cov <- cov$V1 / (cov$V1 + cov$V2) -total_pair_cov <- cov$V1 + cov$V2 - -high_cov_filt <- quantile(total_pair_cov, 0.99) > total_pair_cov -minor_variant_rel_cov <- minor_variant_rel_cov[high_cov_filt] -total_pair_cov <- total_pair_cov[high_cov_filt] - -draft_n <- round(estimate_1n_coverage_1d_subsets(total_pair_cov, minor_variant_rel_cov), 1) -ymax <- min(10 * draft_n, max(total_pair_cov)) -ymin <- min(total_pair_cov) - 1 - -nbins <- 15 -smudge_container <- get_smudge_container(minor_variant_rel_cov, total_pair_cov, .nbins = nbins) - -test_that("2d fitting somehow works", { - expect_gt(sum(smudge_container$dens), 0) - } -) - -# peak_points <- peak_agregation(smudge_container) -# peak_sizes <- get_peak_summary(peak_points, smudge_container) -# n <- estimate_1n_coverage_highest_peak(peak_sizes, minor_variant_rel_cov, total_pair_cov) -# peak_sizes$structure <- apply(peak_sizes, 1, -# function(x){ guess_genome_structure(x, n)}) -# peak_sizes$corrected_minor_variant_cov <- sapply(peak_sizes$structure, function(x){round(mean(unlist(strsplit(x, split = '')) == 'B'), 2)}) -# peak_sizes$ploidy <- sapply(peak_sizes$structure, nchar) -# -# genome_ploidy <- peak_sizes$ploidy[which.max(peak_sizes$rel_size)] -# -# test_that("M. flo is expected to be estimated as triploid", { -# expect_equal(genome_ploidy, 3) -# } -# ) - -# pal <- brewer.pal(11,'Spectral') -# rf <- colorRampPalette(rev(pal[3:11])) -# colour_ramp <- rf(32) -# layout(matrix(c(2,4,1,3), 2, 2, byrow=T), c(3,1), c(1,3)) -# plot_smudgeplot(smudge_container, n, colour_ramp) -# plot_expected_haplotype_structure(n, peak_sizes, T) -# plot_histograms(minor_variant_rel_cov, total_pair_cov, ymax, "TEST", genome_ploidy) -# plot_legend(smudge_container, total_pair_cov, colour_ramp) diff --git a/tests/testthat/test-get_smudge_container.R b/tests/testthat/test-get_smudge_container.R deleted file mode 100644 index 4c3b6c8..0000000 --- a/tests/testthat/test-get_smudge_container.R +++ /dev/null @@ -1,22 +0,0 @@ -context("smudge container generation") - -x <- c(2,2,2,3,3,3,3,3,3,3) -y <- c(1,1,2,2,2,2,3,3,3,3) - -cont <- get_smudge_container(x, y, .nbins = 3, .xlim = c(0, 3), .ylim = c(0, 3)) -exp <- matrix(c(0,0,0,2,1,0,0,3,4), ncol = 3, byrow = T) - -test_that("generation of smudge matrix categorize as expected", { - expect_true(all(cont$dens == exp)) - } -) - -minor_cov <- c(0.5, 0.33, 0.32, 0.25, 0.125, 0.31) -pair_cov <- c(40, 60, 61, 83, 164, 123) -cont <- get_smudge_container(minor_cov, pair_cov, .nbins = 4) -exp <- matrix(c(0,0,0,1, 0,0,1,0, 0,2,1,0, 1,0,0,0), ncol = 4, byrow = T) - -test_that("generation of smudge matrix categorize as expected", { - expect_true(all(cont$dens == exp)) - } -) diff --git a/tests/testthat/test-get_summary.R b/tests/testthat/test-get_summary.R deleted file mode 100644 index 5b8f1ec..0000000 --- a/tests/testthat/test-get_summary.R +++ /dev/null @@ -1,49 +0,0 @@ -context("generation of peak summary") - -cov <- read.table('fmlo_smaple_cov_file.tsv') - -minor_variant_rel_cov <- cov$V1 / (cov$V1 + cov$V2) -total_pair_cov <- cov$V1 + cov$V2 - -high_cov_filt <- quantile(total_pair_cov, 0.99) > total_pair_cov -minor_variant_rel_cov <- minor_variant_rel_cov[high_cov_filt] -total_pair_cov <- total_pair_cov[high_cov_filt] - -draft_n <- 205 -ymax <- min(10 * draft_n, max(total_pair_cov)) -ymin <- min(total_pair_cov) - 1 - -nbins <- 30 -smudge_container <- get_smudge_container(minor_variant_rel_cov, total_pair_cov, .nbins = nbins) - -peak_points <- peak_agregation(smudge_container) -# peak_points[peak_points$summit == T,] - -test_that("peak agregation creates more than one smudge", { - expect_true(sum(peak_points$summit) > 1) - expect_true(any(!is.na(peak_points$peak))) - } -) - -peak_sizes <- get_peak_summary(peak_points, smudge_container, 0.05) -the_smallest_n <- get_trinoploid_1n_est(peak_sizes) - -n <- estimate_1n_coverage_highest_peak(peak_sizes, minor_variant_rel_cov, total_pair_cov, the_smallest_n) - -test_that("test that polished estimate of coverage for mflo makes sense", { - expect_true(abs(n - 207) < 10) - } -) - -peak_sizes$structure <- apply(peak_sizes, 1, - function(x){ guess_genome_structure(x, n)}) - -peak_sizes$corrected_minor_variant_cov <- sapply(peak_sizes$structure, function(x){round(mean(unlist(strsplit(x, split = '')) == 'B'), 2)}) -peak_sizes$ploidy <- sapply(peak_sizes$structure, nchar) - -genome_ploidy <- peak_sizes$ploidy[which.max(peak_sizes$rel_size)] - -test_that("mflo was estimated triploid", { - expect_true(genome_ploidy == 3) - } -)