-
Notifications
You must be signed in to change notification settings - Fork 0
/
statistics.R
103 lines (88 loc) · 3.53 KB
/
statistics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
mean_nucdiv <- df %>%
select(starts_with("nucdiv")) %>%
as.matrix() %>%
rowMeans()
# mean GC content, stdev GC content, distribution
# mean completeness, stdev completeness, distribution
# mean contamination, stdev contamination, distribution
# mean strain heterogeneity, stdev strain heterogeneity, distribution
# mean length, stdev length, distribution
# mean contigs, stdev contigs, distribution
# mean N50, stdev N50, distribution
stats_df <- read_csv2("data/mags.csv")
seqkit_df <- read_csv2("data/seqkit.csv")
coding_density_df <- read_csv2("data/coding_density.csv") %>%
select(-`...1`)
stats_df <- stats_df %>%
left_join(seqkit_df) %>%
left_join(coding_density_df)
p1 <- ggplot(stats_df, aes(x = length / 1000000)) +
geom_histogram(bins = 100,
colour = "black",
fill = "white") +
labs(x = "Genome Length (Mbp)",
y = "Number of MAGs") +
scale_x_continuous(limits = c(0, 8),
breaks = seq(0, 8)) +
scale_y_continuous(limits = c(0, 70),
breaks = seq(0, 70, by = 10)) +
theme_classic()
p2 <- ggplot(stats_df, aes(x = contigs)) +
geom_histogram(bins = 100,
colour = "black",
fill = "white") +
scale_x_continuous(breaks = seq(0, 1000, by = 100)) +
scale_y_continuous(limits = c(0, 60),
breaks = seq(0, 60, by = 10)) +
labs(x = "Contigs",
y = "Number of MAGs") +
theme_classic()
p3 <- ggplot(stats_df, aes(x = N50)) +
geom_histogram(bins = 100,
colour = "black",
fill = "white") +
scale_x_log10(labels = scales::number_format()) +
labs(y = "Number of MAGs") +
theme_classic()
pivoted_df <- stats_df %>%
pivot_longer(cols = c("completeness",
"contamination",
"CodingDensity",
"GC"),
values_to = "checkm_percent",
names_to = "checkm_parameter")
pivoted_df$checkm_parameter[pivoted_df$checkm_parameter == "CodingDensity"] <- "Coding Density"
pivoted_df$checkm_parameter[pivoted_df$checkm_parameter == "completeness"] <- "Completeness"
pivoted_df$checkm_parameter[pivoted_df$checkm_parameter == "contamination"] <- "Contamination"
pivoted_df$checkm_parameter[pivoted_df$checkm_parameter == "GC"] <- "GC Content"
pivoted_df$checkm_parameter <- factor(
pivoted_df$checkm_parameter,
levels = c("Completeness", "Contamination", "GC Content", "Coding Density")
)
p4 <- ggplot(pivoted_df, aes(x = checkm_parameter, y = checkm_percent)) +
geom_boxplot() +
scale_y_continuous(limits = c(0, 100),
breaks = seq(0, 100, by = 10)) +
labs(x = "MAG Quality Statistics",
y = "Percentage (%)") +
theme_classic()
p <- (p1 / p2 / p3 | p4) +
plot_annotation(tag_levels = "A")
ggsave("statistics.pdf",
plot = p,
width = 20,
height = 10,
device = "pdf",
path = "results")
stats_df <- data.frame("Completeness" = mean(df$completeness),
"Contamination" = mean(df$contamination),
"MeanCoverage" = mean(df$MeanCov),
"MinCoverage" = mean(df$MinCov),
"Breadth" = mean(rowMeans(ubiquity_mat)),
"DiSiperMbp" = mean(df$MeanDiSiperMbp),
"NonSynonymousFraction" = mean(df$MeanNonsynonimousFractionMean),
"NucDiv" = mean(mean_nucdiv),
"MaxRPKM" = mean(df$AbundMax)) %>%
t()
colnames(stats_df) <- "U42"
write.csv2(stats_df, "results/statistics.csv")