-
Notifications
You must be signed in to change notification settings - Fork 0
/
tmp.R
118 lines (87 loc) · 3.21 KB
/
tmp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
library(readr)
library(reshape)
library(doParallel)
file_dir <- "data/"
file_vec <- list.files(path = file_dir, pattern = "*.contigs.csv")
file_list <- list()
for (n in 1:length(file_vec)){
file_list[[n]] <- read_csv(paste0(file_dir, file_vec[n]))
}
all_df <- merge_all(file_list) %>%
group_by(genome) %>%
summarise(Scaffolds = mean(true_scaffolds)) %>%
mutate(genome = gsub("matabat2bin.", "", gsub(".fa", "", genome)))
df <- df %>% left_join(all_df)
write_csv2(df, "data/mags.csv")
file_merger <- function(pattern, dir = "data/", method = read_csv2) {
file_vec <- list.files(path = dir, pattern = pattern)
file_list <- list()
file_list <- foreach (i = file_vec) {
method(paste0(file_dir, i)) %>%
na.omit() %>%
filter(genome %in% selected_genomes) %>%
mutate(sample = i,
genome = gsub(".fna", "", gsub("metabat2bin_", "", genome)),
position = as.factor(position))
}
full_df <- merge_all(file_list) %>%
group_by(genome) %>%
summarise(Scaffolds = mean(true_scaffolds)) %>%
mutate(genome = gsub("matabat2bin.", "", gsub(".fa", "", genome)))
}
snvs_files <- list.files(pattern = 'UU.*_snvs_summary.csv',
path = "data")
snvs_list <- mclapply(paste0("data/", snvs_files),
read_csv2)
snvs_file_list <- list()
sample_loader <- function(sample_name) {
read_csv2(sample_name) %>%
mutate(sample = gsub(".csv", "", sample_name),
genome = gsub("matabat2bin.", "", gsub(".fa", "", genome)))
}
###
micro_df <- merge_all(diversity_file_list) %>%
na.omit() %>%
dplyr::rename(nucdiv = nucl_diversity) %>%
group_by(genome)
gc_content <- read_tsv("data/gc.tsv", col_names = c("genome", "GC"))
gc_content <- gc_content %>%
mutate(genome = gsub("matabat2bin.", "", gsub("_k141_\\d*", "", gsub("_*UU", "UU", genome))))
gc_content <- gc_content %>%
group_by(genome) %>%
summarise(GC = mean(GC))
base_content <- read_tsv("data/base_content.tsv", col_names = c("genome", "BaseContent"))
base_content <- base_content %>%
mutate(genome = gsub("matabat2bin.", "", gsub("_k141_\\d*", "", gsub("_*UU", "UU", genome))))
base_content <- base_content %>%
group_by(genome) %>%
summarise(BaseContent = mean(BaseContent))
base_count <- read_tsv("data/base_count.tsv", col_names = c("genome", "BaseCount"))
base_count <- base_count %>%
mutate(genome = gsub("matabat2bin.", "", gsub("_k141_\\d*", "", gsub("_*UU", "UU", genome))))
base_count <- base_count %>%
group_by(genome) %>%
summarise(BaseCount = mean(BaseCount))
seqkit_df <- gc_content %>%
full_join(base_content) %>%
full_join(base_count)
seqkit_df <- filtered_df %>%
left_join(seqkit_df) %>%
select(genome, GC, BaseContent, BaseCount)
write_csv2(seqkit_df, "data/seqkit.csv")
###
a <- fviz_nbclust(heatmap_mat, kmeans, method = "wss")
b <- fviz_nbclust(nd_mat, kmeans, method = "wss")
a_mod <- a +
labs(title = "RPKM Clustering")
b_mod <- b +
labs(title = "Nucleotide Diversity Clustering") +
theme(axis.title.y = element_blank())
p <- (a_mod | b_mod) +
plot_annotation(tag_levels = "A")
ggsave("kmeans_optimal_clustering.pdf",
plot = p,
device = "pdf",
path = "results",
width = 10,
height = 5)