-
Notifications
You must be signed in to change notification settings - Fork 0
/
statistical_analysis.R
260 lines (227 loc) · 14.6 KB
/
statistical_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# Each of the below statements correspond verbatim to text in the manuscript
# so that they can be located by text search instead of reference to line number.
# Statements are indicated by double ## and other comments by single #
## Among the 36 phyla, 194 classes, 194 orders and 347 families identified...
dim(det$phylum)
dim(det$class)
dim(det$order)
dim(det$family)
## the top ten abundant classes along the Arctic Mid-Ocean Ridge (AMOR) are...
names(det$class)[order(colSums(det$class/rowSums(det$class)), decreasing = T)[c(1:5, 7:11)]] # names
sum(det$class$Alphaproteobacteria/rowSums(det$class))*(100/184)
sum(det$class$Nitrososphaeria/rowSums(det$class))*(100/184)
sum(det$class$S085/rowSums(det$class))*(100/184)
sum(det$class$Planctomycetacia/rowSums(det$class))*(100/184)
sum(det$class$Gammaproteobacteria/rowSums(det$class))*(100/184)
sum(det$class$Phycisphaerae/rowSums(det$class))*(100/184)
sum(det$class$Deltaproteobacteria/rowSums(det$class))*(100/184)
sum(det$class$`MD2896-B214`/rowSums(det$class))*(100/184)
sum(det$class$`Chloroflexi Subdivision 5 (SAR202)`/rowSums(det$class))*(100/184)
sum(det$class$Pacearchaeota/rowSums(det$class))*(100/184)
## ...all achieve median calssification rates (the percentage of correctly assigned
# samples) above 76%
tmp <- rbind(cbind(svmclass$rclass_quart[[1]]$tperm[, ncol(svmclass$rclass_quart[[1]]$tperm)], variable = names(svmclass$rclass_quart)[1], perm = "Non-permuted"),
cbind(svmclass$rclass_quart[[1]]$pperm[, ncol(svmclass$rclass_quart[[1]]$pperm)], variable = names(svmclass$rclass_quart)[1], perm = "Permuted"))
# 2
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[2]]$tperm[, ncol(svmclass$rclass_quart[[2]]$tperm)], variable = names(svmclass$rclass_quart)[2], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[2]]$pperm[, ncol(svmclass$rclass_quart[[2]]$pperm)], variable = names(svmclass$rclass_quart)[2], perm = "Permuted"))
# 3
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[3]]$tperm[, ncol(svmclass$rclass_quart[[3]]$tperm)], variable = names(svmclass$rclass_quart)[3], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[3]]$pperm[, ncol(svmclass$rclass_quart[[3]]$pperm)], variable = names(svmclass$rclass_quart)[3], perm = "Permuted"))
# 4
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[4]]$tperm[, ncol(svmclass$rclass_quart[[4]]$tperm)], variable = names(svmclass$rclass_quart)[4], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[4]]$pperm[, ncol(svmclass$rclass_quart[[4]]$pperm)], variable = names(svmclass$rclass_quart)[4], perm = "Permuted"))
# 5
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[5]]$tperm[, ncol(svmclass$rclass_quart[[5]]$tperm)], variable = names(svmclass$rclass_quart)[5], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[5]]$pperm[, ncol(svmclass$rclass_quart[[5]]$pperm)], variable = names(svmclass$rclass_quart)[5], perm = "Permuted"))
# 6
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[6]]$tperm[, ncol(svmclass$rclass_quart[[6]]$tperm)], variable = names(svmclass$rclass_quart)[6], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[6]]$pperm[, ncol(svmclass$rclass_quart[[6]]$pperm)], variable = names(svmclass$rclass_quart)[6], perm = "Permuted"))
# 7
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[7]]$tperm[, ncol(svmclass$rclass_quart[[7]]$tperm)], variable = names(svmclass$rclass_quart)[7], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[7]]$pperm[, ncol(svmclass$rclass_quart[[7]]$pperm)], variable = names(svmclass$rclass_quart)[7], perm = "Permuted"))
# 8
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[8]]$tperm[, ncol(svmclass$rclass_quart[[8]]$tperm)], variable = names(svmclass$rclass_quart)[8], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[8]]$pperm[, ncol(svmclass$rclass_quart[[8]]$pperm)], variable = names(svmclass$rclass_quart)[8], perm = "Permuted"))
# 9
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[9]]$tperm[, ncol(svmclass$rclass_quart[[9]]$tperm)], variable = names(svmclass$rclass_quart)[9], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[9]]$pperm[, ncol(svmclass$rclass_quart[[9]]$pperm)], variable = names(svmclass$rclass_quart)[9], perm = "Permuted"))
# 10
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[10]]$tperm[, ncol(svmclass$rclass_quart[[10]]$tperm)], variable = names(svmclass$rclass_quart)[10], perm = "Non-permuted"))
tmp <- rbind(tmp, cbind(svmclass$rclass_quart[[10]]$pperm[, ncol(svmclass$rclass_quart[[10]]$pperm)], variable = names(svmclass$rclass_quart)[10], perm = "Permuted"))
tmp <- as.data.frame(tmp)
names(tmp) <- c("value", "variable", "Permuted")
tmp$value <- as.numeric(as.character(tmp$value))
tmp$value <- tmp$value / 46
tmp %>%
filter(Permuted == "Non-permuted") %>%
group_by(variable) %>%
summarise(median = median(value), lq = quantile(value, 0.025), hq = quantile(value, 0.975))
# higher than by random chance
median(tmp$value[tmp$Permuted == "Permuted"])
## Between-core differences in oxygen penetration depth (10-116 cmbsf)
# as well as the much stronger correlation between per-sample
# classification accuracy and oxygen category...
tmp <- rbind(btmp$rank_abundance,
ttmp$rank_abundance,
qtmp$rank_abundance
) %>%
select(-answer) %>%
group_by(sample) %>%
summarise_at(vars(fraction), mean) %>%
as.data.frame
tmp$sample <- as.character(tmp$sample)
tmp$core <- NA
tmp$oxy_cat <- NA
tmp$oxy_cons <- NA
tmp$depth <- NA
for (i in seq(nrow(tmp))) {
tmp$core[i] <- paste(strsplit(tmp$sample[i], split = "_")[[1]][1],
strsplit(tmp$sample[i], split = "_")[[1]][2], sep = "_")
tmp$oxy_cat[i] <- det$rmap$oxy_cat[which(det$rmap$rowname == tmp$sample[i])]
tmp$oxy_cons[i] <- det$rmap$oxy_cons[which(det$rmap$rowname == tmp$sample[i])]
tmp$depth[i] <- det$rmap$depth[which(det$rmap$rowname == tmp$sample[i])]
}
tmp <- tmp[order(tmp$fraction, decreasing = T), ]
cor.test(tmp$fraction, tmp$oxy_cat, method = "spearman") # with oxygen category
cor.test(tmp$fraction, as.numeric(tmp$depth), method = "spearman") # with sediment depth
## Classification rates tend to plateau using only a fraction of the class’s OTUs...
.extract_svm_opt_acc <- function(svmobj, n = 11) {
if (n > 11) stop("n must be <= 11!")
out <- as.data.frame(matrix(NA, ncol = n, nrow = 2 * nrow(svmobj[[1]]$tperm)))
for (i in seq(n)) {
tmp <- svmobj[[i]]
out[, i] <- rbind(tmp$tperm[tmp$opt_otus], tmp$pperm[tmp$opt_otus])
}
names(out) <- names(svmobj)[1:n]
out$Permuted <- c(rep("Base", nrow(svmobj[[1]]$tperm)),
rep("Permuted", nrow(svmobj[[1]]$pperm)))
out
}
svmclass$rclass_quart_opt_acc <- .extract_svm_opt_acc(svmclass$rclass_quart, n = 10)
tmp <- svmclass$rclass_quart_opt_acc[1:128, 1:10]
tmp <- apply(tmp, 2, as.numeric)
tmp <- tmp / 46
# tmpcls will be used further for subsequent tests, do not delete.
tmpcls <- data.frame(taxon = dimnames(tmp)[[2]],
median = apply(tmp, 2, median))
e <- c(unlist(lapply(svmclass$rclass_quart, function(x) x$opt_otus)))
tmpcls$opt_OTUs <- e*20 - 10
# test
cor.test(tmpcls$opt_OTUs, tmpcls$median, method = "spearman")
## Median classification rate at the plateau is not significantly
# correlated with the relative abundance
tmpcls$csums <- colSums(((det$class/rowSums(det$class))*(1/1.83))[, det$classorder[c(1:5, 7:11)]])
cor.test(tmpcls$csums, tmpcls$median, method = "spearman") # test
## Moreover, classification rate is not connected to the
# fraction of OTUs needed to reach the plateau
cor.test(tmpcls$opt_OTUs, tmpcls$median, method = "spearman")
## Classification accuracy (how often a specific sediment sample is assigned
# correctly) is strongly correlated with oxygen category
# see comment to line 224 above.
## mean accuracy decreases significantly from categories 1-4 to 5 and 6 and then to 7...
tmp <- rbind(btmp$rank_abundance,
ttmp$rank_abundance,
qtmp$rank_abundance
) %>%
select(-answer) %>%
group_by(sample) %>%
summarise_at(vars(fraction), mean) %>%
as.data.frame
tmp$sample <- as.character(tmp$sample)
tmp$core <- NA
tmp$oxy_cat <- NA
tmp$oxy_cons <- NA
tmp$depth <- NA
for (i in seq(nrow(tmp))) {
tmp$core[i] <- paste(strsplit(tmp$sample[i], split = "_")[[1]][1],
strsplit(tmp$sample[i], split = "_")[[1]][2], sep = "_")
tmp$oxy_cat[i] <- det$rmap$oxy_cat[which(det$rmap$rowname == tmp$sample[i])]
tmp$oxy_cons[i] <- det$rmap$oxy_cons[which(det$rmap$rowname == tmp$sample[i])]
tmp$depth[i] <- det$rmap$depth[which(det$rmap$rowname == tmp$sample[i])]
}
tmp <- tmp[order(tmp$fraction, decreasing = T), ]
wilcox.test(tmp$fraction[tmp$oxy_cat %in% 1:4],
tmp$fraction[tmp$oxy_cat %in% 5:6], alternative = "greater") # mean accuracy in 1-4 > than in 5-6
wilcox.test(tmp$fraction[tmp$oxy_cat %in% 5:6],
tmp$fraction[tmp$oxy_cat == 7], alternative = "greater") # 5-6 > 7
## While approximately 63% of the samples in categories 1-4 (>25μM)
# are correctly classified at least 80% of the time...
sum(table(tmp$oxy_cat[tmp$fraction > 0.9])[1:4]) / sum(table(tmp$oxy_cat)[1:4])
mean(tmp$fraction[tmp$oxy_cat == 7])
## only in three of 11 cores did organic carbon content correlate significantly with depth...
TOC_tmp <- .read_excel_allsheets("data/AMOR_CN_profiles.xlsx")
cor.test(TOC_tmp[["GS14_GC02"]]$depth_CN, TOC_tmp[["GS14_GC02"]]$organic_carbon, method = "spearman") # neg insignificant
cor.test(TOC_tmp[["GS14_GC08"]]$depth_CN, TOC_tmp[["GS14_GC08"]]$organic_carbon, method = "spearman") # pos insignificant
cor.test(TOC_tmp[["GS14_GC09"]]$depth_CN, TOC_tmp[["GS14_GC09"]]$organic_carbon, method = "spearman") # pos insignificant
cor.test(TOC_tmp[["GS14_GC12"]]$depth_CN, TOC_tmp[["GS14_GC12"]]$organic_carbon, method = "spearman") # neg significant (0.03)
cor.test(TOC_tmp[["GS15_GC01"]]$depth_CN, TOC_tmp[["GS15_GC01"]]$organic_carbon, method = "spearman") # pos insignificant
cor.test(TOC_tmp[["GS16_GC04"]]$depth_CN, TOC_tmp[["GS16_GC04"]]$organic_carbon, method = "spearman") # neg insignificant
cor.test(TOC_tmp[["GS16_GC05"]]$depth_CN, TOC_tmp[["GS16_GC05"]]$organic_carbon, method = "spearman") # neg significant (0.03)
cor.test(TOC_tmp[["GS16_GC06"]]$depth_CN, TOC_tmp[["GS16_GC06"]]$organic_carbon, method = "spearman") # neg significant (0.03)
cor.test(TOC_tmp[["GS17_GC02"]]$depth_CN, TOC_tmp[["GS17_GC02"]]$organic_carbon, method = "spearman") # no data
cor.test(TOC_tmp[["GS17_GC04"]]$depth_CN, TOC_tmp[["GS17_GC04"]]$organic_carbon, method = "spearman") # no data
cor.test(TOC_tmp[["GS17_GC05"]]$depth_CN, TOC_tmp[["GS17_GC05"]]$organic_carbon, method = "spearman") # no data
## 167 OTUs (3.8% of all) are never differentially expressed across either the 1-6 or 1-7 category pairs...
(ncol(det$rOTU) -
length(unique(c(det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Pair"],
det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Partner"])))) /
ncol(det$rOTU)
## These OTUs are highly abundant and diverse, accounting for 41% of all reads and spanning
# 18 phyla and 44 classes.
# 1 - sum of all differentially expressed OTUs
1 - sum(det$rOTU[, unique(c(det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Pair"],
det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Partner"]))]) / sum(det$rOTU)
# taxonomic representation
# class
length(unique(det$rotu_tax[which(!rownames(det$rotu_tax) %in%
unique(c(det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Pair"],
det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Partner"]))), "class"]))
# phylum
length(unique(det$rotu_tax[which(!rownames(det$rotu_tax) %in%
unique(c(det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Pair"],
det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Partner"]))), "phylum"]))
## 71.4% of all reads, and 24 of 169 OTUs, assigned to Nitrosopumilales were not differentially expressed...
rownames(det$rotu_tax)[det$rotu_tax$order == "Ca. Nitrosopumilales"] # 169 OTUs
# 24 OTUs
table(det$rotu_tax[which(!rownames(det$rotu_tax) %in%
unique(c(det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Pair"],
det$prpd_oxy_cat$long[det$prpd_oxy_cat$long$name %in% c("6_1", "7_1"), "Partner"]))), "class"])
## all 24 OTUs not differentially expressed belong to the Nitrosopumilaceae family...
# see data/arb-silva.de_align_resultlist_948641_nitrosopumilus.csv
## We find that instead the vast majority of reads ascribed to Nitrosopumilales (95.2%)
# are differentially expressed along the length of the nitrate gradient...
NO3 <- list()
NO3$cmp <- .assign_gc_cons_id(.read_excel_allsheets("data/no3.xlsx", add_long = F),
species_name = "NO3", species_depth = "depth",
micro_map = cmp$map, zero_threshold = 0.0001, add_long = T)
# change names of long table as well as category vaules
NO3$cmp$long$NO3_cat <- 0
names(NO3$cmp$long)[2:3] <- c("NO3", "NO3_cat")
for (i in seq(nrow(NO3$cmp$long))) {
if (is.na(NO3$cmp$long$NO3[i])) NO3$cmp$long$NO3[i] = 8
if (NO3$cmp$long$NO3[i] > 0.03) NO3$cmp$long$NO3_cat[i] = 1
else if (NO3$cmp$long$NO3[i] > 0.025) NO3$cmp$long$NO3_cat[i] = 2
else if (NO3$cmp$long$NO3[i] > 0.02) NO3$cmp$long$NO3_cat[i] = 3
else if (NO3$cmp$long$NO3[i] > 0.015) NO3$cmp$long$NO3_cat[i] = 4
else if (NO3$cmp$long$NO3[i] > 0.01) NO3$cmp$long$NO3_cat[i] = 5
else if (NO3$cmp$long$NO3[i] > 0.005) NO3$cmp$long$NO3_cat[i] = 6
else NO3$cmp$long$NO3_cat[i] = 7
}
# perform propd analysis
NO3$det <- NO3$cmp$long[which(rownames(NO3$cmp$long) %in% rownames(det$rOTU)), ]
# Run propd analysis on cmp set
NO3$prpd <- pairwise_propd(det$rOTU_cmR,
conditions = num2char(NO3$det$NO3_cat),
propd_thetaval = 0.50,
mine_prp = T)
NO3$prpd <- .prpd_name_to_column(NO3$prpd)
NO3$prpd <- .prpd_reduce_union(NO3$prpd)
NO3$prpd$long <- NO3$prpd$long[which(!is.na(NO3$prpd$long$name)), ]
NO3$prpd$long$name <- unwind_tuple(NO3$prpd$long$name, NO3$cmp$long$NO3_cat)
NO3$prpd$long <- top_n_propd_pairs_batch(NO3$prpd$long, n = .25)
# check percentages
tmp <- det$rotu_tax[which(!rownames(det$rotu_tax) %in%
unique(c(NO3$prpd$long[NO3$prpd$long$name %in% c("1_6", "1_7"), "Pair"],
NO3$prpd$long[NO3$prpd$long$name %in% c("1_6", "1_7"), "Partner"]))), 1:9]
sum(det$rOTU[, c(rownames(tmp)[tmp$class == "Nitrososphaeria"])]) / sum(det$class$Nitrososphaeria) # 24.7%