-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_clean_postprocess_data.R
164 lines (134 loc) · 6.67 KB
/
import_clean_postprocess_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Import, clean and post-process AMOR
# ====
# Import and do basic processing
# ====
cmp <- list(
otus = read.csv("data/all.otutab.nonchimeras.csv", row.names = 1),
tax = process_tax_file("data/otus.csv"),
contaminant_genera = read_ods("data/contaminant_genera.ods")
)
# Fix kingdom and superkingdom names
if(any(is.na(cmp$tax$kingdom))) {
cmp$tax$kingdom[is.na(cmp$tax$kingdom)] <- "No hits"
cmp$tax$superkingdom[which(stringr::str_count(cmp$tax$superkingdom, "size") == 1)] <- "No hits"
}
# Merge OTUs and taxonomy
cmp$all_otu_tax <- transform(merge(x = cmp$tax,
y = cmp$otus,
by = "row.names"),
row.names = Row.names, Row.names = NULL)
# Transpose so that samples are rows
cmp$otus <- as.data.frame(t(cmp$otus))
# ====
# Post-processing:
# Remove single occurrences, decontaminate and filter known contaminants
# ====
# 1. Remove singletons
cmp$otus <- cmp$otus[, which(colSums(cmp$otus) > 1)]
# 2. Decontaminate AMOR samples
cmp$decont_batch = c(1, # CB06 + CB03/06
rep(2, 3), # CB08 + CB05
3, # CB10
rep(4, 2), # CB11
rep(5, 3), # CB14
rep(6, 5), # CB15
rep(1, length(which(grepl("GS10_PC12", rownames(cmp$otus)) == 1))),
rep(1, length(which(grepl("GS13_CC02", rownames(cmp$otus)) == 1))),
rep(1, length(which(grepl("GS14_GC02", rownames(cmp$otus)) == 1))),
rep(2, length(which(grepl("GS14_GC04", rownames(cmp$otus)) == 1))),
rep(2, length(which(grepl("GS14_GC08", rownames(cmp$otus)) == 1))),
rep(2, length(which(grepl("GS14_GC09", rownames(cmp$otus)) == 1))),
rep(2, length(which(grepl("GS14_GC12", rownames(cmp$otus)) == 1))),
rep(1, length(which(grepl("GS14_GC14", rownames(cmp$otus)) == 1))),
rep(2, length(which(grepl("GS15_GC01", rownames(cmp$otus)) == 1))),
rep(3, length(which(grepl("GS16_GC04", rownames(cmp$otus)) == 1))),
rep(4, length(which(grepl("GS16_GC05", rownames(cmp$otus)) == 1))),
rep(4, length(which(grepl("GS16_GC06", rownames(cmp$otus)) == 1))),
rep(4, length(which(grepl("GS16_GC07", rownames(cmp$otus)) == 1))),
rep(5, length(which(grepl("GS17_GC02", rownames(cmp$otus)) == 1))),
rep(6, length(which(grepl("GS17_GC04", rownames(cmp$otus)) == 1))),
rep(5, length(which(grepl("GS17_GC05", rownames(cmp$otus)) == 1)))
)
cmp$decont_status = c(rep(TRUE, 15), rep(FALSE, 418))
# Identify contaminants
cmp$isContaminant <- cmp$otus[c(1:15, which(grepl("GS1", rownames(cmp$otus)))), ] %>%
remove_empty_columns %>%
as.matrix %>%
isContaminant(neg = cmp$decont_status, batch = cmp$decont_batch)
# Isolate contaminants
cmp$contaminants <- cmp$otus[c(1:15, which(grepl("GS1", rownames(cmp$otus)))), ] %>%
remove_empty_columns %>%
.[, cmp$isContaminant$contaminant]
# Remove contaminants from AMOR OTU table before addressing cross-talk
cmp$amor_decontaminated <- cmp$otus[c(1:15, which(grepl("GS1", rownames(cmp$otus)))), ] %>%
remove_empty_columns %>%
.[, !cmp$isContaminant$contaminant]
# Remove contaminants from full OTU table
cmp$otus <- cmp$amor_decontaminated
# Remove blank samples
cmp$otus <- cmp$otus[!grepl("blnk", rownames(cmp$otus)), ]
# 3. Check remaining non-contaminants against contaminant_genera.
# Conservative approach selected herein: Remove identified OTUs regardless.
cmp$otus <- cmp$otus[, !names(cmp$otus) %in%
rownames(cmp$otu_tax[which(cmp$otu_tax$genus %in% cmp$contaminant_genera$genus), ])]
# Update OTU and taxonomy table
cmp$postprocessed_otu_tax <- transform(merge(x = cmp$tax,
y = as.data.frame(t(cmp$otus)),
by = "row.names"),
row.names = Row.names, Row.names = NULL)
# ====
# Remove shallow libraries, GS14_GC14 and rare OTUs
# ====
# 1. Remove samples with fewer than 1000 reads.
cmp$otus <- cmp$otus[rowSums(cmp$otus) > 1000, ] %>% remove_empty_columns
# 2. Remove GS14_GC14 from the table
cmp$otus <- cmp$otus[-which(grepl("GS14_GC14", rownames(cmp$otus))) ,]
# 3. Remove all OTUs with fewer than 100 reads.
cmp$otus <- cmp$otus[, colSums(cmp$otus) >= 100]
cmp$otu_tax <- transform(merge(x = cmp$tax,
y = as.data.frame(t(cmp$otus)),
by = "row.names"),
row.names = Row.names, Row.names = NULL)
# ====
# Import (merge) and trim mapping files according to above sample removal
# ====
# 1. Import files
cmp$map <- read_ods(path = "data/mapping.ods") %>%
as.data.frame
# 2. Adjust mapping files according to OTU table and vice versa
# Remove GC14 from mapping and adjust core_id accordingly
cmp$map <- cmp$map[-which(cmp$map$core == "GC14"), ]
cmp$map$core_id[which(cmp$map$core_id > 8)] <- cmp$map$core_id[which(cmp$map$core_id > 8)] - 1
# add rownames column to map to avoid confusion between depth and id columns downstream
cmp$map$rowname <- paste(cmp$map$cruise, cmp$map$core, cmp$map$depth, sep = "_")
rownames(cmp$map) <- cmp$map$rowname
# Import and interpolate O2 profiles for all cores
# (function see: auxillary_functions/o2_concentration_helpers.R)
cmp$o2 <- .assign_o2_cons_id(.read_excel_allsheets("data/oxygen_profiles.xlsx"),
micro_map = cmp$map,
zero_threshold = 0.005,
add_long = T)
# Add oxy_cat and o2 to mapping directly if rownames are identical
if (identical(rownames(cmp$o2$long), cmp$map$rowname)) {
cmp$map$oxy_cons <- cmp$o2$long$o2
cmp$map$oxy_cat <- cmp$o2$long$oxy_cat
}
# 2.5 Change OTU table rownames according to mapping files
rn <- rownames(cmp$otus)
rownames(cmp$otus) <- seq(nrow(cmp$otus))
rntmp <- c()
for (i in seq(rownames(cmp$otus))) {
rownames(cmp$otus)[i] <-
cmp$map$rowname[which(paste(cmp$map$cruise, cmp$map$core, cmp$map$id, sep = "_") == rn[i])]
}
rm(rn)
# reorder GS17_GC02 samples after increasing depth (done manually):
cmp$otus <- cmp$otus[c(1:216, 267:277, 217:266, 278:387), ]
tmp <- cmp$otus
# 3. Concatenate and trim mapping files
cmp$map <- cmp$map[which(cmp$map$rowname %in% rownames(cmp$otus)), ]
# 4. Update full OTU table with new names
cmp$otu_tax <- transform(merge(x = cmp$tax,
y = as.data.frame(t(cmp$otus)),
by = "row.names"),
row.names = Row.names, Row.names = NULL)