-
Notifications
You must be signed in to change notification settings - Fork 0
/
CS_BABY_BIOME_RGI_AMR_Bin_Abundance_estimation.R
executable file
·205 lines (169 loc) · 10.4 KB
/
CS_BABY_BIOME_RGI_AMR_Bin_Abundance_estimation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
################################################################################
##### CS Baby Biome: AMR gene abundance analysis in MAGs RGI-CARD2023
### Author(s):Asier Fernández-Pato
### Last updated: 1st February, 2024
################################################################################
#****************
# Load modules
#****************
library(tidyverse)
library(dplyr)
library(ggplot2)
# Set working directory
setwd("~/Desktop/PhD/Projects/CS Baby Biome/New_analysis/")
#****************
# Define functions
#****************
# Function to estimate the abundance of AMR features (genes, classes, mechanisms). Input:
# Abundance table: MAG abundance table (MAGs in rows and samples in columns)
# AM_count: AM feature count table (MAGs in rows and AM feature in columns)
calculate_AMR_abundance <- function(Bin_abundance, AM_count) {
# Initialize an empty data frame for the gene abundance table
abundance_table <- data.frame(matrix(ncol = ncol(Bin_abundance), nrow = ncol(AM_count)))
colnames(abundance_table) <- colnames(Bin_abundance)
rownames(abundance_table) <- colnames(AM_count)
# Generate a table with merged Bin abundances and AMR counts
merged_data <- merge(Bin_abundance, AM_count, by = 0, all.x = TRUE)
merged_data <- replace(merged_data, is.na(merged_data), 0)
rownames(merged_data) <- merged_data[, 1]
merged_data <- merged_data[, -1]
# Loop through samples and AMRs to estimate the abundances
for (sample in colnames(merged_data)[1:(ncol(Bin_abundance))]) {
Bins_present <- rownames(merged_data)[merged_data[, sample] != 0] # Bins present in the sample
abundance_Bins <- merged_data[merged_data[, sample] != 0, sample] # the abundance of Bins present in the sample
for (AMR in colnames(merged_data)[(ncol(Bin_abundance) + 1):ncol(merged_data)]) {
Bins_AMR <- rownames(merged_data)[merged_data[, AMR] != 0] # Bins that have the AMR
counts_AMR_Bins <- merged_data[merged_data[, AMR] != 0, AMR] # counts of AMRs the Bins
# Select Bins present in the sample that have the AMR gene / class
Bins <- intersect(Bins_AMR, Bins_present) # Bins with the AMR present in the sample
counts <- as.numeric(counts_AMR_Bins[Bins_AMR %in% Bins_present]) # the AMR counts of the Bins present in the sample
abundance <- as.numeric(abundance_Bins[match(Bins, Bins_present)]) # the abundance of the Bins with the AMR present in the sample
gene_abundance <- ifelse(length(counts) > 0 && length(abundance) > 0, sum(counts * abundance), 0)
# Add AMR abundance value to the corresponding sample column
abundance_table[AMR, sample] <- gene_abundance
}
}
return(abundance_table)
}
calculate_AMR_counts <- function(Bin_abundance, AM_count) {
# Initialize an empty data frame for the count table
count_table <- data.frame(matrix(ncol = ncol(Bin_abundance), nrow = ncol(AM_count)))
colnames(count_table) <- colnames(Bin_abundance)
rownames(count_table) <- colnames(AM_count)
# Generate a table with merged Bin abundances and AMR counts
merged_data <- merge(Bin_abundance, AM_count, by = 0, all.x = TRUE)
merged_data <- replace(merged_data, is.na(merged_data), 0)
rownames(merged_data) <- merged_data[, 1]
merged_data <- merged_data[, -1]
# Loop through samples and AMRs to estimate the abundances
for (sample in colnames(merged_data)[1:(ncol(Bin_abundance))]) {
Bins_present <- rownames(merged_data)[merged_data[, sample] != 0] # Bins present in the sample
abundance_Bins <- merged_data[merged_data[, sample] != 0, sample] # the abundance of Bins present in the sample
for (AMR in colnames(merged_data)[(ncol(Bin_abundance) + 1):ncol(merged_data)]) {
Bins_AMR <- rownames(merged_data)[merged_data[, AMR] != 0] # Bins that have the AMR
counts_AMR_Bins <- merged_data[merged_data[, AMR] != 0, AMR] # counts of AMRs the Bins
# Select Bins present in the sample that have the AMR gene / class
Bins <- intersect(Bins_AMR, Bins_present) # Bins with the AMR present in the sample
counts <- as.numeric(counts_AMR_Bins[Bins_AMR %in% Bins_present]) # the AMR counts of the Bins present in the sample
abundance <- as.numeric(abundance_Bins[match(Bins, Bins_present)]) # the abundance of the Bins with the AMR present in the sample
feature_count <- ifelse(length(counts) > 0 && length(abundance) > 0, sum(counts), 0)
# Add AMR feature count value to the corresponding sample column
count_table[AMR, sample] <- feature_count
}
}
return(count_table)
}
#****************
# Processing of CARD annotation with RGI
#****************
# Read CARD AMR annotation table
CARD_annotation <- read.delim("AMR_BACTERIA/B_APPROACH/Merged_AMR_annotation_strict.txt", sep = "\t", header = T, check.names = F)
CARD_annotation$Contig <- result <- sub("_[^_]+$", "", CARD_annotation$Contig)
# Generate tables with the counts of (per MAG):
# A) AMR Gene
# B) AMR Classes
# C) AMR Mechanisms
# D) Antibiotics/Antimicrobials (group those closely related: e.g. Colistin A and Colistin B)
AMR_Gene_count <- CARD_annotation %>%
count(Contig, Best_Hit_ARO) %>%
pivot_wider(names_from = Best_Hit_ARO, values_from = n, values_fill = 0) %>%
column_to_rownames(var = "Contig")
AMR_Class_count <- CARD_annotation %>%
filter(`Drug Class` != "") %>%
separate_rows(`Drug Class`, sep = "; ") %>%
group_by(`Drug Class`, Contig) %>%
summarise(n = sum(!is.na(`Drug Class`))) %>%
pivot_wider(names_from = `Drug Class`, values_from = n, values_fill = 0) %>%
column_to_rownames(var = "Contig")
AMR_Class_count$`streptogramin A antibiotic` <- NULL # a more general class already present
AMR_Class_count$`streptogramin B antibiotic` <- NULL
AMR_Mechanism_count <- CARD_annotation %>%
filter(`Resistance Mechanism` != "") %>%
separate_rows(`Resistance Mechanism`, sep = "; ") %>%
group_by(`Resistance Mechanism`, Contig) %>%
summarise(n = sum(!is.na(`Resistance Mechanism`))) %>%
pivot_wider(names_from = `Resistance Mechanism`, values_from = n, values_fill = 0) %>%
column_to_rownames(var = "Contig")
AM_count <- CARD_annotation %>%
filter(Antibiotic != "") %>%
separate_rows(Antibiotic, sep = "; ") %>%
group_by(Antibiotic, Contig) %>%
summarise(n = sum(!is.na(Antibiotic))) %>%
pivot_wider(names_from = Antibiotic, values_from = n, values_fill = 0) %>%
column_to_rownames(var = "Contig")
# Remove columns with a more general AB already present (bacitracin, gentamicin, polymyxin)
AM_count[c("bacitracin A", "bacitracin B", "bacitracin F",
'gentamicin B', 'gentamicin C', 'polymyxin B1',
'polymyxin B2', 'polymyxin B3', 'polymyxin B4')] <- NULL
#**************************************
# Estimate AM feature abundances and count tables
#**************************************
# Load bin abundance table and process it
Bin_abundance <- read.delim("AMR_BACTERIA/B_APPROACH/Merged_bin_abundances.txt", sep = "\t", header = T, check.names = F)
rownames(Bin_abundance) <- Bin_abundance$Genome
Bin_abundance$Genome <- NULL
colnames(Bin_abundance) <- sub("^(.*?)_.*", "\\1", colnames(Bin_abundance))
Bin_abundance_present <- Bin_abundance[rowSums(Bin_abundance) != 0, , drop = FALSE] # All 516 are present
# Estimate AMR abundances
AM_abundance <- calculate_AMR_abundance(Bin_abundance, AM_count) #117
AMR_Gene_abundance <- calculate_AMR_abundance(Bin_abundance, AMR_Gene_count) #188
AMR_Class_abundance <- calculate_AMR_abundance(Bin_abundance, AMR_Class_count) #30
AMR_Mechanism_abundance <- calculate_AMR_abundance(Bin_abundance, AMR_Mechanism_count) #6
# Estimate the AMR gene counts per sample
AM_presence <- calculate_AMR_counts(Bin_abundance, AM_count) #117
AMR_Gene_presence <- calculate_AMR_counts(Bin_abundance, AMR_Gene_count) #188
AMR_Class_presence <- calculate_AMR_counts(Bin_abundance, AMR_Class_count) #30
AMR_Mechanism_presence <- calculate_AMR_counts(Bin_abundance, AMR_Mechanism_count) #6
# Select only features with a prevalence > 5%
AM_abundance_prev_list <- rownames(AM_abundance)[rowSums(AM_abundance != 0) > 195*0.05] #111
AMR_Gene_abundance_prev_list <- rownames(AMR_Gene_abundance)[rowSums(AMR_Gene_abundance != 0) > 195*0.05] #163
AMR_Class_abundance_prev_list <- rownames(AMR_Class_abundance)[rowSums(AMR_Class_abundance != 0) > 195*0.05] #30
AMR_Mechanism_abundance_prev_list <- rownames(AMR_Mechanism_abundance)[rowSums(AMR_Mechanism_abundance != 0) > 195*0.05] #6
AM_abundance_prev <- AM_abundance[AM_abundance_prev_list,]
AMR_Gene_abundance_prev <- AMR_Gene_abundance[AMR_Gene_abundance_prev_list,]
AMR_Class_abundance_prev <- AMR_Class_abundance[AMR_Class_abundance_prev_list,]
AMR_Mechanism_abundance_prev <- AMR_Mechanism_abundance[AMR_Mechanism_abundance_prev_list,]
AM_presence_prev <- AM_presence[AM_abundance_prev_list,]
AMR_Gene_presence_prev <- AMR_Gene_presence[AMR_Gene_abundance_prev_list,]
AMR_Class_presence_prev <- AMR_Class_presence[AMR_Class_abundance_prev_list,]
AMR_Mechanism_presence_prev <- AMR_Mechanism_presence[AMR_Mechanism_abundance_prev_list,]
#****************
# Save output
#****************
# Save AMR abundance and count tables
write.table(AM_abundance,"AMR_BACTERIA/B_APPROACH/ABUNDANCE TABLES/CS_Baby_Biome_AM_Abundance_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)
write.table(AMR_Gene_abundance,"AMR_BACTERIA/B_APPROACH/ABUNDANCE TABLES/CS_Baby_Biome_AMR_Gene_Abundance_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)
write.table(AMR_Class_abundance,"AMR_BACTERIA/B_APPROACH/ABUNDANCE TABLES/CS_Baby_Biome_AMR_Class_Abundance_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)
write.table(AMR_Mechanism_abundance,"AMR_BACTERIA/B_APPROACH/ABUNDANCE TABLES/CS_Baby_Biome_AMR_Mechanism_Abundance_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)
write.table(AM_presence,"AMR_BACTERIA/B_APPROACH/COUNT TABLES/CS_Baby_Biome_AM_Count_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)
write.table(AMR_Gene_presence,"AMR_BACTERIA/B_APPROACH/COUNT TABLES/CS_Baby_Biome_AMR_Gene_Count_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)
write.table(AMR_Class_presence,"AMR_BACTERIA/B_APPROACH/COUNT TABLES/CS_Baby_Biome_AMR_Class_Count_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)
write.table(AMR_Mechanism_presence,"AMR_BACTERIA/B_APPROACH/COUNT TABLES/CS_Baby_Biome_AMR_Mechanism_Count_Table.txt", sep = "\t",
row.names = T, col.names = T, quote = FALSE)