-
Notifications
You must be signed in to change notification settings - Fork 0
/
ATMKO_HSNpt_select_129_strain.Rmd
136 lines (103 loc) · 4.34 KB
/
ATMKO_HSNpt_select_129_strain.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
---
title: "Compare 129 strains"
author: "Daniel M. Gatti, Ph.D."
date: "8/23/2021"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(VariantAnnotation)
library(tidyverse)
base_dir = '/media/dmgatti/data1/ColoState'
geno_dir = file.path(base_dir, 'data', 'genotypes', 'AT_modifier', 'CSU_Garcia_MURGIGV01_20180510')
geno_file = file.path(geno_dir, 'CSU_Garcia_MURGIGV01_20180510_FinalReport.zip')
results_dir = file.path(base_dir, 'results')
muga_dir = '/media/dmgatti/data0/MUGA'
marker_file = file.path(muga_dir, 'gm_uwisc_v1.csv')
sanger_dir = '/media/dmgatti/data0/Sanger'
sanger_file = file.path(sanger_dir, 'mgp_REL2005_snps_indels.vcf.gz')
```
The goal of this document is to idenfity the 192 strain in the Sanger Mouse Genomes data that has a genotype most similar to the 129S6/SvEvTac-Atmtm1Awb/J strain used to create the ATM KO HS/Npt cross. I will compare the GigaMUGA SNPs with SNPs in the Sanger Mouse Genomes and select the strain with the highest similarity.
Read in the Gigamuga markers.
```{r read_markers}
markers = read_csv(marker_file) %>%
select(marker:bp_mm10)
```
Read in the CSU genotypes.
```{r read_genotypes}
geno = read_delim(file = geno_file, delim = '\t', skip = 9) %>%
select(`SNP Name`:`Allele2 - Forward`) %>%
unite(gt , `Allele1 - Forward`:`Allele2 - Forward`, sep = '') %>%
pivot_wider(names_from = `Sample ID`, values_from = gt) %>%
rename(marker = `SNP Name`) %>%
mutate(both_equal = `129SVE-M` == `129SVE-F`)
```
Keep the markers where the two samples agree.
```{r}
geno = geno %>%
filter(both_equal) %>%
select(-both_equal)
```
Join the markers to the genotypes and keep autosomes and Chr X.
```{r join_marker_geno}
geno = right_join(markers, geno, by = 'marker') %>%
filter(chr %in% c(1:19, 'X'))
```
Get the header information for the Sanger file and select samples.
```{r prepare_vcf}
header = scanVcfHeader(sanger_file)
samples = samples(header)
samples = samples[grep('^129', samples)]
```
For each marker on the Gigamuga, get the Sanger SNPs for the three 129 strains.
```{r get_sanger}
strain_comp = NULL
unique_chr = unique(sanger$chr)
for(chr in unique_chr) {
print(paste('CHR', chr))
gr = GRanges(seqnames = chr, ranges = IRanges(start = 0, end = 200e6))
param = ScanVcfParam(geno = 'GT', samples = samples, which = gr)
vcf = readVcf(file = sanger_file, geno = 'mm10', param = param)
vcf = subset(vcf, info(vcf)$INDEL == FALSE)
current_mkr = geno[geno$chr == chr, c('marker', 'chr', 'bp_mm10', '129SVE-F')]
current_gr = GRanges(seqnames = chr, IRanges(start = current_mkr$bp_mm10, width = 1),
marker = current_mkr$marker)
vcf = subsetByOverlaps(vcf, current_gr)
vcf = genotypeCodesToNucleotides(vcf)
gt = sub('/', '', VariantAnnotation::geno(vcf)$GT)
gt = data.frame(chr = seqnames(rowRanges(vcf)),
bp_mm10 = start(rowRanges(vcf)),
ref = as.character(rowRanges(vcf)$REF),
gt)
strain_comp = rbind(strain_comp,
merge(current_mkr, gt, by = c('chr', 'bp_mm10'), all = TRUE))
} # for(chr)
# Save the file.
saveRDS(strain_comp, file = file.path(results_dir, 'strain_comparison_129.rds'))
```
See which strain is most similar to the KO.
```{r compare_strains}
strain_comp %>%
select(-marker) %>%
rename_with(.fn = str_replace, pattern = '^X', replacement = '') %>%
pivot_longer(cols = `129P2_OlaHsd`:`129S5SvEvBrd`, names_to = 'strain') %>%
mutate(geno_eq = `129SVE-F` == value) %>%
group_by(strain) %>%
summarise(mean_sim = mean(geno_eq, na.rm = TRUE))
```
129S1/SvImJ seems to be the most similar. Look by chromosome.
```{r strain_sim_by_chr,fig.height=14,fig.width=8}
strain_comp %>%
select(-marker) %>%
filter(!is.na(ref)) %>%
rename_with(.fn = str_replace, pattern = '^X', replacement = '') %>%
pivot_longer(cols = `129P2_OlaHsd`:`129S5SvEvBrd`, names_to = 'strain') %>%
mutate(pos_mb = bp_mm10 * 1e-6,
geno_eq = `129SVE-F` == value,
geno_eq = as.numeric(geno_eq),
chr = factor(chr, levels = c(1:19, 'X'))) %>%
ggplot(aes(pos_mb, geno_eq)) +
geom_smooth(method = 'loess', span = 0.1) +
lims(y = c(0, 1.0)) +
facet_grid(chr~strain)
```