-
Notifications
You must be signed in to change notification settings - Fork 0
/
4_calculate_entropy.R
72 lines (53 loc) · 2.14 KB
/
4_calculate_entropy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
library(bio3d) # read.fasta
library(tidyverse)
setwd("/ix/djishnu/Aaron/scripts/snpropipeline/")
source("entropy_func.R")
# already calculated
if(F){
output_folder = file.path("/ix/djishnu/Aaron/data/fastas_clustal/clustal_output_50_iter/")
output_folder_files = list.files(output_folder)
entropy_list = list()
alignment_list = list()
# try with first one
for(file in output_folder_files) {
fsta = bio3d::read.fasta(paste0(output_folder, file))
en_val = entropy(fsta)
entropy_list = append(entropy_list, list(en_val))
alignment_list = append(alignment_list, list(fsta$ali))
# entropy_list = append(entropy_list, list("entropy" = entropy(fsta), "alignments" = fsta$ali))
}
saveRDS(entropy_list, "/ix/djishnu/Aaron/data/output_data/entropy_values.RDS")
saveRDS(alignment_list, "/ix/djishnu/Aaron/data/output_data/alignment_entropy_values.RDS")
}
entropy_list = readRDS("/ix/djishnu/Aaron/data/output_data/entropy_values.RDS")
mut_index = readRDS("/ix/djishnu/Aaron/data/output_data/mutant_sequence_index.RDS")
df = data.frame()
for(i in 1:length(alignment_list)){
index = i
seqnames = row.names(alignment_list[[i]])
goi = grep('^ENSP', seqnames)
if(length(goi > 0)){
for(j in 1:length(goi)){
parsed = separate(data.frame(seqnames[goi]), col = 1, into = c("ensembl_protein_id", "ensembl_transcript_id", "chr", "coord", "id"))
parsed$index = i
df = rbind(df, parsed)
}
}
}
df = df %>% drop_na()
df$chr = as.double(df$chr)
df$coord = as.double(df$coord)
# merge with residue start #s
df = df %>% left_join(mut_index, by = c("chr", "coord"))
H_annotated_SNPs = df %>% select(-id) %>% distinct()
# get H values at position in index of entropy file
H_annotated_SNPs$H = -1
for(i in 1:nrow(H_annotated_SNPs)){
ind = H_annotated_SNPs[i,]$index
Hvals = entropy_list[[ind]]$H
H_annotated_SNPs[i,"H"] = Hvals[H_annotated_SNPs[i,]$residue_start]
}
saveRDS(H_annotated_SNPs, "/ix/djishnu/Aaron/data/output_data/ENTROPY_VALS_FOR_SNPS.RDS")
# parse sequence names: fasta files are named with tx#_protein#_chr_pos_ind
# where index is ithe index in the mutations df
#look up in this table