-
Notifications
You must be signed in to change notification settings - Fork 0
/
607_wk5_project.R
134 lines (102 loc) · 4.55 KB
/
607_wk5_project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
file <-0
file <- read.table(file = "/Users/alexandersatz/Documents/Cuny/IS607/week5/ks_bioactivity.txt", quote = "", fill = TRUE, sep = "\t", header = TRUE, stringsAsFactors = TRUE)
nrow(file)
head(file)
## have cmpds with greater potency in any assay then been tested in a larger number of distinct assays?
bioact.1 <-tbl_df(file)
## there are 1447 activity types, which is a lot
summarise(bioact.1,
numberdatatype = n_distinct(ACTIVITY_TYPE))
## we see below that we have things on a log scale and - log scale
activity.type <- group_by(bioact.1, ACTIVITY_TYPE)
types.df <-summarise(activity.type,
number = n())
head(types.df,100)
## We also have different units of activity (non log scale)
activity.units <- group_by(bioact.1, STANDARD_UNIT)
units.df <-summarise(activity.units,
number = n())
units.df <- data.frame(units.df)
units.df
## this subset contains all the 'log' values and converts to -log if wasn't already
bioact.log <- filter(bioact.1, grepl("log", ACTIVITY_TYPE, ignore.case = TRUE))
bioact.log <- filter(bioact.log, ! grepl("log2", ACTIVITY_TYPE, ignore.case = TRUE))
bioact.log <- filter(bioact.log, ! grepl("logp", ACTIVITY_TYPE, ignore.case = TRUE))
bioact.log <- filter(bioact.log, ! grepl("logd", ACTIVITY_TYPE, ignore.case = TRUE))
bioact.log <- filter(bioact.log, ! grepl("GI50", ACTIVITY_TYPE, ignore.case = TRUE))
bioact.log
bioact.logged1 <- mutate(bioact.log,
LOG.ACT = as.integer(abs(STANDARD_VALUE)))
bioact.logged1 <-arrange(bioact.logged1, desc(LOG.ACT))
bioact.logged1 ## looks good, still has NA values
## the data set has NA values in the STANDARD_VALUE column, even though there is an activity type given?
## I assume this means the assay was run, but no vlue could be determined for whatever reason. I will
## keep these values in, but assign them a 'potency' of zero
bioact.na <- (bioact.logged1[is.na(bioact.logged1$LOG.ACT),])
bioact.na
## this subset takes all the values that are not a log
bioact.notlog <-filter(bioact.1, ! agrepl("log", ACTIVITY_TYPE, ignore.case = TRUE))
bioact.notlog
## the 'notlog' values need to be adjusted if they are nM or uM in scale.
bioact.loguM <-bioact.notlog %>%
filter(grepl("um", STANDARD_UNIT, ignore.case = TRUE)) %>%
mutate(LOG.ACT = as.integer(-log10((STANDARD_VALUE/1000000))))
bioact.loguM <-arrange(bioact.loguM, desc(LOG.ACT))
bioact.loguM$LOG.ACT ## looks great and values range from 4-6 mainly, so the right range.
bioact.lognM <-bioact.notlog %>%
filter(grepl("nm", STANDARD_UNIT, ignore.case = TRUE)) %>%
mutate(LOG.ACT = as.integer(-log10((STANDARD_VALUE/1000000000))))
bioact.lognM <-arrange(bioact.lognM, desc(LOG.ACT))
bioact.lognM$LOG.ACT ## looks good
#combine all 3 dataframes
biact2 <- rbind(bioact.logged1, bioact.loguM, bioact.lognM)
biact2 ## ~11,000 entries now
biact2 <- select(biact2, COMPOUND_ID, STANDARD_UNIT, STANDARD_VALUE, LOG.ACT, DOM_ID, NAME)
biact2 <-arrange(biact2, desc(LOG.ACT))
head(biact2, 10) ## the log values match expectations based on units and values!
## remove NA value
biact3 <- biact2[!is.na(biact2$LOG.ACT),]
biact3
#Now ready to group by cmpd ID, determine max potency value, and count number of assays
biact.cmp <- group_by(biact3, COMPOUND_ID)
bioact.sum <- summarize(biact.cmp,
max.pot = max(LOG.ACT),
tot.Assays = n(),
uniq.Assays = n_distinct(NAME)
)
bioact.sum
bioact.sum <-arrange(bioact.sum, desc(max.pot))
bioact.sum
## first we plot max.pot versus uniq Assays
p <-ggvis(bioact.sum, x = ~max.pot, y = ~uniq.Assays)
layer_points(p)
ggplot(data=bioact.sum, aes(x=max.pot, y=uniq.Assays, group=1)) + geom_point()
bioact.sum %>%
ggvis(~max.pot, ~uniq.Assays,
opacity := input_slider(0,1)) %>%
layer_smooths() %>%
layer_points()
## then the tot # of assays
bioact.sum %>%
ggvis(~max.pot, ~tot.Assays,
opacity := input_slider(0,1)) %>%
layer_smooths() %>%
layer_points()
biact.group <- group_by(bioact.sum, max.pot)
## looking at max potency verus uniq assays
uniq.assays<- summarise(biact.group,
v.avg = mean(uniq.Assays),
v.med = median(uniq.Assays))
uniq.assays %>%
ggvis(~max.pot, ~v.med) %>%
layer_smooths() %>%
layer_points()
## looking at max potency versus total # assays
total.assays<- summarise(biact.group,
v.avg = mean(tot.Assays),
v.med = median(tot.Assays))
total.assays
total.assays %>%
ggvis(~max.pot, ~v.med) %>%
layer_smooths() %>%
layer_points()