-
Notifications
You must be signed in to change notification settings - Fork 0
/
K2_final.Rmd
304 lines (248 loc) · 11 KB
/
K2_final.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
---
jupyter:
jupytext:
text_representation:
extension: .Rmd
format_name: rmarkdown
format_version: '1.2'
jupytext_version: 1.3.2
kernelspec:
display_name: R
language: R
name: ir
---
```{r}
#Install needed packages
#install.packages("docstring")
```
```{r}
library(docstring)
```
```{r}
# Sample dataset D
# (x1, x2, x3) are binary variables
x1 <- c(1, 1, 0, 1, 0, 0, 1, 0, 1, 0)
x2 <- c(0, 1, 0, 1, 0, 1, 1, 0, 1, 0)
x3 <- c(0, 1, 1, 1, 0, 1, 1, 0, 1, 0)
df <- data.frame(x1, x2, x3)
```
```{r}
log.fact <- function(m, r){
#' Precompute log of factorials from 0 to m+r-1
#'
#' `log.fact` returns an array `a` of m+r elements, such that `a[i+1] = log(i!)`
#'
#' @param m Integer
#' @param r Integer
#' @return Numeric vector of size `m+r`, such that the (i+1)-th element contains log(i!).
#'
#' @examples
#' log.fact(2, 3)
fact <- log( c(1, 1:(m+r-1)) )
for (i in 4:length(fact)) { #0! = 1, 1! = 1, 2! = 2, so they are already correct
fact[i] <- fact[i] + fact[i-1]
}
return(fact)
}
```
The following function computes:
$$ \log f(i, \pi_i) = \sum_{j=1}^{q_i} \Big[ \log (r_i - 1)! - \log (N_{ij} + r_i - 1)! + \sum_{k=1}^{r_i} \log N_{ijk}! \Big]$$
where:
- $r_i$ is the number of values that the $i$-th variable ($x_i$) can assume (e.g. $x_i \in \{0,1\} \Rightarrow r_i = 2$
- $\mathbf{\pi_i}$ contains the indices of the parents of $i$ (e.g. if $x_2, x_3 \rightarrow x_i$ in the DAG, then $\mathbf{\pi_i} = (1,2)$)
- $q_i$ is the number of possible values that the variables in $\mathbf{\pi_i}$ can take. For example, if $\mathbf{\pi_i} = (1,2)$, and both $x_1$ and $x_2$ are binary, then $q_i = 2^2$, since $(x_1, x_2) \in \{(0,0), (0,1), (1,0), (1,1)\}$. Treating the values of $x_j \in \mathbf{\pi_i}$ as digits of a binary number (or, in general, of a number in base $r = \max_j(r_j)$) defines a natural ordering for the possible values of $\mathbf{\pi_i}$.
- $N_{ijk}$ is the number of cases in the database $D$ where $x_i$ assumes its $k$-th value, and $\mathbf{\pi_i}$ its $j$-th value.
- $N_{ij} = \sum_{k=1}^{r_i} N_{ijk}$
```{r}
f <- function(i, i.parents, database, r, factorials) {
#' Computes the function f
#'
#' Returns the joint (log)probability that node i has parents i.parents, and that this bayesian network
#' has produced all the cases observed in database.
#'
#' @param i Integer. Represents a node.
#' @param i.parents Vector of integers. Parents of i. Should not contain i itself.
#' @param database data.frame of size (m,n) with all integer values between 0 and r-1. Observed cases.
#' @param r Integer. Number of possible values a single observation in database can assume.
#' @param factorials Numeric vector. Precomputed log(factorial(i)) for i from 0 to m+r-1 (see log.fact function)
#' @return Numeric. Probability that i has i.parents as parents, and that this structure results
#' in the observed database.
parents.length <- length(i.parents)
m <- nrow(database)
#From each row of database we compute the indices j and k
#k: value of the i-th variable (database[i]) + 1
#+1 is needed because indices in R start from 1, and database contains values 0:r-1
index.k <- database[i] + 1
#j: take values of database[i.parents], and treat them as digits of a number in base r
#(then add 1 to account for R indices)
todec <- r ** (0:max(0,parents.length-1))
if (parents.length > 0)
index.j <- as.matrix(database[i.parents])%*%todec + 1
else
index.j <- rep(1,m)
#Gather the indices in a matrix
indexes <- cbind(index.j, index.k)
#Sort
indexes <- indexes[ order(indexes[,1], indexes[,2]), ]
#Now indexes is a sequence of "ordered blocks of rows", e.g.
# indexes <- [[1, 1], [1, 2], [1, 2], [2, 1], [2, 2]]
#We want to count the length of each block (i.e. number of "repetitions" of each row)
#By taking the differences between each row and the previous one:
d <- abs( diff( as.matrix(indexes) ) )
#the only rows with non-zero elements are the ones at the "boundaries" of blocks
#The example from before leads to:
#d <- [[0, 1], [0, 0], [1, 1], [0, 1]]
#Sum columns to reduce the matrix to a vector:
d1 <- d[,1] + d[,2]
#d1 <- [1, 0, 2, 1]
#The unique rows are the ones with non-zero values + the last one
idx.unique <- indexes[c(which(d1>0), length(d1)+1), ]
#To compute repetitions, we add an element at the boundaries (start and end)
#and take the differences of the indices of non-zero values
N_ijk <- diff( c(0, which(d1>0), length(d1)+1) )
#Gather everything in a data.frame
N <- data.frame('Row'=idx.unique[,1], 'Col'=idx.unique[,2], 'Value'=N_ijk)
#print(N)
#Sum the elements of each row to compute N_{ij}
Nj <- aggregate(.~Row,data=N,sum)[,c(3)]
#Compute factorials
N$Value <- factorials[N$Value+1]
logN <- aggregate(.~Row,data=N,sum)[,c(3)]
#Compute the probability
factor.first <- factorials[r] #r_i-1+1
factor.second <- factorials[Nj+r] #+r_i -1 +1 (because of the indices)
factor.third <- logN
result <- sum(factor.first - factor.second + factor.third)
return(result)
}
```
```{r}
# K2 algorithm implementation
K2 <- function(database, u, ordering) {
#' Implementation of K2 algorithm as specified by
#' G. F. Cooper, E. Herskovits '"A Bayesian Method for the Induction of Probabilistic Networks from Data"
#'
#' Given a `database` of cases each containing n observations, and an `ordering` of the n variables,
#' the algorithm finds the bayesian network structure with the (approximately) maximum probability
#' of generating the observed data, by using at most `u` parents for each node i, which are chosen
#' only between the nodes j preceding i in the ordering.
#'
#' @param database data.frame of size (m, n). Dataset containing m observations of n variables, with no missing values.
#' All values must be integers between 0 and r-1.
#' @param u Integer. Maximum number of parents for each node.
#' @param ordering Vector of size n OR list of n1 vectors of total size n
#' If vector must contain a valid permutation of the integers [1, 2 ... n], representing the "prior" ordering of variables,
#' such that the variable with index ordering[i] can have as possible parents only variables with index ordering[j] with j < i
#' (i.e. variables that "appear before it" in the provided ordering).
#' If list must contains the layers of the networks, representing the "prior" ordering of variables,
#' such that the variable with index ordering[[i]] can have as possible parents only variables in the previous layers, and so
#' with index ordering[[j]] with j<i. Notice that n1<=n, and if it is equal we come back to the previous case.
#' @return matrix of size (n, n)
#' Adjacency matrix adj of the most probable Directed Acyclic Graph found given the evidence in the database.
#' adj[i,j] is 1 if there is a connection from node j to node i (i.e. if j is a parent of i)
m <- nrow(database)
n <- ncol(database)
r <- max(database) + 1
factorials <- log.fact(m, r)
adj <- matrix(data = 0, nrow = n, ncol = n)
debug <- TRUE
for (i in 1:n) {
i.parents <- c()
log.p.old <- f(i, i.parents, database, r, factorials)
#log-Probability of a structure with i disconnected
if (debug) cat('x', i, ' [], log(p) = ', log.p.old, '\n')
OKToProceed <- TRUE
#Determine which nodes can be parents of i
if (class(ordering)=='list'){ # Layer ordering, each element of the list is a layer
for (h in 1:length(ordering)){
if (i %in% ordering[[h]] ){
i.pos <- h
break
}
}
if( i.pos == 1){
next
} else {
i.parents.candidates <- unlist( ordering[1:i.pos-1] )
}
} else { # Normal ordering, the input is a vector
i.pos <- which(ordering == i) #Position of i in the ordering. Only variables before this position can be parents of i
if (i.pos == 1) { #If i is at the start of the ordering, there are no parents to check
next
} else {
i.parents.candidates <- ordering[1:i.pos-1]
}
}
#Add at most u nodes as parents of i,
#only if each one of them increases the structure's probability
while (OKToProceed & length(i.parents) < u) {
log.p.new <- -Inf
newparent.index <- NA
#Select the node from the candidate list that maximizes the structure's probability (greedy algorithm)
for (candidate in i.parents.candidates) {
log.p.candidate <- f(i, c(i.parents, candidate), database, r, factorials)
if (debug) cat('x', i, ' [', i.parents, ', +', candidate, '+], log(p) = ', log.p.candidate, '\n')
if (log.p.candidate > log.p.new) {
log.p.new <- log.p.candidate
newparent.index <- candidate
}
}
#Accept the new parent candidate only if it increases the total structure's probability
if (log.p.new > log.p.old) {
log.p.old <- log.p.new
i.parents <- c(i.parents, newparent.index)
if (debug) cat(newparent.index, ' -> ', i, '\n')
adj[i, newparent.index] = 1
#Remove the added parent from the candidates list
i.parents.candidates <- i.parents.candidates[-newparent.index]
if (length(i.parents.candidates) == 0) { #If there are no more candidates to check, terminate
OKToProceed <- FALSE
if (debug) cat('\n---\n')
}
} else { #If probability does not increase, stop adding parents to i
OKToProceed <- FALSE
if (debug) cat('\n---\n')
}
}
}
return(adj)
}
```
```{r}
#Print the structure
structure <- K2(df, 2, c(1,2,3))
print(structure)
#Should be:
# [0, 0, 0]
# [1, 0, 0]
# [0, 1, 0]
for (i in 1:nrow(structure)) {
cat("Node ", i, " : [")
for (j in 1:ncol(structure)) {
if (structure[i,j]) {
cat(j)
}
}
cat("]\n")
}
```
**Expected output** (also checked by hand)
```text
x 1 [], log(p) = -7.927324
x 2 [], log(p) = -7.927324
x 2 [ , + 1 +], log(p) = -6.802395
1 -> 2
---
x 3 [], log(p) = -7.745003
x 3 [ , + 1 +], log(p) = -7.495542
x 3 [ , + 2 +], log(p) = -5.192957
2 -> 3
x 3 [ 2 , + 1 +], log(p) = -5.991465
---
[,1] [,2] [,3]
[1,] 0 0 0
[2,] 1 0 0
[3,] 0 1 0
Node 1 : []
Node 2 : [1]
Node 3 : [2]```