This repository has been archived by the owner on Apr 30, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
page_ranker.py
208 lines (172 loc) · 6.18 KB
/
page_ranker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 14 20:43:21 2019
@author: Zargham
"""
import networkx as nx
import pandas as pd
import numpy as np
#defaults
# Small topology-independent self-loop weight; it seeds every node's
# 'total_wt' accumulator in wt_heuristic, guaranteeing total_wt > 0
# so the divisions in update_score never hit zero.
default_self_loop_wt= .001
def update_score(g, alpha, seed, lazy=False, lazy_wt=.5):
    """Perform one synchronous score-update sweep over every node of g.

    Each node keeps ``alpha`` weight on the seed vector and
    ``(1 - alpha)`` weight on mass arriving from its self-loop and from
    its in/out neighborhoods (one term per parallel edge).  When
    ``lazy`` is True a ``lazy_wt`` fraction of the previous score is
    retained unchanged (lazy random walk).  Scores are read from the
    'score' node attribute, written back in place, and the mutated
    graph is returned.
    """
    # A non-lazy walk zeroes the retention weight entirely.
    retain = lazy_wt * float(lazy)
    # Snapshot of all scores before the sweep, so updates within this
    # pass do not feed into each other.
    previous = nx.get_node_attributes(g, 'score')
    for node in g.nodes:
        loop_share = g.nodes[node]['self_wt'] / g.nodes[node]['total_wt']
        acc = (1 - alpha) * loop_share * previous[node] + alpha * seed[node]
        # Mass contributed via outbound neighbors.
        for other in g.nodes[node]['out_nbr']:
            for key in range(edge_count(g, node, other)):
                share = (g.edges[(node, other, key)]['out_weight']
                         / g.nodes[other]['total_wt'])
                acc = acc + (1 - alpha) * share * previous[other]
        # Mass contributed via inbound neighbors.
        for other in g.nodes[node]['in_nbr']:
            for key in range(edge_count(g, other, node)):
                share = (g.edges[(other, node, key)]['in_weight']
                         / g.nodes[other]['total_wt'])
                acc = acc + (1 - alpha) * share * previous[other]
        g.nodes[node]['score'] = retain * previous[node] + (1 - retain) * acc
    return g
#helper function
def edge_count(g, src, dst):
    """Count the parallel edges from src to dst in multigraph g.

    Edge keys are assumed to be consecutive integers starting at 0, as
    produced by networkx MultiDiGraph.add_edge with default keys.
    Returns 0 when no src->dst edge exists.
    """
    i = 0
    # Probe consecutive keys until lookup fails; a missing
    # (src, dst, key) triple raises KeyError from the edge view.
    # (Was a bare `except:`, which also swallowed KeyboardInterrupt
    # and masked unrelated bugs.)
    while True:
        try:
            g.edges[(src, dst, i)]
        except KeyError:
            return i
        i = i + 1
#tuples are (to_weight, from_weight)
# Per-edge-type weight pairs consumed by wt_heuristic: index 0 is
# scaled by the destination node's type weight to give 'in_weight',
# index 1 by the source node's type weight to give 'out_weight'.
default_edge_wt_by_type = {
'github/authors': (0.5,1),
'github/hasParent':(1,1/4),
'git/hasParent':(1,1/4),
'github/mentionsAuthor': (1,1/32),
'github/mergedAs':(.5,1),
'github/references':(1,1/16),
'github/reactsHeart':(2,1/32),
'github/reactsHooray':(4,1/32),
'github/reactsRocket':(1,0), #appears to be missing from current implementation
'github/reactsThumbsUp':(1,1/32)
}
# Per-node-type multipliers applied to both endpoints of every edge
# in wt_heuristic.
default_node_wt_by_type = {
'github/issue':2.0,
'github/repo':4.0,
'github/comment': 1.0,
'git/commit':2.0,
'github/user':1.0,
'github/bot':1.0,
'github/review': 1.0,
'github/pull': 4.0
}
def wt_heuristic(g,
                 node_wt_by_type=default_node_wt_by_type,
                 edge_wt_by_type=default_edge_wt_by_type,
                 self_loop_wt=default_self_loop_wt):
    """Annotate g in place with the weights update_score consumes.

    Adds per-edge 'in_weight'/'out_weight' (edge-type weight pair
    scaled by destination/source node-type weight), per-node neighbor
    sets ('all_nbr', 'in_nbr', 'out_nbr'), and per-node 'self_wt' /
    'total_wt' normalization terms.  Returns the mutated graph.
    """
    for e in g.edges:
        e_wts = edge_wt_by_type[g.edges[e]['type']]
        src_wt = node_wt_by_type[g.nodes[e[0]]['type']]
        dst_wt = node_wt_by_type[g.nodes[e[1]]['type']]
        g.edges[e]['in_weight'] = e_wts[0]*dst_wt
        g.edges[e]['out_weight'] = e_wts[1]*src_wt
    #create neighborhoods
    for n in g.nodes:
        g.nodes[n]['all_nbr'] = set(nx.all_neighbors(g, n))
        g.nodes[n]['in_nbr'] = set()
        g.nodes[n]['out_nbr'] = set()
        for nb in g.nodes[n]['all_nbr']:
            # A missing (u, v, 0) edge raises KeyError; a neighbor can
            # land in both sets when edges run in both directions.
            # (Was a bare `except:` that hid unrelated errors.)
            try:
                g.edges[(nb, n, 0)]
                g.nodes[n]['in_nbr'].add(nb)
            except KeyError:
                pass
            try:
                g.edges[(n, nb, 0)]
                g.nodes[n]['out_nbr'].add(nb)
            except KeyError:
                pass
    for n in g.nodes:
        self_wt = self_loop_wt
        g.nodes[n]['self_wt'] = self_wt
        total_wt = self_wt
        # NOTE(review): outbound edges contribute 'in_weight' and
        # inbound edges contribute 'out_weight' here, the mirror of
        # how update_score reads them. Preserved as-is -- confirm this
        # asymmetry is intentional.
        for nb in g.nodes[n]['out_nbr']:
            #outbound neighbor
            for e3 in range(edge_count(g, n, nb)):
                total_wt = total_wt + g.edges[(n, nb, e3)]['in_weight']
        for nb in g.nodes[n]['in_nbr']:
            #inbound neighbor
            for e3 in range(edge_count(g, nb, n)):
                total_wt = total_wt + g.edges[(nb, n, e3)]['out_weight']
        g.nodes[n]['total_wt'] = total_wt
    return g
def pageRanker(g,
               alpha,
               K,
               seed=None,
               initial_value=None,
               lazy=False,
               lazy_wt=.5,
               lazy_decay=True,
               self_loop_wt=default_self_loop_wt,
               node_wt_by_type=default_node_wt_by_type,
               edge_wt_by_type=default_edge_wt_by_type):
    """Run K iterations of the weighted PageRank variant over g.

    seed and initial_value default to the uniform distribution over
    nodes; when given they must be dicts keyed by node with
    non-negative values summing to 1 (validation is still a TODO).
    Returns (pr, df, g): the final scores as a numpy array indexed by
    node label, the score trajectory as a pandas DataFrame with one
    row per iteration (row 0 is the initial value), and the mutated
    graph.
    """
    #improve input verification for seed
    #must be dict keyed to nodes
    #with non-negative floating point values summing to 1
    # 'is None' (not '== None') is the correct identity check and
    # avoids surprises from types overloading __eq__.
    if seed is None:
        N = len(g.nodes)
        seed = {n: 1.0/N for n in g.nodes}
    #improve input verification for initial value
    if initial_value is None:
        initial_value = seed
    for n in g.nodes:
        g.nodes[n]['score'] = initial_value[n]
    g = wt_heuristic(g,
                     node_wt_by_type=node_wt_by_type,
                     edge_wt_by_type=edge_wt_by_type,
                     self_loop_wt=self_loop_wt)
    x_dict = {0: initial_value}
    for k in range(K):
        # When lazy_decay is set, the lazy retention weight shrinks by
        # a factor of (1 - k/(k+3)) as iterations progress.
        g = update_score(g,
                         alpha,
                         seed,
                         lazy,
                         lazy_wt*(1 - int(lazy_decay)*k/(k+3)))
        x_dict[k+1] = nx.get_node_attributes(g, 'score')
    #result in numpy array format
    # NOTE(review): assumes node labels are exactly the integers
    # 0..N-1; other labelings would raise KeyError here -- confirm.
    pr = np.array([g.nodes[n]['score'] for n in range(len(g.nodes))])
    #trajectory in pandas dataframe format
    df = pd.DataFrame(x_dict).T
    return pr, df, g