-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_utils.py
101 lines (85 loc) · 3.93 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import torch
import torch.nn.functional as F
from torch_geometric.datasets import HeterophilousGraphDataset, WikiCS
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score
def load_fixed_splits(data_dir, dataset, name):
    """Load the predefined train/valid/test index splits for a dataset.

    Args:
        data_dir: Root directory that holds (or receives) the dataset files.
        dataset: Unused by this function; kept so existing callers keep working.
        name: Dataset identifier; selects which loading strategy is used.

    Returns:
        A list with one dict per available split. Each dict maps the keys
        'train', 'valid' and 'test' to a tensor of indices.

    Raises:
        NotImplementedError: If no fixed splits are defined for ``name``.
    """
    splits_lst = []

    if name in ['roman-empire', 'amazon-ratings', 'minesweeper', 'tolokers', 'questions']:
        torch_dataset = HeterophilousGraphDataset(name=name.capitalize(), root=data_dir)
        data = torch_dataset[0]
        # The masks are indexed as [:, i]: one column per predefined split.
        for i in range(data.train_mask.shape[1]):
            splits_lst.append({
                'train': torch.where(data.train_mask[:, i])[0],
                'valid': torch.where(data.val_mask[:, i])[0],
                'test': torch.where(data.test_mask[:, i])[0],
            })
    elif name in ['wikics']:
        torch_dataset = WikiCS(root=f"{data_dir}/wikics/")
        data = torch_dataset[0]
        # Validation nodes are the union of the validation and the
        # early-stopping masks; the union does not depend on the split index,
        # so it is computed once.
        valid_mask = torch.logical_or(data.val_mask, data.stopping_mask)
        for i in range(data.train_mask.shape[1]):
            splits_lst.append({
                'train': torch.where(data.train_mask[:, i])[0],
                'valid': torch.where(valid_mask[:, i])[0],
                # The test mask is used whole, i.e. shared by every split.
                'test': torch.where(data.test_mask[:])[0],
            })
    elif name in ['amazon-computer', 'amazon-photo', 'coauthor-cs', 'coauthor-physics']:
        # np.load on an .npz returns a lazily-read archive that keeps the file
        # open; the context manager guarantees the handle is released.
        with np.load(f'{data_dir}/{name}_split.npz') as idx:
            splits_lst.append({
                'train': torch.from_numpy(idx['train']),
                'valid': torch.from_numpy(idx['valid']),
                'test': torch.from_numpy(idx['test']),
            })
    elif name in ['pokec']:
        # NOTE(security): allow_pickle=True unpickles the file's contents;
        # only load split files that come from a trusted source.
        split = np.load(f'{data_dir}/{name}/{name}-splits.npy', allow_pickle=True)
        for fold in split:
            splits_lst.append({
                'train': torch.from_numpy(np.asarray(fold['train'])),
                'valid': torch.from_numpy(np.asarray(fold['valid'])),
                'test': torch.from_numpy(np.asarray(fold['test'])),
            })
    else:
        raise NotImplementedError(
            f'No fixed splits available for dataset {name!r}')

    return splits_lst
def eval_f1(y_true, y_pred):
    """Return the micro-averaged F1 score, averaged over label columns.

    Args:
        y_true: 2-D tensor of ground-truth labels, one column per task.
        y_pred: Tensor of per-class scores; the predicted class is the argmax
            over the last dimension.

    Returns:
        The mean of the per-column micro-F1 scores.
    """
    y_true = y_true.detach().cpu().numpy()
    # keepdim=True keeps a trailing axis, so predictions form one column.
    y_pred = y_pred.argmax(dim=-1, keepdim=True).detach().cpu().numpy()

    # Score each label column against the matching prediction column. The
    # previous loop ignored its index and re-scored the full matrices on
    # every iteration.
    f1_list = [
        f1_score(y_true[:, i], y_pred[:, i], average='micro')
        for i in range(y_true.shape[1])
    ]
    return sum(f1_list) / len(f1_list)
def eval_acc(y_true, y_pred):
    """Return the classification accuracy averaged over label columns.

    Args:
        y_true: 2-D tensor of ground-truth labels, one column per task.
            Entries that are NaN are treated as unlabeled and skipped.
        y_pred: Tensor of per-class scores; the predicted class is the argmax
            over the last dimension.

    Returns:
        The mean of the per-column accuracies as a float.
    """
    labels = y_true.detach().cpu().numpy()
    # keepdim=True keeps a trailing axis, so predictions form one column.
    predicted = y_pred.argmax(dim=-1, keepdim=True).detach().cpu().numpy()

    per_column = []
    for col in range(labels.shape[1]):
        target = labels[:, col]
        # NaN never equals itself, so self-comparison masks unlabeled rows.
        labeled = target == target
        hits = target[labeled] == predicted[labeled, col]
        per_column.append(float(np.sum(hits)) / len(hits))

    return sum(per_column) / len(per_column)
def eval_rocauc(y_true, y_pred):
    """Return the ROC-AUC averaged over label columns.

    Adapted from ogb:
    https://github.com/snap-stanford/ogb/blob/master/ogb/nodeproppred/evaluate.py

    Args:
        y_true: 2-D tensor of labels, one column per task, where 1 marks a
            positive and 0 a negative. NaN entries are treated as unlabeled.
        y_pred: Tensor of scores. When ``y_true`` has a single column, the
            scores are softmax-normalised over the last dimension and the
            probability at class index 1 is used; otherwise ``y_pred`` is
            indexed with one score column per task.

    Returns:
        The mean ROC-AUC over all columns that contain both classes.

    Raises:
        RuntimeError: If no column contains both a positive and a negative
            label, so that no ROC-AUC can be computed.
    """
    y_true = y_true.detach().cpu().numpy()
    if y_true.shape[1] == 1:
        # Single label column: score each row by the softmax probability of
        # class index 1. detach() is required so that predictions still
        # attached to the autograd graph can be converted to NumPy.
        y_pred = F.softmax(y_pred, dim=-1)[:, 1].unsqueeze(1).detach().cpu().numpy()
    else:
        y_pred = y_pred.detach().cpu().numpy()

    rocauc_list = []
    for i in range(y_true.shape[1]):
        column = y_true[:, i]
        # AUC is only defined when the column has at least one positive and
        # at least one negative example.
        if np.sum(column == 1) > 0 and np.sum(column == 0) > 0:
            # NaN never equals itself, so this masks out unlabeled rows.
            is_labeled = column == column
            score = roc_auc_score(column[is_labeled], y_pred[is_labeled, i])
            rocauc_list.append(score)

    if not rocauc_list:
        raise RuntimeError(
            'No positively labeled data available. Cannot compute ROC-AUC.')

    return sum(rocauc_list) / len(rocauc_list)
# Maps a dataset name to an opaque identifier string. Judging by the variable
# name these are presumably Google Drive file IDs used for downloads -- verify
# against the code that consumes this mapping.
dataset_drive_url = {
    'snap-patents': '1ldh23TSY1PwXia6dU0MYcpyEgX-w3Hia',
    'pokec': '1dNs5E7BrWJbgcHeQ_zuy5Ozp2tRCWG0y',
    'yelp-chi': '1fAXtTVQS4CfEk4asqrFw9EPmlUPGbGtJ',
}