-
Notifications
You must be signed in to change notification settings - Fork 1
/
data.py
89 lines (66 loc) · 2.36 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from torch.utils.data import Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import random
import json
import pandas as pd
class PandasDataset(Dataset):
    """
    Torch Dataset backed by a pandas object loaded from a pickle file.

    NOTE(review): unpickling can execute arbitrary code, so only point
    this at pickle files from a trusted source.
    """

    def __init__(self, pd_file):
        """
        Load the pickled pandas object stored at `pd_file`.
        """
        frame = pd.read_pickle(pd_file)
        self.data = frame

    def __len__(self):
        """
        Number of rows in the loaded data.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Return the row whose index *label* is `idx` as a plain dict.

        The lookup goes through `.loc`, so `idx` is matched against the
        index labels rather than against the row position.
        """
        row = self.data.loc[idx]
        return row.to_dict()

    def select(self, field):
        """
        Lazily yield the value of `field` for every row, in row order.
        """
        yield from (row[field] for _label, row in self.data.iterrows())
class JsonDataset(Dataset):
    """
    Creates a Torch Dataset from a JSON file.

    We assume the JSON file is a list of dictionaries, where each
    dictionary corresponds to a single datum.
    """

    def __init__(self, json_file):
        """
        Parse `json_file` and keep the decoded content in `self.data`.
        """
        # JSON is exchanged as UTF-8; relying on the platform's default
        # encoding breaks non-ASCII text on systems whose locale is not
        # UTF-8, so the encoding is stated explicitly.
        with open(json_file, encoding="utf-8") as f:
            self.data = json.load(f)

    def __len__(self):
        """
        Number of data loaded from the file.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Return the datum stored at position `idx`.
        """
        return self.data[idx]

    def select(self, field):
        """
        Lazily yield the value stored under `field` for every datum.
        """
        # Iterating over `self` goes through __getitem__, so a subclass
        # that overrides item access is honoured here as well.
        for datum in self:
            yield datum[field]
def domain(data, category):
    """
    Collects the distinct values that `category` takes across `data`.

    Every datum in `data` is indexed with `category`; the values found
    are returned as a set.
    """
    return {datum[category] for datum in data}
def split_data(ids, dev_percent, test_percent):
    """
    Given a list of datum ids and dev/test percentages, returns a partition
    (train, dev, test) of the datum ids.

    `dev_percent` and `test_percent` are fractions of `len(ids)`; the
    resulting sizes are truncated to integers. Duplicate ids are only
    considered once. All three parts are returned as lists, and the
    split is drawn with the module-level `random` generator, so seeding
    it makes the split reproducible.

    Raises ValueError (from `random.sample`) when a requested subset is
    larger than the number of ids still available.
    """
    dev_size = int(dev_percent * len(ids))
    test_size = int(test_percent * len(ids))

    # random.sample() needs a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on. De-duplicating through
    # a dict keeps the input order, which also keeps seeded runs stable
    # (set iteration order of strings changes between interpreter runs).
    pool = list(dict.fromkeys(ids))

    dev_ids = random.sample(pool, dev_size)
    dev_lookup = set(dev_ids)
    pool = [datum_id for datum_id in pool if datum_id not in dev_lookup]

    test_ids = random.sample(pool, test_size)
    test_lookup = set(test_ids)
    train_ids = [datum_id for datum_id in pool if datum_id not in test_lookup]

    return train_ids, dev_ids, test_ids
def get_samplers(all_ids, dev_percent, test_percent):
    """
    Builds random samplers for a train/dev/test split of `all_ids`.

    The ids are partitioned with `split_data` using the given dev/test
    percentages, and each part is wrapped in a SubsetRandomSampler.
    Returns the tuple (train_sampler, dev_sampler, test_sampler).
    """
    partition = split_data(all_ids, dev_percent, test_percent)
    # Partition order is (train, dev, test); the samplers keep that order.
    train_sampler, dev_sampler, test_sampler = map(SubsetRandomSampler, partition)
    return train_sampler, dev_sampler, test_sampler