import os.path
import json
import requests
import numpy as np
import ftfy
from data.encoders import fetch_encoder
import tensorflow as tf
lambada_src_uri = 'https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl'
normalization = 'NFKC'
# Note: this task is called "lambada" but it really refers to OpenAI's version
# of the task, which actually differs in some ways from the task described in
# the original paper. So, strictly speaking, accuracy values from this task
# should not be compared to accuracy values from the original lambada task.
# For more information, see
# https://github.com/openai/gpt-2/issues/131
def lambada_create_tokens_data(params, path):
    # Download the OpenAI LAMBADA test set, fix the text encoding, tokenize it
    # with the configured encoder, and cache the token arrays as JSON at `path`.
    with open(path, 'w') as f:
        req = requests.get(lambada_src_uri)
        req.raise_for_status()
        jsons = [json.loads(l) for l in req.iter_lines()]
        texts = [ftfy.fix_text(j['text'], normalization=normalization) for j in jsons]
        enc = fetch_encoder(params)
        arrays = [enc.encode(t) for t in texts]
        json.dump(arrays, f)
        return arrays


def lambada_read_or_create_tokens_data(params, path):
    # if you tell me where the file should go, i will helpfully create it for you
    if not os.path.exists(path):
        return lambada_create_tokens_data(params, path)
    with open(path) as f:
        return json.load(f)


def lambada_bin_pack(params, tokens_data):
    # Greedily pack examples into "bins" of at most n_ctx tokens, appending an
    # eos token after every example, then pad the number of bins up to a
    # multiple of the eval batch size and right-pad each bin with dummy tokens.
    eos_token = params['eos_id']
    n_ctx = params['n_ctx']
    dummy_token = 1
    pad_batch_size = params['eval_batch_size']
    bins = []
    for a in tokens_data:
        if len(bins) == 0 or len(bins[-1]) + len(a) + 1 > n_ctx:
            bins.append([])
        bins[-1] += a
        bins[-1].append(eos_token)
    while len(bins) % pad_batch_size != 0:
        bins.append([])
    bins_array = np.full((len(bins), n_ctx), dummy_token, dtype=np.uint16)
    for i, b in enumerate(bins):
        bins_array[i, 0:len(b)] = b
    return bins_array
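
# Illustrative example (not part of the original file; the params values are
# hypothetical): with params = {'eos_id': 0, 'n_ctx': 8, 'eval_batch_size': 2}
# and tokens_data = [[1, 2, 3], [4, 5]], both examples fit in one bin of length
# 8 (each followed by eos), and one empty bin is appended so the bin count is a
# multiple of the batch size. Unused slots keep the dummy token value 1:
#
#   lambada_bin_pack(params, [[1, 2, 3], [4, 5]])
#   # -> array([[1, 2, 3, 0, 4, 5, 0, 1],
#   #           [1, 1, 1, 1, 1, 1, 1, 1]], dtype=uint16)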


def lambada_init(params):
    ds_configs = params['dataset_configs']
    l = [
        ds_configs[ds_id]['lambada_tokens_path']
        for ds_id, _, _, _ in params['datasets']
        if 'lambada_tokens_path' in ds_configs[ds_id]
    ]
    assert len(l) > 0, 'lambada_tokens_path not found in the dataset config'
    lt_path = l[0]
    assert lt_path.endswith('.json'), 'lambada_tokens_path must have extension json'
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = lambada_bin_pack(params, tokens_data)
    params['lambada_tokens_path'] = lt_path
    params['lambada_n_steps'] = len(bins_array) // params['eval_batch_size']


def lambada_get_task_info(params):
    return {
        'n_steps': params['lambada_n_steps'],
    }


# The LAMBADA evaluation code looks at the logits of each position just before an eos_token
def lambada_input(params):
    eos_token = 50256 if params['n_vocab'] >= 50257 else 0
    n_ctx = params['n_ctx']
    lt_path = params['lambada_tokens_path']
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = lambada_bin_pack(params, tokens_data)
    dataset = tf.data.Dataset.from_tensor_slices(bins_array)

    def _get_output(bin):
        bin = tf.cast(bin, dtype=tf.int32)
        indexes = tf.range(n_ctx)
        results = tf.gather(bin, (indexes + 1) % n_ctx)
        eos_next_positions = tf.math.equal(tf.gather(bin, (indexes + 2) % n_ctx), eos_token)
        # Keep the next token as the target only at positions where the token two
        # steps ahead is eos (i.e. the position immediately preceding the final
        # word of an example); everywhere else the target is set to eos_token.
        output = tf.where(eos_next_positions, results, tf.constant(eos_token, shape=[n_ctx]))
        bin = tf.reshape(bin, [n_ctx])
        bin = tf.cast(bin, dtype=tf.int32)
        output = tf.reshape(output, [n_ctx])
        output = tf.cast(output, dtype=tf.int32)
        return bin, output

    dataset = dataset.map(_get_output)
    dataset = dataset.batch(params['eval_batch_size'], drop_remainder=True)
    dataset = dataset.repeat()
    return dataset
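
# Illustrative example (not part of the original file; token ids are hypothetical):
# for a packed bin containing an example that ends "... 17 42 <eos>" with
# eos_token = 50256, _get_output pairs the bin with a target vector that is
# eos_token everywhere except at the position holding 17, where the target is 42
# (the example's final word). The evaluation therefore only scores the model's
# prediction of the final word of each passage, as noted in the comment above.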


task_descriptors = {
    'lambada': {
        'init_fn': lambada_init,
        'get_task_info_fn': lambada_get_task_info,
        'input_fn': lambada_input,
    }
}
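
# Illustrative usage sketch (not part of the original file; assumes the caller
# has populated the `params` keys referenced throughout this module):
#
#   task = task_descriptors['lambada']
#   task['init_fn'](params)                        # downloads/caches tokens, sets lambada_n_steps
#   n_steps = task['get_task_info_fn'](params)['n_steps']
#   dataset = task['input_fn'](params)             # tf.data.Dataset of (inputs, targets) batches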