forked from probml/pyprobml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
caliban_logs_parse.py
executable file
·92 lines (73 loc) · 2.49 KB
/
caliban_logs_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Parse log files from caliban_save_logs
# Authors: Guy Gur-Ari, Kevin Murphy
import glob
import json
import os

from absl import app
from absl import flags
import pandas as pd
# Handle to absl's global flag registry; main() reads FLAGS.logdir from it.
FLAGS = flags.FLAGS
# Register the --logdir string flag (default: empty string).
flags.DEFINE_string("logdir", "", "Directory containing log files")
def flatten_json_payload(entry):
    '''Hoist the fields of a nested "jsonPayload" dict to the top level.

    Args:
      entry: Mapping describing one log/config record.

    Returns:
      `entry` itself when it has no "jsonPayload" key. Otherwise a new dict
      holding the top-level fields of `entry` overlaid with the fields of
      its "jsonPayload" (payload values win on a name clash), with the
      "jsonPayload" key removed. `entry` is not modified.
    '''
    if "jsonPayload" in entry:
        merged = dict(entry)  # shallow copy so the caller's record is untouched
        merged.update(entry["jsonPayload"])
        merged.pop("jsonPayload")
        return merged
    return entry
def json_file_to_pandas(filename):
    '''Load one JSON file into a DataFrame with one row per entry.

    The file may hold either a single JSON value or a list of them. Each
    entry is passed through flatten_json_payload before the frame is built,
    so the fields of a nested "jsonPayload" become ordinary columns.

    Args:
      filename: Path of the JSON file to read.

    Returns:
      A pandas DataFrame built from the flattened entries.
    '''
    # JSON text is UTF-8 by specification; pin the encoding instead of
    # relying on the platform/locale default used by open().
    with open(filename, 'r', encoding='utf-8') as f:
        entries = json.load(f)
    if not isinstance(entries, list):
        # A lone top-level value is treated as a one-entry list.
        entries = [entries]
    return pd.DataFrame([flatten_json_payload(entry) for entry in entries])
def get_job_num_from_fname(fname):
    '''Extract the job number encoded at the end of a file name.

    Example:
      '/content/gdrive/MyDrive/Logs/caliban_kpmurphy_20210208_194505_1.json'
      yields 1.

    Args:
      fname: File name or path whose base name looks like
        '<anything>_<job number>.<suffix>'.

    Returns:
      The job number as an int.

    Raises:
      ValueError: If the final '_'-separated piece of the base name (before
        the first '.') is not an integer.
    '''
    # Parse the file name only. Splitting the whole path on '.' breaks as
    # soon as a directory contains a dot ('./logs/...', '/home/user.name/...').
    base = os.path.basename(fname)
    stem = base.split('.', 1)[0]  # drop the suffix
    job_num = stem.rsplit('_', 1)[-1]  # final piece is the number
    return int(job_num)
def json_dir_to_pandas(fnames):
    '''Read several JSON files and stack them into a single DataFrame.

    Each file is loaded with json_file_to_pandas and tagged with an int32
    'job_num' column taken from its file name (get_job_num_from_fname).
    The name of every file is printed as it is read.

    Args:
      fnames: Iterable of file paths.

    Returns:
      The per-file frames concatenated with pd.concat (row indices are kept
      as produced per file, so they may repeat across files). If `fnames`
      is empty, an empty DataFrame with only an int32 'job_num' column.
    '''
    df_list = []
    for filename in fnames:
        print('reading ', filename)
        df = json_file_to_pandas(filename)
        df['job_num'] = get_job_num_from_fname(filename)
        df = df.astype({'job_num': 'int32'})
        df_list.append(df)
    if not df_list:
        # pd.concat raises ValueError on an empty list; a directory without
        # matching files should give an empty table, not a crash.
        return pd.DataFrame({'job_num': pd.Series(dtype='int32')})
    return pd.concat(df_list)
def parse_logs(logdir):
    '''Read every '*.log' file directly inside `logdir` into one DataFrame.

    Args:
      logdir: Directory to search. An empty string means the current
        directory. The name is taken literally, so glob metacharacters
        in it ('*', '?', '[') are not expanded.

    Returns:
      The DataFrame produced by json_dir_to_pandas for the matching files,
      which are visited in sorted name order.
    '''
    # os.path.join keeps an empty logdir relative (the old f-string turned it
    # into '/*.log', i.e. the filesystem root); glob.escape protects special
    # characters in the directory name.
    pattern = os.path.join(glob.escape(logdir), '*.log')
    # glob returns matches in arbitrary order; sort for reproducible output.
    fnames = sorted(glob.glob(pattern))
    return json_dir_to_pandas(fnames)
def parse_configs(logdir):
    '''Read every '*.config' file directly inside `logdir` into one DataFrame.

    Args:
      logdir: Directory to search. An empty string means the current
        directory. The name is taken literally, so glob metacharacters
        in it ('*', '?', '[') are not expanded.

    Returns:
      The DataFrame produced by json_dir_to_pandas for the matching files,
      which are visited in sorted name order.
    '''
    # os.path.join keeps an empty logdir relative (the old f-string turned it
    # into '/*.config', i.e. the filesystem root); glob.escape protects
    # special characters in the directory name.
    pattern = os.path.join(glob.escape(logdir), '*.config')
    # glob returns matches in arbitrary order; sort for reproducible output.
    fnames = sorted(glob.glob(pattern))
    return json_dir_to_pandas(fnames)
def get_log_messages(df, job_num=None):
    '''Return the log messages sorted by timestamp, oldest first.

    Args:
      df: DataFrame of log entries with 'timestamp' and 'message' columns,
        plus a 'job_num' column when `job_num` is given.
      job_num: If not None, only entries of this job are returned.

    Returns:
      A 2-D numpy array of shape (num_messages, 1) holding the non-null
      messages in ascending timestamp order.
    '''
    # Compare against None explicitly: job number 0 is falsy but is still a
    # job that the caller asked to filter on.
    if job_num is not None:
        df = df[df.job_num == job_num]
    # Messages are stored most recent first. We restore chronological order.
    df = df[['timestamp', 'message']].copy()
    df['time'] = pd.to_datetime(df.timestamp)
    df = df.sort_values(by='time', ascending=True)
    messages = df.loc[:, ['message']].dropna()
    return messages.values
def get_args(df, job_num):
    '''Return list of arguments (flags) passed to this job.

    Args:
      df: DataFrame of job configs with a 'job_num' column and a
        'trainingInput' column whose values are dicts with an 'args' key.
      job_num: Job whose arguments are wanted.

    Returns:
      The 'args' entry of the first matching row's 'trainingInput'.

    Raises:
      KeyError: If no row of `df` has this job number.
    '''
    # Look the job up in the frame that was passed in; the previous code
    # read an undefined global named `configs` and always raised NameError.
    matches = df.loc[df.job_num == job_num, 'trainingInput'].values
    if len(matches) == 0:
        raise KeyError(f'No config found for job_num={job_num}')
    return matches[0]['args']
def main(argv):
    '''Entry point: print the parsed config table, then the parsed log table.

    Args:
      argv: Command-line arguments left over after flag parsing; anything
        beyond the program name is rejected.

    Raises:
      app.UsageError: If extra positional arguments were given.
    '''
    extra_args = argv[1:]
    if extra_args:
        raise app.UsageError('Too many command-line arguments.')

    log_dir = FLAGS.logdir
    config_table = parse_configs(log_dir)
    print(config_table)
    log_table = parse_logs(log_dir)
    print(log_table)
# Run main() through absl only when this file is executed as a script.
if __name__ == '__main__':
    app.run(main)