-
Notifications
You must be signed in to change notification settings - Fork 0
/
year_counter.py
executable file
·117 lines (90 loc) · 4.29 KB
/
year_counter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
import collections
import csv
from datetime import datetime
from datetime import timezone
import json

import package_cache
import translation_cache
# JSON file that this script loads into `project_timestamps`; the script looks
# entries up by project name and treats each value as that project's package
# timestamps.
PACKAGE_TIMESTAMPS_FILENAME = '/var/experiments-output/package_cache.json'
# CSV download log; each row is unpacked below as
# (timestamp, ip_address, package_url, user_agent).
DOWNLOAD_LOG_FILENAME = '/var/experiments-output/simple/sorted.packages.log.4'
# The experiment is only valid from the following Unix timestamp onwards
# (1395360000 == 2014-03-21 00:00:00 UTC). It is passed to
# package_cache.get_last_timestamp_before_compromise() as the cutoff.
SINCE_TIMESTAMP = 1395360000
# 1397952000 == 2014-04-20 00:00:00 UTC, i.e. 30 days after SINCE_TIMESTAMP.
# A project whose timestamp is later than this is reported as being "from the
# future".
UNTIL_TIMESTAMP = 1397952000
# Load the cached package timestamps. The rest of this script looks entries up
# by project name and treats every value as that project's timestamps.
with open(PACKAGE_TIMESTAMPS_FILENAME) as package_timestamps_json:
    project_timestamps = json.load(package_timestamps_json)

# year -> number of projects whose last pre-compromise update fell in that
# year.
projects_last_updated_in_year = collections.Counter()
# month (1-12) -> the same count, restricted to projects last updated in 2014.
projects_last_updated_in_2014_last_updated_in_month = collections.Counter()

for timestamps in project_timestamps.values():
    # We are looking only at projects that were updated before the compromise.
    # NOTE(review): the helper presumably returns the latest timestamp earlier
    # than SINCE_TIMESTAMP, or a falsy value when there is none -- confirm
    # against package_cache.
    last_updated_timestamp = \
        package_cache.get_last_timestamp_before_compromise(timestamps,
                                                           SINCE_TIMESTAMP)

    # No update before the compromise: this project is not counted at all.
    if not last_updated_timestamp:
        continue

    # Timezone-aware replacement for the deprecated
    # datetime.utcfromtimestamp(); the year and month are still read in UTC.
    last_updated_datetime = datetime.fromtimestamp(last_updated_timestamp,
                                                   tz=timezone.utc)
    last_updated_year = last_updated_datetime.year
    projects_last_updated_in_year[last_updated_year] += 1

    # Break the year 2014 down further by month.
    if last_updated_year == 2014:
        last_updated_month = last_updated_datetime.month
        projects_last_updated_in_2014_last_updated_in_month[
            last_updated_month] += 1

print('All projects last updated before compromise in these years:')
print(projects_last_updated_in_year)
print('')

print('All projects last updated before compromise in 2014 updated in these months:')
print(projects_last_updated_in_2014_last_updated_in_month)
print('')
# year -> number of download-log rows whose project was last updated in that
# year. Every log row counts once, so a project downloaded many times is
# counted many times.
dloaded_projects_last_updated_in_year = collections.Counter()
# month (1-12) -> the same tally, restricted to the year 2014.
dloaded_projects_last_updated_in_2014_last_updated_in_month = \
    collections.Counter()
# Downloaded projects whose fallback timestamp is later than UNTIL_TIMESTAMP.
future_projects = set()
# Downloaded projects that have no entry in project_timestamps.
missing_projects = set()

# The csv module documentation asks for files to be opened with newline='' so
# that the reader, not the text layer, interprets line endings.
with open(DOWNLOAD_LOG_FILENAME, newline='') as download_log_file:
    download_log = csv.reader(download_log_file)

    for timestamp, ip_address, package_url, user_agent in download_log:
        # The download time itself is not used below, but int() still rejects
        # a row whose first column is not an integer.
        timestamp = int(timestamp)
        project_name = translation_cache.infer_package_name(package_url)

        try:
            timestamps = project_timestamps[project_name]
        except KeyError:
            # NOTE: Probably the entire project was deleted after compromise
            # but before now.
            missing_projects.add(project_name)
            continue

        # We are looking only at projects that were updated before the
        # compromise.
        last_updated_timestamp = \
            package_cache.get_last_timestamp_before_compromise(
                timestamps, SINCE_TIMESTAMP)

        # Project was not updated before compromise.
        if not last_updated_timestamp:
            # Misnomer, but actually the first time package was updated after
            # compromise.
            # NOTE(review): assumes `timestamps` is non-empty and that its
            # first element is the earliest one -- confirm against
            # package_cache.
            last_updated_timestamp = timestamps[0]

            # Question: Why is the user downloading a package from a project
            # that *seems* to have been last updated in the future?
            # Answer: Some of these packages seem to have been deleted. For
            # example, a user downloaded sparsehash-0.11 which does not exist
            # on PyPI anymore, and the earliest known package now is
            # sparsehash-0.3 which was updated after the compromise.
            if last_updated_timestamp > UNTIL_TIMESTAMP:
                future_projects.add(project_name)

        # Timezone-aware replacement for the deprecated
        # datetime.utcfromtimestamp(); the year and month are still read in
        # UTC.
        last_updated_datetime = datetime.fromtimestamp(last_updated_timestamp,
                                                       tz=timezone.utc)
        last_updated_year = last_updated_datetime.year
        dloaded_projects_last_updated_in_year[last_updated_year] += 1

        # Break the year 2014 down further by month.
        if last_updated_year == 2014:
            last_updated_month = last_updated_datetime.month
            dloaded_projects_last_updated_in_2014_last_updated_in_month[
                last_updated_month] += 1

print('All downloaded projects were last updated in these years:')
print(dloaded_projects_last_updated_in_year)
print('')

print('All downloaded projects last updated in 2014 updated in these months:')
print(dloaded_projects_last_updated_in_2014_last_updated_in_month)
print('')

print('Projects that seem to be from the future:')
print(sorted(future_projects))
print('')

print('Missing (deleted?) projects:')
print(sorted(missing_projects))
print('')