-
Notifications
You must be signed in to change notification settings - Fork 0
/
outlier_detection.py
64 lines (51 loc) · 2.22 KB
/
outlier_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import numpy as np
import os
from config import DATA_DIR, ALPHA
from process_data import load_preprocessed_payloads_from_file, count_mean_freqs, count_std_devs
class Model:
    """Frequency/deviation model of payloads used for outlier detection.

    The model holds three vectors of length 256. ``freqs`` and ``dev`` are
    filled by :meth:`train` or :meth:`load_from_file`; ``variance`` is
    allocated in ``__init__`` but not written by any method of this class.
    """

    # Length of every vector held by the model
    # (presumably one entry per possible byte value -- confirm in process_data).
    _VECTOR_LEN = 256

    def __init__(self, data_dir):
        """Create an untrained model whose vectors are all zeros.

        Args:
            data_dir: Stored as ``self.data_dir``; no method of this class
                reads it.
        """
        self.data_dir = data_dir
        self.freqs = np.zeros(self._VECTOR_LEN, dtype=np.double)
        self.variance = np.zeros(self._VECTOR_LEN, dtype=np.double)
        self.dev = np.zeros(self._VECTOR_LEN, dtype=np.double)

    def train(self, input_file):
        """Fit ``freqs`` and ``dev`` from the payloads stored in *input_file*.

        Args:
            input_file: Passed unchanged to
                ``load_preprocessed_payloads_from_file``.
        """
        data = load_preprocessed_payloads_from_file(input_file)
        # Compute both statistics before assigning, so a failure in either
        # helper leaves the previous model state untouched.
        freqs = count_mean_freqs(data)
        devs = count_std_devs(data)
        self.freqs = freqs
        self.dev = devs

    @staticmethod
    def _format_vector(vector):
        """Return *vector* as one line of space-separated float literals."""
        flat = np.asarray(vector, dtype=np.double).ravel()
        # repr() of a Python float is the shortest string that parses back to
        # the same value, so saving does not lose precision.
        return ' '.join(repr(value) for value in flat.tolist())

    @classmethod
    def _parse_vector(cls, line):
        """Parse one line of whitespace-separated floats into a vector.

        Raises:
            ValueError: If a token is not a float or the line does not hold
                exactly ``_VECTOR_LEN`` values.
        """
        values = np.array([float(token) for token in line.split()],
                          dtype=np.double)
        if values.shape != (cls._VECTOR_LEN,):
            raise ValueError(
                "Expected {} values per line, got {}".format(
                    cls._VECTOR_LEN, values.size))
        return values

    def save_to_file(self, filename, dir=''):
        """Write ``freqs`` and ``dev`` to a two-line text file.

        Line 1 holds ``freqs`` and line 2 holds ``dev``, each as
        space-separated numbers.

        Args:
            filename: Name of the file to write.
            dir: Directory containing the file. The default ``''`` means the
                path is just *filename*, i.e. relative to the current working
                directory.
        """
        # os.path.join avoids producing '/<filename>' when dir is empty.
        path = os.path.join(dir, filename)
        with open(path, 'w') as f:
            f.write(self._format_vector(self.freqs) + "\n")
            f.write(self._format_vector(self.dev))

    def load_from_file(self, filename, dir=''):
        """Load ``freqs`` and ``dev`` from a file written by :meth:`save_to_file`.

        Args:
            filename: Name of the file to read.
            dir: Directory containing the file; ``''`` means relative to the
                current working directory.

        Raises:
            ValueError: If either of the first two lines does not contain
                exactly 256 numbers.
        """
        path = os.path.join(dir, filename)
        with open(path, 'r') as f:
            freqs = self._parse_vector(f.readline())
            devs = self._parse_vector(f.readline())
        # Assign only after both lines parsed, so a bad file cannot leave the
        # model half-updated.
        self.freqs = freqs
        self.dev = devs

    def evaluate(self, payload):
        """Print and return the distance between *payload* and the model.

        Args:
            payload: Passed unchanged to ``count_mean_freqs``.

        Returns:
            The value of ``simplified_mahalanobis(self.freqs, payload_freqs,
            self.dev)``.
        """
        payload_freqs = count_mean_freqs(payload)
        print(payload_freqs)
        distance = simplified_mahalanobis(self.freqs, payload_freqs, self.dev)
        print(distance)
        # Todo
        return distance

    def update(self, payloads):
        """Placeholder for incremental model updates; currently does nothing."""
        pass
        # Todo
def simplified_mahalanobis(x_freqs, y_freqs, devs, alpha=None):
    """Return the simplified Mahalanobis distance between two vectors.

    Computes ``sum(|x_i - y_i| / (dev_i + alpha))`` over all 256 entries.

    Args:
        x_freqs: Array of shape ``(256,)``.
        y_freqs: Array of shape ``(256,)``.
        devs: Array of shape ``(256,)`` used as the per-entry divisor.
        alpha: Value added to every entry of *devs* before dividing. When
            ``None`` (the default) the module-level ``ALPHA`` is used.

    Returns:
        The summed distance as a NumPy scalar.

    Raises:
        ValueError: If the three arrays do not all have shape ``(256,)``.
    """
    expected_shape = (256,)
    if not (x_freqs.shape == y_freqs.shape == devs.shape == expected_shape):
        # Shapes are tuples, so they must be formatted rather than
        # concatenated to a str (which would raise TypeError instead).
        raise ValueError(
            "The vectors passed should be of same shape {} "
            "(x.shape = {}, y.shape = {}, devs.shape = {})".format(
                expected_shape, x_freqs.shape, y_freqs.shape, devs.shape))
    smoothing = ALPHA if alpha is None else alpha
    return np.absolute((x_freqs - y_freqs) / (devs + smoothing)).sum()
if __name__ == '__main__':
    # Manual smoke test: train a model, save it, reload it, then score a payload.
    model = Model(DATA_DIR)
    model.train('output_small.csv')

    print("CALCULATED")
    for vector in (model.freqs, model.dev):
        print(vector)

    # Save and reload through the current working directory.
    working_dir = os.getcwd()
    model.save_to_file('calculated.csv', dir=working_dir)
    print("+++++++++++++++++++++++++++")
    model.load_from_file('calculated.csv', dir=working_dir)

    print("LOADED")
    for vector in (model.freqs, model.dev):
        print(vector)

    print("PAYLOAD:")
    model.evaluate("index.html")