Alpha Version 4 for Marc
stefanDeveloper committed Apr 11, 2024
1 parent 0ee20a4 commit 5248c57
Showing 11 changed files with 243 additions and 190 deletions.
34 changes: 17 additions & 17 deletions heidgaf/cli.py
@@ -44,23 +44,23 @@ def check_gpu():


@cli.command(name="train", context_settings={"show_default": True})
-@click.option(
-    "-m",
-    "--model",
-    "model",
-    required=True,
-    type=click.Path(),
-    help="Input directory or file for analyzing."
-)
-@click.option(
-    "-d",
-    "--dataset",
-    "dataset",
-    required=True,
-    type=click.Path(),
-    help="Input directory or file for analyzing."
-)
-def training_model(model, dataset):
+# @click.option(
+#     "-m",
+#     "--model",
+#     "model",
+#     required=True,
+#     type=click.Path(),
+#     help="Input directory or file for analyzing."
+# )
+# @click.option(
+#     "-d",
+#     "--dataset",
+#     "dataset",
+#     required=True,
+#     type=click.Path(),
+#     help="Input directory or file for analyzing."
+# )
+def training_model():
    click.echo("Start training of model.")
    trainer = DNSAnalyzerTraining(
        model=LogisticRegression(input_dim=9, output_dim=1, epochs=5000)
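With both options commented out, `train` currently runs with hard-wired defaults. A minimal sketch (not part of the commit) of exercising the reworked command through click's test runner, assuming heidgaf.cli exposes the `cli` group shown above:

# Hedged sketch: invoke the now option-less command via click's CliRunner.
from click.testing import CliRunner

from heidgaf.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["train"])  # -m/--model and -d/--dataset are gone
print(result.output)  # expected to begin with "Start training of model."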
78 changes: 74 additions & 4 deletions heidgaf/dataset/__init__.py
@@ -2,18 +2,78 @@
from dataclasses import dataclass
from typing import Any, Callable, List

+import numpy as np
import polars as pl
import sklearn.model_selection
-from torch.utils.data.dataset import Dataset
+from fe_polars.encoding.one_hot_encoding import OneHotEncoder


+def preprocess(x: pl.DataFrame):
+    x = x.with_columns(
+        [
+            (pl.col("query").str.split(".").alias("labels")),
+        ]
+    )
+
+    x = x.with_columns(
+        [
+            # FQDN
+            (pl.col("query")).alias("fqdn"),
+        ]
+    )
+
+    x = x.with_columns(
+        [
+            # Second-level domain
+            (
+                pl.when(pl.col("labels").list.len() > 2)
+                .then(pl.col("labels").list.get(-2))
+                .otherwise(pl.col("labels").list.get(0))
+                .alias("secondleveldomain")
+            )
+        ]
+    )
+
+    x = x.with_columns(
+        [
+            # Third-level domain
+            (
+                pl.when(pl.col("labels").list.len() > 2)
+                .then(
+                    pl.col("labels")
+                    .list.slice(0, pl.col("labels").list.len() - 2)
+                    .list.join(".")
+                )
+                .otherwise(pl.lit(""))
+                .alias("thirdleveldomain")
+            ),
+        ]
+    )
+    x = x.with_columns(
+        [
+            (
+                pl.when(pl.col("class") == "legit")
+                .then(pl.lit(0))
+                .otherwise(pl.lit(1))
+                .alias("class")
+            )
+        ]
+    )
+    return x
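The new preprocess() splits each query into labels, derives fqdn, second- and third-level domain columns, and binarizes class (legit -> 0, everything else -> 1). A small usage sketch on invented rows, assuming the function is importable from heidgaf.dataset:

# Hypothetical data; illustrates only the derived columns.
import polars as pl

from heidgaf.dataset import preprocess

df = pl.DataFrame({
    "query": ["www.example.com", "mail.google.com"],
    "class": ["legit", "malware"],
})
out = preprocess(df)
# secondleveldomain: "example", "google"
# thirdleveldomain:  "www",     "mail"
# class:             0,         1
print(out.select(["fqdn", "secondleveldomain", "thirdleveldomain", "class"]))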


def cast_cic(data_path: List[str]):
    dataframes = []
    for data in data_path:
        y = data.split("_")[-1].split(".")[0]
        df = pl.read_csv(data, has_header=False)
-        df = df.with_columns([pl.lit(y).alias("class")])
+        if y == "benign":
+            df = df.with_columns([pl.lit("legit").alias("class")])
+        else:
+            df = df.with_columns([pl.lit(y).alias("class")])
        df = df.rename({"column_1": "query"})
+        df = preprocess(df)
        dataframes.append(df)
    return pl.concat(dataframes)
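cast_cic infers each file's label from its name before remapping benign to the "legit" label preprocess() expects. A toy reproduction of that parsing (file name invented):

# Hypothetical file name; mirrors the split("_")/split(".") logic above.
path = "CICBellDNS2021_CSV_benign.csv"
y = path.split("_")[-1].split(".")[0]
print(y)  # "benign" -> stored as class "legit"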

@@ -32,17 +92,22 @@ def __custom_decode(data):
    # Drop unnecessary column
    df = df.drop("__index_level_0__")
    df = df.with_columns([pl.col("query").map(__custom_decode)])
+    df = preprocess(df)
    return df


@dataclass
class Dataset:
-    def __init__(self, data_path: Any, cast_dataset: Callable = None) -> None:
+    def __init__(self, data_path: Any, data: pl.DataFrame = None, cast_dataset: Callable = None) -> None:
        if cast_dataset is not None:
            self.data = cast_dataset(data_path)
-        else:
+        elif data_path != "":
            self.data = pl.read_csv(data_path)
+        elif data is not None:
+            self.data = data
+        else:
+            raise NotImplementedError("No data given")
+        self.label_encoder = OneHotEncoder(features_to_encode=["class"])
        self.X_train, self.X_val, self.X_test, self.Y_train, self.Y_val, self.Y_test = (
            self.__train_test_val_split()
        )
@@ -51,6 +116,10 @@ def __len__(self):
        return len(self.data)

    def __train_test_val_split(self, train_frac=0.8, random_state=None):
+
+        # TODO binary and multiclass support
+        # self.data = self.label_encoder.transform(self.data)
+
        X_train, X_tmp, Y_train, Y_tmp = sklearn.model_selection.train_test_split(
            self.data.drop("class"),
            self.data.select("class"),
@@ -91,3 +160,4 @@ def val(self):
    ],
    cast_dataset=cast_cic,
)
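Dataset can now be built from an in-memory frame as well as from a path or a caster. A hedged construction sketch with invented rows, assuming Dataset and preprocess are exported by heidgaf.dataset as shown in this diff:

import polars as pl

from heidgaf.dataset import Dataset, preprocess

frame = preprocess(pl.DataFrame({
    "query": [f"host{i}.example{i}.com" for i in range(10)],
    "class": ["legit" if i % 2 == 0 else "malware" for i in range(10)],
}))
ds = Dataset(data_path="", data=frame)  # the new in-memory keyword path
print(len(ds))  # 10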

15 changes: 8 additions & 7 deletions heidgaf/main.py
@@ -3,6 +3,7 @@
import os
from enum import Enum, unique

+import joblib
import polars as pl
from click import Path

@@ -67,7 +68,7 @@ def __init__(
        redis_port=6379,
        redis_db=0,
        redis_max_connections=20,
-        threshold=3
+        threshold=5
    ) -> None:
        self.df_cache = DataFrameRedisCache(
            redis_host, redis_port, redis_db, redis_max_connections
@@ -86,7 +87,7 @@ def __init__(
        self.n_standard_deviations = n_standard_deviations
        self.anomaly_influence = anomaly_influence
        self.detector = detector
-        self.threshold = 3
+        self.threshold = threshold

    def load_data(self, path: str, separator: str) -> pl.DataFrame:
        """Loads data from csv files
@@ -157,17 +158,17 @@ def load_data(self, path: str, separator: str) -> pl.DataFrame:
                ),
            ]
        )

        # Filter invalid domains
        x = x.filter(pl.col("query") != "|")
        x = x.filter(pl.col("labels").list.len() > 1)

        return x
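The two new guards drop rows whose query is the literal "|" or consists of a single label. A toy run on invented data:

import polars as pl

x = pl.DataFrame({"query": ["www.example.com", "|", "localhost"]})
x = x.with_columns(pl.col("query").str.split(".").alias("labels"))
x = x.filter(pl.col("query") != "|")
x = x.filter(pl.col("labels").list.len() > 1)
print(x["query"].to_list())  # ["www.example.com"]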

    def run(self):
        """Starts the analysis tasks on the given data input."""

        # TODO Multithreading
        # TODO Handle warnings for machine learning predictions

        # preprocessor = Preprocessor(features_to_drop=[])
        # processed_data = preprocessor.transform(self.data)

        # Creates anomaly detector
        config = AnomalyDetectorConfig(
@@ -185,6 +186,6 @@ def run(self):
            raise NotImplementedError("Detector not implemented!")

        # Run analyzers to find anomalies in data
-        config = AnalyzerConfig(detector, self.df_cache, self.threshold)
+        config = AnalyzerConfig(detector, self.df_cache, self.threshold, joblib.load("model.pkl"))
        for analyzer in ["IP"]:
            analyzer_factory(analyzer, config).run(self.data)
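run() now unpickles a classifier from model.pkl (the file name is taken from the line above). A sketch of producing that artifact with joblib; the RandomForestClassifier stands in for whatever model is actually trained:

import joblib
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
# ... fit clf on the engineered DNS features before dumping ...
joblib.dump(clf, "model.pkl")
model = joblib.load("model.pkl")  # as run() does via AnalyzerConfig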
9 changes: 7 additions & 2 deletions heidgaf/models/__init__.py
@@ -1,5 +1,7 @@
from fe_polars.encoding.target_encoding import TargetEncoder
from fe_polars.imputing.base_imputing import Imputer
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.neighbors import LocalOutlierFactor

from heidgaf.models.lr import LogisticRegression
from heidgaf.post.feature import Preprocessor
@@ -22,10 +24,13 @@ def fit(self, x_train, y_train):
        x_train = self.preprocessor.transform(x=x_train)
        x_train = self.target_encoder.fit_transform(x=x_train, y=y_train)
        x_train = self.mean_imputer.fit_transform(x=x_train)
-        self.clf.fit(x=x_train.to_numpy(), y=y_train)
+        self.clf.fit(X=x_train.to_numpy(), y=y_train.to_numpy().reshape(-1, 1))

    def predict(self, x):
        x = self.preprocessor.transform(x=x)
        x = self.target_encoder.transform(x=x)
        x = self.mean_imputer.transform(x=x)
-        return self.clf.predict(x=x.to_numpy())
+        return self.clf.predict(X=x.to_numpy())


+random_forest_model = RandomForestClassifier()
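The fit/predict calls now use sklearn's capitalized X keyword. Worth noting: sklearn expects a 1-D y, so the (-1, 1) reshape above will provoke a DataConversionWarning (plausibly the warning flagged in main.py's TODO). A minimal shape sketch with invented data:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(100, 9)             # e.g. nine engineered DNS features
y = np.random.randint(0, 2, size=100)  # 1-D labels avoid the DataConversionWarning

clf = RandomForestClassifier()
clf.fit(X=X, y=y)
print(clf.predict(X=X[:5]))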
14 changes: 0 additions & 14 deletions heidgaf/models/lof.py

This file was deleted.

95 changes: 0 additions & 95 deletions heidgaf/models/lr.py

This file was deleted.

20 changes: 0 additions & 20 deletions heidgaf/models/xgboost.py

This file was deleted.

