
Commit b582da5

Add dataset

stefanDeveloper committed Apr 2, 2024
1 parent 13ad76b commit b582da5
Showing 15 changed files with 241 additions and 217 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -76,12 +76,15 @@ Currently, we support the data format scheme provided by the [DNS-Collector](htt
 
 For training our models, we rely on the following data sets:
 
-- [CICBellDNS2021]()
+- [CICBellDNS2021](https://www.unb.ca/cic/datasets/dns-2021.html)
 - [DGTA Benchmark](https://data.mendeley.com/datasets/2wzf9bz7xr/1)
 - [DNS Tunneling Queries for Binary Classification](https://data.mendeley.com/datasets/mzn9hvdcxg/1)
 - [UMUDGA - University of Murcia Domain Generation Algorithm Dataset](https://data.mendeley.com/datasets/y8ph45msv8/1)
 - [Majestic Million](https://de.majestic.com/reports/majestic-million)
 
+However, we compute all features separately and rely only on the `domain` and `class` columns.
+Currently, we are only interested in binary classification; thus, the `class` is either `benign` or `malicious`.
+
 ### Exploratory Data Analysis (EDA)
 
 In the folder `./example` we conducted an Exploratory Data Analysis (EDA) to verify the features of interest for our application.
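As a quick illustration of that binary setup (the file name and the 0/1 encoding are assumptions for illustration, not part of this commit), the `class` column can be turned into a binary target with polars:

```python
import polars as pl

# Hypothetical combined dataset with `query` and `class` columns.
df = pl.read_csv("data/domains.csv")

# Map the two classes onto a binary label: benign -> 0, malicious -> 1.
df = df.with_columns(
    (pl.col("class") == "malicious").cast(pl.Int8).alias("label")
)
```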
214 changes: 97 additions & 117 deletions example/DGA_dgta_EDA.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions heidgaf/cli.py
@@ -5,7 +5,8 @@
 
 from heidgaf import CONTEXT_SETTINGS
 from heidgaf.main import DNSAnalyzerPipeline
-from heidgaf.train import DNSAnalyzerTraining, ModelType
+from heidgaf.models.lr import LogisticRegression
+from heidgaf.train import DNSAnalyzerTraining
 from heidgaf.version import __version__
 
 try:
@@ -45,7 +46,7 @@ def training_model():
 
 @training_model.command(name="start")
 def training_start():
-    trainer = DNSAnalyzerTraining(model=ModelType.LOGISTIC_REGRESSION)
+    trainer = DNSAnalyzerTraining(model=LogisticRegression(input_dim=9, output_dim=1, epochs=5000))
     trainer.train()
 
 @cli.group(name="process", context_settings={"show_default": True})
108 changes: 92 additions & 16 deletions heidgaf/dataset/__init__.py
@@ -1,33 +1,109 @@
-import string
-from dataclasses import dataclass, field
-from typing import Callable, Tuple
+import logging
+from dataclasses import dataclass
+from typing import Any, Callable, List
 
 import polars as pl
 import torch
+import sklearn.model_selection
 from torch.utils.data.dataset import Dataset
 
 
+def cast_cic(data_path: List[str]):
+    dataframes = []
+    for data in data_path:
+        # The class label is encoded in the file name, e.g. "..._benign.csv" -> "benign".
+        y = data.split("_")[-1].split(".")[0]
+        df = pl.read_csv(data, has_header=False)
+        df = df.with_columns(
+            [
+                pl.lit(y).alias("class")
+            ]
+        )
+        df = df.rename(
+            {
+                "column_1": "query"
+            }
+        )
+        dataframes.append(df)
+    return pl.concat(dataframes)
+
+def cast_dgta(data_path: str) -> pl.DataFrame:
+    def __custom_decode(data):
+        # Re-encode latin-1 bytes as UTF-8 strings so polars can handle the queries.
+        retL = [None] * len(data)
+        for i, datum in enumerate(data):
+            retL[i] = str(datum.decode('latin-1').encode('utf-8').decode('utf-8'))
+
+        return pl.Series(retL)
+
+    df = pl.read_parquet(data_path)
+    df = df.rename({"domain": "query"})
+    # Drop unnecessary column
+    df = df.drop("__index_level_0__")
+    df = df.with_columns(
+        [
+            pl.col('query').map(__custom_decode)
+        ]
+    )
+    return df
+
 @dataclass
 class Dataset:
-    train_path: str
-    val_path: str
-    test_path: str
-    cast_dataset: Callable
-    binary: bool = field(default=True)
+    def __init__(self, data_path: Any, cast_dataset: Callable = None) -> None:
+        if cast_dataset is not None:
+            self.data = cast_dataset(data_path)
+        else:
+            self.data = pl.read_csv(data_path)
+
+        logging.info(self.data)
+
+        self.X_train, self.X_val, self.X_test, self.Y_train, self.Y_val, self.Y_test = self.__train_test_val_split()
+
+    def __len__(self):
+        return len(self.data)
+
+    def __train_test_val_split(self, train_frac=0.8, random_state=None):
+        # 80% train; the remaining 20% is split in half into validation and test.
+        X_train, X_tmp, Y_train, Y_tmp = sklearn.model_selection.train_test_split(
+            self.data.drop("class"),
+            self.data.select("class"),
+            train_size=train_frac,
+            random_state=random_state
+        )
+
+        X_val, X_test, Y_val, Y_test = sklearn.model_selection.train_test_split(
+            X_tmp,
+            Y_tmp,
+            train_size=0.5,
+            random_state=random_state
+        )
+
+        return X_train, X_val, X_test, Y_train, Y_val, Y_test
 
     @property
     def train(self):
         return {
-            "train_path": self.train_path,
-            "val_path": self.val_path,
-            "cast_dataset": self.cast_dataset,
-            "binary": self.binary,
+            "X": self.X_train,
+            "Y": self.Y_train
         }
 
     @property
     def test(self):
         return {
-            "test_path": self.test_path,
-            "cast_dataset": self.cast_dataset,
-            "binary": self.binary,
+            "X": self.X_test,
+            "Y": self.Y_test
         }
 
+    @property
+    def val(self):
+        return {
+            "X": self.X_val,
+            "Y": self.Y_val
+        }
+
+dgta_dataset = Dataset(
+    data_path="/home/smachmeier/projects/heiDGA/data/dgta/dgta-benchmark.parquet",
+    cast_dataset=cast_dgta
+)
+
+cic_dataset = Dataset(
+    data_path=["/home/smachmeier/projects/heiDGA/example/CICBellDNS2021_CSV_benign.csv"],
+    cast_dataset=cast_cic
+)
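A minimal usage sketch of the new `Dataset` wrapper (paths are hypothetical; the split in `__train_test_val_split` holds out 80% for training and halves the rest into validation and test):

```python
from heidgaf.dataset import Dataset, cast_cic, cast_dgta

# CIC: the class label is parsed from the file name, e.g. "..._benign.csv" -> "benign".
cic = Dataset(
    data_path=["data/CICBellDNS2021_CSV_benign.csv"],
    cast_dataset=cast_cic,
)

# DGTA: parquet input; cast_dgta re-encodes the `query` column as valid UTF-8.
dgta = Dataset(data_path="data/dgta-benchmark.parquet", cast_dataset=cast_dgta)

X_train, Y_train = dgta.train["X"], dgta.train["Y"]  # 80%
X_val, Y_val = dgta.val["X"], dgta.val["Y"]          # 10%
X_test, Y_test = dgta.test["X"], dgta.test["Y"]      # 10%
```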

10 changes: 0 additions & 10 deletions heidgaf/dataset/cic.py

This file was deleted.

13 changes: 0 additions & 13 deletions heidgaf/dataset/dgta.py

This file was deleted.

14 changes: 0 additions & 14 deletions heidgaf/dataset/heicloud.py

This file was deleted.

11 changes: 7 additions & 4 deletions heidgaf/dataset/majestic.py
@@ -1,4 +1,5 @@
 import os
+import string
 
 import polars as pl
 from torch.utils.data.dataset import Dataset
@@ -7,12 +8,14 @@
 
 
 class MajesticMillionDataset(Dataset):
-    def __init__(self, csv_file, redis_cache: DataFrameRedisCache) -> None:
+    def __init__(self, csv_file: str = "/home/smachmeier/projects/heiDGA/data/majestic_million/majestic_million.csv") -> None:
         self.data = pl.read_csv(csv_file)
-        self.redis_cache = redis_cache
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.data)
 
-    def __getitem__(self, idx: int) -> any:
+    def __getitem__(self, idx: int) -> pl.DataFrame:
         return self.data[idx, 0]
 
+    def __call__(self, name: str, key: str) -> pl.DataFrame:
+        return self.data.filter(pl.col(key) == name)
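A short usage sketch of the reworked dataset (the path and the "Domain" column name are assumptions about the Majestic Million CSV, not shown in this diff):

```python
from heidgaf.dataset.majestic import MajesticMillionDataset

ds = MajesticMillionDataset(csv_file="data/majestic_million.csv")

print(len(ds))   # number of ranked domains
print(ds[0])     # first row, first column, via __getitem__
# __call__ filters rows where the given column equals the given value:
hits = ds("example.com", key="Domain")
```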
8 changes: 4 additions & 4 deletions heidgaf/models/__init__.py
@@ -18,13 +18,13 @@ def __init__(self,
 
     def fit(self, x_train, y_train):
         x_train = self.preprocessor.transform(x=x_train)
-        x_train = self.target_encoder.fit_transform(x=x_train, y=y_train)
-        x_train = self.mean_imputer.fit_transform(x=x_train)
+        # x_train = self.target_encoder.fit_transform(x=x_train, y=y_train)
+        # x_train = self.mean_imputer.fit_transform(x=x_train)
         self.clf.fit(x=x_train.to_numpy(), y=y_train)
 
     def predict(self, x):
         x = self.preprocessor.transform(x=x)
-        x = self.target_encoder.transform(x=x)
-        x = self.mean_imputer.transform(x=x)
+        # x = self.target_encoder.transform(x=x)
+        # x = self.mean_imputer.transform(x=x)
         return self.clf.predict(x=x.to_numpy())
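With the target-encoding and imputation steps commented out, `fit` reduces to preprocess-then-fit and `predict` to preprocess-then-predict. A sketch of driving the `Pipeline`, mirroring the arguments used in `heidgaf/train.py` below (`X_train`/`Y_train` as produced by the `Dataset` splits):

```python
from fe_polars.encoding.target_encoding import TargetEncoder
from fe_polars.imputing.base_imputing import Imputer

from heidgaf.models import Pipeline
from heidgaf.models.lr import LogisticRegression
from heidgaf.post.feature import Preprocessor

pipeline = Pipeline(
    preprocessor=Preprocessor(features_to_drop=["query"]),
    mean_imputer=Imputer(features_to_impute=["FQDN_full_count"], strategy="mean"),
    target_encoder=TargetEncoder(smoothing=100, features_to_encode=[]),
    clf=LogisticRegression(input_dim=9, output_dim=1, epochs=5000),
)
pipeline.fit(x_train=X_train, y_train=Y_train)
y_pred = pipeline.predict(X_test)
```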

2 changes: 1 addition & 1 deletion heidgaf/post/feature.py
@@ -16,7 +16,7 @@ def __init__(self, features_to_drop: List):
         """
         self.features_to_drop = features_to_drop
         # TODO Set majestic million score
-        # self.majesticmillion = MajesticMillionDataset()
+        self.majesticmillion = MajesticMillionDataset()
 
     def transform(self, x: pl.DataFrame) -> pl.DataFrame:
         """Transform our dataset with new features
9 changes: 7 additions & 2 deletions heidgaf/pre/__init__.py
@@ -1,4 +1,5 @@
 from abc import ABCMeta, abstractmethod
+from typing import Any
 
 import polars as pl
 
@@ -13,5 +14,9 @@ def __init__(self) -> None:
     @classmethod
     @abstractmethod
     def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
-        # Filter data with no errors
-        df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)
+        pass
+
+    @classmethod
+    @abstractmethod
+    def set_warning(self, data: Any, redis_cache: DataFrameRedisCache):
+        pass
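The contract above (a `run` hook plus the new `set_warning` hook) is what the concrete analyzers below implement. A minimal subclass sketch (the base-class name `Analyzer` and the filter are assumptions for illustration):

```python
import polars as pl

from heidgaf.cache import DataFrameRedisCache
from heidgaf.pre import Analyzer  # assumed name of the abstract base in this module


class ErrorAnalyzer(Analyzer):
    @classmethod
    def run(cls, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
        # Keep only queries that did not resolve cleanly (sketch only).
        return data.filter(pl.col("return_code") != "NOERROR")

    @classmethod
    def set_warning(cls, data, redis_cache: DataFrameRedisCache):
        # Record a warning for suspicious clients (sketch only).
        pass
```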
2 changes: 1 addition & 1 deletion heidgaf/pre/domain_analyzer.py
@@ -15,4 +15,4 @@ def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
 
         # Filter data with no errors
         df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)
-
+
4 changes: 4 additions & 0 deletions heidgaf/pre/ip_analyzer.py
@@ -28,12 +28,16 @@ def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
         self.__update_count(df, "SLD", self.KEY_SLD, redis_cache)
 
         # TODO: Process frequency and return values
+
+        # TODO: Check if IP has more than threshold error request -> if yes, check distribution.
 
 
 def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRedisCache) -> None:
     frequency = df.group_by(id).count()
 
+    # TODO Dividing highest and lowest timestamp
+    df.select(pl.col("timestamp").max() - pl.col("timestamp").min())
+
     # Check if dns_server_frequency exists in redis cache
     if key in redis_cache:
         frequency = pl.concat([redis_cache[key], frequency]).groupby(id).agg(pl.sum('count'))
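The cache merge in `__update_count` follows a simple pattern: compute fresh per-key counts, then sum them with whatever is already cached. A standalone polars sketch of that merge (toy data; `cached` stands in for `redis_cache[key]`):

```python
import polars as pl

new_counts = pl.DataFrame(
    {"client_ip": ["10.0.0.1", "10.0.0.2", "10.0.0.1"]}
).group_by("client_ip").count()

cached = pl.DataFrame(
    {"client_ip": ["10.0.0.1"], "count": [5]},
    schema={"client_ip": pl.Utf8, "count": pl.UInt32},  # match the count dtype for concat
)

# Sum cached and fresh counts per key, as __update_count does.
merged = pl.concat([cached, new_counts]).group_by("client_ip").agg(pl.sum("count"))
```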
32 changes: 7 additions & 25 deletions heidgaf/train.py
@@ -5,23 +5,16 @@
 
 import torch
 from fe_polars.encoding.target_encoding import TargetEncoder
 from fe_polars.imputing.base_imputing import Imputer
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split
-from torch.utils.data import DataLoader
 
+from heidgaf import dataset
 from heidgaf.cache import DataFrameRedisCache
-from heidgaf.dataset.dgta import DGTA
-from heidgaf.dataset.majestic import MajesticMillionDataset
 from heidgaf.models import Pipeline
 from heidgaf.models.lr import LogisticRegression
 from heidgaf.post.feature import Preprocessor
 
 
-class ModelType(Enum):
-    LOGISTIC_REGRESSION = LogisticRegression
-
 class DNSAnalyzerTraining:
-    def __init__(self, model: ModelType, redis_host="localhost", redis_port=6379, redis_db=0, redis_max_connections=20) -> None:
+    def __init__(self, model: torch.nn.Module, redis_host="localhost", redis_port=6379, redis_db=0, redis_max_connections=20) -> None:
         self.redis_cache = DataFrameRedisCache(redis_host, redis_port, redis_db, redis_max_connections)
 
     def train(self, seed=42):
@@ -43,32 +36,21 @@ def train(self, seed=42):
         logging.info(f"\tAllocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB")
         logging.info(f"\tCached: {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB")
 
-        # TODO Load data set
-        logging.info(f'Loading data sets')
-        majestic_dataset = MajesticMillionDataset(self.redis_cache)
-        dgta_dataset = DGTA()
-
-        # TODO Handle Data loader
-        train_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
-        test_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
-        # train_features, train_labels = next(iter(train_dataloader))
-
         # Training model
         model_pipeline = Pipeline(
             preprocessor=Preprocessor(
-                features_to_drop=[]),
+                features_to_drop=["query"]),
             mean_imputer=Imputer(
-                features_to_impute=["", ""], strategy="mean"),
+                features_to_impute=["FQDN_full_count"], strategy="mean"),
             target_encoder=TargetEncoder(
                 smoothing=100,
-                features_to_encode=["", "", "", "", "",""]),
+                features_to_encode=[]),
             clf=LogisticRegression(input_dim=9, output_dim=1, epochs=5000)
         )
 
-        # train, target, test = data_loader()
-        # x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.33, random_state=seed)
-
-        # model_pipeline.fit(x_train=x_train, y_train=y_train)
-
+        model_pipeline.fit(x_train=dataset.dgta_dataset.X_train, y_train=dataset.dgta_dataset.Y_train)
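End to end, this is the same flow the CLI now triggers (assuming a Redis instance is reachable on localhost, which the constructor connects to by default):

```python
from heidgaf.models.lr import LogisticRegression
from heidgaf.train import DNSAnalyzerTraining

trainer = DNSAnalyzerTraining(
    model=LogisticRegression(input_dim=9, output_dim=1, epochs=5000)
)
trainer.train()
```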



