From ae136669fdc51e2f08e7c78d9bd5a568703ccfc3 Mon Sep 17 00:00:00 2001 From: Stefan Machmeier Date: Mon, 25 Mar 2024 16:34:14 +0100 Subject: [PATCH] Current state --- README.md | 25 ++++++++++++++++++++++++- docs/source/conf.py | 16 ++++++++++++---- heidgaf/cli.py | 6 +++--- heidgaf/main.py | 15 +++++++++++---- heidgaf/models/__init__.py | 2 -- heidgaf/post/feature.py | 3 ++- heidgaf/pre/__init__.py | 4 +++- heidgaf/pre/domain_analyzer.py | 6 +++++- heidgaf/pre/ip_analyzer.py | 9 +++++++-- heidgaf/pre/noerror_analyzer.py | 16 ++++++++++++++++ 10 files changed, 83 insertions(+), 19 deletions(-) create mode 100644 heidgaf/pre/noerror_analyzer.py diff --git a/README.md b/README.md index d841a65..ea83407 100644 --- a/README.md +++ b/README.md @@ -88,10 +88,33 @@ In the folder `./example` we conducted a Exploratory Data Analysis (EDA) to veri ## Literature -Based on the following work we implement heiDGAF to find malicious behaviour in DNS request. +Based on the following work, we implement heiDGAF to find malicious behaviour such as tunneling or data exfiltration in DNS requests. - EXPOSURE: Finding Malicious Domains Using Passive DNS Analysis A passiv DNS pipeline for finding malicious domains using J48 decision tree algorithm. - Real-Time Detection System for Data Exfiltration over DNS Tunneling Using Machine Learning + + Proposes a hybrid DNS tunneling detection system using Tabu-PIO for feature selection. + +- Classifying Malicious Domains using DNS Traffic Analysis + + +- [DeepDGA](https://github.com/roreagan/DeepDGA): Adversarially-Tuned Domain Generation and Detection + + DeepDGA detects (and generates) domains on a per-domain basis, which provides a simple and flexible means to detect known DGA families. It uses GANs to bypass detectors and shows the effectiveness of such solutions. 
+ +- Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection + +- SHAP Interpretations of Tree and Neural Network DNS Classifiers for Analyzing DGA Family Characteristics + + + +### Similar Projects + +- [Deep Lookup](https://github.com/ybubnov/deep-lookup/) is a deep learning approach for DNS. +- [DGA Detective](https://github.com/COSSAS/dgad) +- https://github.com/Erxathos/DGA-Detector +- https://github.com/gfek/Real-CyberSecurity-Datasets/ +- https://github.com/aasthac67/DNS-Tunneling-Detection/ diff --git a/docs/source/conf.py b/docs/source/conf.py index 10a932e..562ec04 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,3 +1,6 @@ +import os +import sys + # Configuration file for the Sphinx documentation builder. # -- Project information @@ -6,8 +9,11 @@ copyright = '2024, Stefan Machmeier' author = 'Stefan Machmeier' -release = '0.1' -version = '0.1.0' +exec(open('../../heidgaf/version.py').read()) + +version = __version__ +# The full version, including alpha/beta/rc tags +release = __version__ # -- Path setup -------------------------------------------------------------- @@ -15,8 +21,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -import os -import sys + sys.path.insert(0, os.path.abspath('../..')) @@ -40,6 +45,9 @@ 'sphinx_design', ] +# -- nbsphinx settings ------------------------------------------------------- +nbsphinx_execute = "auto" + # -- apidoc settings --------------------------------------------------------- apidoc_module_dir = '../../heidgaf' apidoc_output_dir = 'api' diff --git a/heidgaf/cli.py b/heidgaf/cli.py index 933324d..41023f1 100644 --- a/heidgaf/cli.py +++ b/heidgaf/cli.py @@ -5,7 +5,7 @@ from heidgaf import CONTEXT_SETTINGS from heidgaf.main import DNSAnalyzerPipeline -from heidgaf.train import train +from heidgaf.train import DNSAnalyzerTraining, ModelType from heidgaf.version import __version__ try: @@ -45,8 +45,8 @@ def training_model(): @training_model.command(name="start") def training_start(): - train() - + trainer = DNSAnalyzerTraining(model=ModelType.LOGISTIC_REGRESSION) + trainer.train() @cli.group(name="process", context_settings={"show_default": True}) def training_model(): diff --git a/heidgaf/main.py b/heidgaf/main.py index f1a0680..7347407 100644 --- a/heidgaf/main.py +++ b/heidgaf/main.py @@ -7,7 +7,10 @@ from click import Path from heidgaf.cache import DataFrameRedisCache +from heidgaf.post.feature import Preprocessor +from heidgaf.pre.domain_analyzer import DomainAnalyzer from heidgaf.pre.ip_analyzer import IPAnalyzer +from heidgaf.pre.time_analyer import TimeAnalyzer class FileType(Enum): @@ -32,13 +35,12 @@ def __init__(self, path: Path, redis_host="localhost", redis_port=6379, redis_db logging.debug(f"Processing files: {path}/*.{filetype.value}") self.data = self.load_data(f'{path}/*.{filetype.value}', separator.value) - self.redis_cache["data"] = self.data + # self.redis_cache["data"] = self.data def load_data(self, path, separator): dataframes = pl.read_csv(path, separator=separator, try_parse_dates=False, has_header=False).with_columns( [ - (pl.col('column_1').str.strptime(pl.Datetime).cast(pl.Datetime).alias("timestamp")), - 
(pl.col('column_2').alias("return_code")) + (pl.col('column_1').str.strptime(pl.Datetime).cast(pl.Datetime)) ] ) @@ -59,5 +61,10 @@ def load_data(self, path, separator): def run(self): # Running modules to analyze log files + # TODO Multithreading + preprocessor = Preprocessor(features_to_drop=[]) + processed_data = preprocessor.transform(self.data) - IPAnalyzer.run(self.data, self.redis_cache) + IPAnalyzer.run(processed_data, self.redis_cache) + DomainAnalyzer.run(processed_data, self.redis_cache) + TimeAnalyzer.run(processed_data, self.redis_cache) diff --git a/heidgaf/models/__init__.py b/heidgaf/models/__init__.py index b06f957..a89b44d 100644 --- a/heidgaf/models/__init__.py +++ b/heidgaf/models/__init__.py @@ -6,8 +6,6 @@ class Pipeline(): - """Pipeline runner for training of models - """ def __init__(self, preprocessor: Preprocessor, mean_imputer: Imputer, diff --git a/heidgaf/post/feature.py b/heidgaf/post/feature.py index 53beb93..d3c7e8e 100644 --- a/heidgaf/post/feature.py +++ b/heidgaf/post/feature.py @@ -15,7 +15,8 @@ def __init__(self, features_to_drop: List): feature_to_drop (list): list of feature to drop """ self.features_to_drop = features_to_drop - self.majesticmillion = MajesticMillionDataset() + # TODO Set majestic million score + # self.majesticmillion = MajesticMillionDataset() def transform(self, x: pl.DataFrame) -> pl.DataFrame: """Transform our dataset with new features diff --git a/heidgaf/pre/__init__.py b/heidgaf/pre/__init__.py index 44d7754..ce552df 100644 --- a/heidgaf/pre/__init__.py +++ b/heidgaf/pre/__init__.py @@ -2,6 +2,7 @@ import polars as pl +from heidgaf import ReturnCode from heidgaf.cache import DataFrameRedisCache @@ -12,4 +13,5 @@ def __init__(self) -> None: @classmethod @abstractmethod def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache): - pass \ No newline at end of file + # Filter data with no errors + df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != 
ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1) \ No newline at end of file diff --git a/heidgaf/pre/domain_analyzer.py b/heidgaf/pre/domain_analyzer.py index fe7a017..c59ad8d 100644 --- a/heidgaf/pre/domain_analyzer.py +++ b/heidgaf/pre/domain_analyzer.py @@ -1,6 +1,7 @@ import polars as pl import redis +from heidgaf import ReturnCode from heidgaf.cache import DataFrameRedisCache from heidgaf.pre import Analyzer @@ -11,4 +12,7 @@ def __init__(self) -> None: @classmethod def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache): - pass \ No newline at end of file + # Keep rows whose query is not "|" and contains at least one dot, and whose return code is not NOERROR + df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1) + + diff --git a/heidgaf/pre/ip_analyzer.py b/heidgaf/pre/ip_analyzer.py index eae1292..794ae4d 100644 --- a/heidgaf/pre/ip_analyzer.py +++ b/heidgaf/pre/ip_analyzer.py @@ -10,6 +10,7 @@ class IPAnalyzer(Analyzer): KEY_IP_FREQUENCY = "client_ip_frequency" KEY_DNS_SERVER = "dns_server_frequency" + KEY_SLD = "sld_frequency" def __init__(self) -> None: super().__init__() @@ -24,6 +25,12 @@ def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache): self.__update_count(df, "client_ip", self.KEY_IP_FREQUENCY, redis_cache) self.__update_count(df, "dns_server", self.KEY_DNS_SERVER, redis_cache) + self.__update_count(df, "SLD", self.KEY_SLD, redis_cache) + + # TODO: Process frequency and return values + # TODO: Check whether an IP exceeds the threshold of error requests; if so, check the distribution. 
+ + def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRedisCache) -> None: frequency = df.group_by(id).count() @@ -35,6 +42,4 @@ def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRe # Store information in redis client redis_cache[key] = frequency - # TODO: Process frequency and return values - \ No newline at end of file diff --git a/heidgaf/pre/noerror_analyzer.py b/heidgaf/pre/noerror_analyzer.py new file mode 100644 index 0000000..1e667f5 --- /dev/null +++ b/heidgaf/pre/noerror_analyzer.py @@ -0,0 +1,16 @@ +import polars as pl +import redis + +from heidgaf import ReturnCode +from heidgaf.cache import DataFrameRedisCache +from heidgaf.pre import Analyzer + + +class NoErrorAnalyzer(Analyzer): + def __init__(self) -> None: + super().__init__() + + @classmethod + def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache): + # Keep rows whose query is not "|" and contains at least one dot, and whose return code is not NOERROR (NOTE(review): the class name suggests NOERROR rows were intended - confirm the comparison) + df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)