Skip to content

Commit

Permalink
Current state
Browse files Browse the repository at this point in the history
  • Loading branch information
stefanDeveloper committed Mar 25, 2024
1 parent f83fb63 commit ae13666
Show file tree
Hide file tree
Showing 10 changed files with 83 additions and 19 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,33 @@ In the folder `./example` we conducted a Exploratory Data Analysis (EDA) to veri

## Literature

Based on the following work we implement heiDGAF to find malicious behaviour in DNS request.
Based on the following work, we implement heiDGAF to find malicious behaviour such as tunneling or data exfiltration in DNS requests.

- EXPOSURE: Finding Malicious Domains Using Passive DNS Analysis

A passive DNS pipeline for finding malicious domains using the J48 decision tree algorithm.

- Real-Time Detection System for Data Exfiltration over DNS Tunneling Using Machine Learning

Proposes a hybrid DNS tunneling detection system using Tabu-PIO for feature selection.

- Classifying Malicious Domains using DNS Traffic Analysis


- [DeepDGA](https://github.com/roreagan/DeepDGA): Adversarially-Tuned Domain Generation and Detection

DeepDGA detects (and generates) domains on a per-domain basis, which provides a simple and flexible means to detect known DGA families. It uses GANs to bypass detectors and shows the effectiveness of such solutions.

- Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection

- SHAP Interpretations of Tree and Neural Network DNS Classifiers for Analyzing DGA Family Characteristics



### Similar Projects

- [Deep Lookup](https://github.com/ybubnov/deep-lookup/) is a deep learning approach for DNS.
- [DGA Detective](https://github.com/COSSAS/dgad)
- https://github.com/Erxathos/DGA-Detector
- https://github.com/gfek/Real-CyberSecurity-Datasets/
- https://github.com/aasthac67/DNS-Tunneling-Detection/
16 changes: 12 additions & 4 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys

# Configuration file for the Sphinx documentation builder.

# -- Project information
Expand All @@ -6,17 +9,19 @@
copyright = '2024, Stefan Machmeier'
author = 'Stefan Machmeier'

release = '0.1'
version = '0.1.0'
exec(open('../../heidgaf/version.py').read())

version = __version__
# The full version, including alpha/beta/rc tags
release = __version__

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys


sys.path.insert(0, os.path.abspath('../..'))

Expand All @@ -40,6 +45,9 @@
'sphinx_design',
]

# -- nbsphinx settings -------------------------------------------------------
nbsphinx_execute = "auto"

# -- apidoc settings ---------------------------------------------------------
apidoc_module_dir = '../../heidgaf'
apidoc_output_dir = 'api'
Expand Down
6 changes: 3 additions & 3 deletions heidgaf/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from heidgaf import CONTEXT_SETTINGS
from heidgaf.main import DNSAnalyzerPipeline
from heidgaf.train import train
from heidgaf.train import DNSAnalyzerTraining, ModelType
from heidgaf.version import __version__

try:
Expand Down Expand Up @@ -45,8 +45,8 @@ def training_model():

@training_model.command(name="start")
def training_start():
train()

trainer = DNSAnalyzerTraining(model=ModelType.LOGISTIC_REGRESSION)
trainer.train()

@cli.group(name="process", context_settings={"show_default": True})
def training_model():
Expand Down
15 changes: 11 additions & 4 deletions heidgaf/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
from click import Path

from heidgaf.cache import DataFrameRedisCache
from heidgaf.post.feature import Preprocessor
from heidgaf.pre.domain_analyzer import DomainAnalyzer
from heidgaf.pre.ip_analyzer import IPAnalyzer
from heidgaf.pre.time_analyer import TimeAnalyzer


class FileType(Enum):
Expand All @@ -32,13 +35,12 @@ def __init__(self, path: Path, redis_host="localhost", redis_port=6379, redis_db
logging.debug(f"Processing files: {path}/*.{filetype.value}")
self.data = self.load_data(f'{path}/*.{filetype.value}', separator.value)

self.redis_cache["data"] = self.data
# self.redis_cache["data"] = self.data

def load_data(self, path, separator):
dataframes = pl.read_csv(path, separator=separator, try_parse_dates=False, has_header=False).with_columns(
[
(pl.col('column_1').str.strptime(pl.Datetime).cast(pl.Datetime).alias("timestamp")),
(pl.col('column_2').alias("return_code"))
(pl.col('column_1').str.strptime(pl.Datetime).cast(pl.Datetime))
]
)

Expand All @@ -59,5 +61,10 @@ def load_data(self, path, separator):

def run(self):
# Running modules to analyze log files
# TODO Multithreading
preprocessor = Preprocessor(features_to_drop=[])
processed_data = preprocessor.transform(self.data)

IPAnalyzer.run(self.data, self.redis_cache)
IPAnalyzer.run(processed_data, self.redis_cache)
DomainAnalyzer.run(processed_data, self.redis_cache)
TimeAnalyzer.run(processed_data, self.redis_cache)
2 changes: 0 additions & 2 deletions heidgaf/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@


class Pipeline():
"""Pipeline runner for training of models
"""
def __init__(self,
preprocessor: Preprocessor,
mean_imputer: Imputer,
Expand Down
3 changes: 2 additions & 1 deletion heidgaf/post/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def __init__(self, features_to_drop: List):
feature_to_drop (list): list of feature to drop
"""
self.features_to_drop = features_to_drop
self.majesticmillion = MajesticMillionDataset()
# TODO Set majestic million score
# self.majesticmillion = MajesticMillionDataset()

def transform(self, x: pl.DataFrame) -> pl.DataFrame:
"""Transform our dataset with new features
Expand Down
4 changes: 3 additions & 1 deletion heidgaf/pre/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import polars as pl

from heidgaf import ReturnCode
from heidgaf.cache import DataFrameRedisCache


Expand All @@ -12,4 +13,5 @@ def __init__(self) -> None:
@classmethod
@abstractmethod
def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
pass
# Filter data with no errors
df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)
6 changes: 5 additions & 1 deletion heidgaf/pre/domain_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import polars as pl
import redis

from heidgaf import ReturnCode
from heidgaf.cache import DataFrameRedisCache
from heidgaf.pre import Analyzer

Expand All @@ -11,4 +12,7 @@ def __init__(self) -> None:

@classmethod
def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
pass
# Filter data with no errors
df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)


9 changes: 7 additions & 2 deletions heidgaf/pre/ip_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
class IPAnalyzer(Analyzer):
KEY_IP_FREQUENCY = "client_ip_frequency"
KEY_DNS_SERVER = "dns_server_frequency"
KEY_SLD = "sld_frequency"

def __init__(self) -> None:
super().__init__()
Expand All @@ -24,6 +25,12 @@ def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
self.__update_count(df, "client_ip", self.KEY_IP_FREQUENCY, redis_cache)
self.__update_count(df, "dns_server", self.KEY_DNS_SERVER, redis_cache)

self.__update_count(df, "SLD", self.KEY_SLD, redis_cache)

# TODO: Process frequency and return values
# TODO: Check if IP has more than threshold error request -> if yes, check distribution.


def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRedisCache) -> None:
frequency = df.group_by(id).count()

Expand All @@ -35,6 +42,4 @@ def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRe
# Store information in redis client
redis_cache[key] = frequency

# TODO: Process frequency and return values


16 changes: 16 additions & 0 deletions heidgaf/pre/noerror_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import polars as pl
import redis

from heidgaf import ReturnCode
from heidgaf.cache import DataFrameRedisCache
from heidgaf.pre import Analyzer


class NoErrorAnalyzer(Analyzer):
    """Pre-filter analyzer that isolates failed / suspicious DNS queries.

    Keeps only rows that are worth deeper inspection by dropping:
      * placeholder queries (``query == "|"``),
      * successful lookups (``return_code == ReturnCode.NOERROR.value``),
      * single-label queries (no ``.``-separated structure to analyze).
    """

    def __init__(self) -> None:
        super().__init__()

    @classmethod
    def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache) -> pl.DataFrame:
        """Filter *data* down to erroneous, structured queries and return them.

        Args:
            data: Polars frame with at least ``query`` and ``return_code`` columns.
            redis_cache: Shared cache; currently unused here (sibling analyzers
                use it to persist frequency tables — presumably reserved for
                the same purpose, TODO confirm).

        Returns:
            The filtered ``pl.DataFrame``.
        """
        # NOTE: first parameter is named `self` but this is a @classmethod,
        # so it actually receives the class; kept as-is to match the
        # Analyzer base-class signature.
        df = (
            data
            # Drop placeholder/empty queries.
            .filter(pl.col("query") != "|")
            # Drop successful lookups; only error responses are of interest.
            .filter(pl.col("return_code") != ReturnCode.NOERROR.value)
            # Drop bare single-label names (e.g. "localhost").
            .filter(pl.col("query").str.split(".").list.len() != 1)
        )
        # BUG FIX: the original computed `df` and discarded it, making the
        # analyzer a no-op. Return the filtered frame so callers can use it.
        return df

0 comments on commit ae13666

Please sign in to comment.