Skip to content

Commit

Permalink
Current state
Browse files Browse the repository at this point in the history
  • Loading branch information
stefanDeveloper committed Mar 25, 2024
1 parent f83fb63 commit ae13666
Show file tree
Hide file tree
Showing 10 changed files with 83 additions and 19 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,33 @@ In the folder `./example` we conducted a Exploratory Data Analysis (EDA) to veri

## Literature

Based on the following work we implement heiDGAF to find malicious behaviour in DNS request.
Based on the following work, we implement heiDGAF to find malicious behaviour such as tunneling or data exfiltration in DNS requests.

- EXPOSURE: Finding Malicious Domains Using Passive DNS Analysis

A passive DNS pipeline for finding malicious domains using the J48 decision tree algorithm.

- Real-Time Detection System for Data Exfiltration over DNS Tunneling Using Machine Learning

Proposes a hybrid DNS tunneling detection system using Tabu-PIO for feature selection.

- Classifying Malicious Domains using DNS Traffic Analysis


- [DeepDGA](https://github.com/roreagan/DeepDGA): Adversarially-Tuned Domain Generation and Detection

DeepDGA detects (and generates) domains on a per-domain basis, which provides a simple and flexible means to detect known DGA families. It uses GANs to bypass detectors and shows the effectiveness of such solutions.

- Kitsune: An Ensemble of Autoencoders for Online Network Intrusion Detection

- SHAP Interpretations of Tree and Neural Network DNS Classifiers for Analyzing DGA Family Characteristics



### Similar Projects

- [Deep Lookup](https://github.com/ybubnov/deep-lookup/) is a deep learning approach for DNS.
- [DGA Detective](https://github.com/COSSAS/dgad)
- https://github.com/Erxathos/DGA-Detector
- https://github.com/gfek/Real-CyberSecurity-Datasets/
- https://github.com/aasthac67/DNS-Tunneling-Detection/
16 changes: 12 additions & 4 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys

# Configuration file for the Sphinx documentation builder.

# -- Project information
Expand All @@ -6,17 +9,19 @@
copyright = '2024, Stefan Machmeier'
author = 'Stefan Machmeier'

release = '0.1'
version = '0.1.0'
exec(open('../../heidgaf/version.py').read())

version = __version__
# The full version, including alpha/beta/rc tags
release = __version__

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys


sys.path.insert(0, os.path.abspath('../..'))

Expand All @@ -40,6 +45,9 @@
'sphinx_design',
]

# -- nbsphinx settings -------------------------------------------------------
nbsphinx_execute = "auto"

# -- apidoc settings ---------------------------------------------------------
apidoc_module_dir = '../../heidgaf'
apidoc_output_dir = 'api'
Expand Down
6 changes: 3 additions & 3 deletions heidgaf/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from heidgaf import CONTEXT_SETTINGS
from heidgaf.main import DNSAnalyzerPipeline
from heidgaf.train import train
from heidgaf.train import DNSAnalyzerTraining, ModelType
from heidgaf.version import __version__

try:
Expand Down Expand Up @@ -45,8 +45,8 @@ def training_model():

@training_model.command(name="start")
def training_start():
train()

trainer = DNSAnalyzerTraining(model=ModelType.LOGISTIC_REGRESSION)
trainer.train()

@cli.group(name="process", context_settings={"show_default": True})
def training_model():
Expand Down
15 changes: 11 additions & 4 deletions heidgaf/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
from click import Path

from heidgaf.cache import DataFrameRedisCache
from heidgaf.post.feature import Preprocessor
from heidgaf.pre.domain_analyzer import DomainAnalyzer
from heidgaf.pre.ip_analyzer import IPAnalyzer
from heidgaf.pre.time_analyer import TimeAnalyzer


class FileType(Enum):
Expand All @@ -32,13 +35,12 @@ def __init__(self, path: Path, redis_host="localhost", redis_port=6379, redis_db
logging.debug(f"Processing files: {path}/*.{filetype.value}")
self.data = self.load_data(f'{path}/*.{filetype.value}', separator.value)

self.redis_cache["data"] = self.data
# self.redis_cache["data"] = self.data

def load_data(self, path, separator):
dataframes = pl.read_csv(path, separator=separator, try_parse_dates=False, has_header=False).with_columns(
[
(pl.col('column_1').str.strptime(pl.Datetime).cast(pl.Datetime).alias("timestamp")),
(pl.col('column_2').alias("return_code"))
(pl.col('column_1').str.strptime(pl.Datetime).cast(pl.Datetime))
]
)

Expand All @@ -59,5 +61,10 @@ def load_data(self, path, separator):

def run(self):
# Running modules to analyze log files
# TODO Multithreading
preprocessor = Preprocessor(features_to_drop=[])
processed_data = preprocessor.transform(self.data)

IPAnalyzer.run(self.data, self.redis_cache)
IPAnalyzer.run(processed_data, self.redis_cache)
DomainAnalyzer.run(processed_data, self.redis_cache)
TimeAnalyzer.run(processed_data, self.redis_cache)
2 changes: 0 additions & 2 deletions heidgaf/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@


class Pipeline():
"""Pipeline runner for training of models
"""
def __init__(self,
preprocessor: Preprocessor,
mean_imputer: Imputer,
Expand Down
3 changes: 2 additions & 1 deletion heidgaf/post/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def __init__(self, features_to_drop: List):
feature_to_drop (list): list of feature to drop
"""
self.features_to_drop = features_to_drop
self.majesticmillion = MajesticMillionDataset()
# TODO Set majestic million score
# self.majesticmillion = MajesticMillionDataset()

def transform(self, x: pl.DataFrame) -> pl.DataFrame:
"""Transform our dataset with new features
Expand Down
4 changes: 3 additions & 1 deletion heidgaf/pre/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import polars as pl

from heidgaf import ReturnCode
from heidgaf.cache import DataFrameRedisCache


Expand All @@ -12,4 +13,5 @@ def __init__(self) -> None:
@classmethod
@abstractmethod
def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
pass
# Filter data with no errors
df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)
6 changes: 5 additions & 1 deletion heidgaf/pre/domain_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import polars as pl
import redis

from heidgaf import ReturnCode
from heidgaf.cache import DataFrameRedisCache
from heidgaf.pre import Analyzer

Expand All @@ -11,4 +12,7 @@ def __init__(self) -> None:

@classmethod
def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
pass
# Filter data with no errors
df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)


9 changes: 7 additions & 2 deletions heidgaf/pre/ip_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
class IPAnalyzer(Analyzer):
KEY_IP_FREQUENCY = "client_ip_frequency"
KEY_DNS_SERVER = "dns_server_frequency"
KEY_SLD = "sld_frequency"

def __init__(self) -> None:
super().__init__()
Expand All @@ -24,6 +25,12 @@ def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
self.__update_count(df, "client_ip", self.KEY_IP_FREQUENCY, redis_cache)
self.__update_count(df, "dns_server", self.KEY_DNS_SERVER, redis_cache)

self.__update_count(df, "SLD", self.KEY_SLD, redis_cache)

# TODO: Process frequency and return values
# TODO: Check if IP has more than threshold error request -> if yes, check distribution.


def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRedisCache) -> None:
frequency = df.group_by(id).count()

Expand All @@ -35,6 +42,4 @@ def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRe
# Store information in redis client
redis_cache[key] = frequency

# TODO: Process frequency and return values


16 changes: 16 additions & 0 deletions heidgaf/pre/noerror_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import polars as pl
import redis

from heidgaf import ReturnCode
from heidgaf.cache import DataFrameRedisCache
from heidgaf.pre import Analyzer


class NoErrorAnalyzer(Analyzer):
    """Pre-filter analyzer that isolates failed / suspicious DNS queries.

    Keeps only rows that are worth deeper inspection by dropping:
      * placeholder queries (``query == "|"``),
      * successful lookups (``return_code == ReturnCode.NOERROR.value``),
      * single-label queries (no ``.``-separated structure to analyze).
    """

    def __init__(self) -> None:
        super().__init__()

    @classmethod
    def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache) -> pl.DataFrame:
        """Filter *data* down to erroneous, structured queries and return them.

        Args:
            data: Polars frame with at least ``query`` and ``return_code`` columns.
            redis_cache: Shared cache; currently unused here (sibling analyzers
                use it to persist frequency tables — presumably reserved for
                the same purpose, TODO confirm).

        Returns:
            The filtered ``pl.DataFrame``.
        """
        # NOTE: first parameter is named `self` but this is a @classmethod,
        # so it actually receives the class; kept as-is to match the
        # Analyzer base-class signature.
        df = (
            data
            # Drop placeholder/empty queries.
            .filter(pl.col("query") != "|")
            # Drop successful lookups; only error responses are of interest.
            .filter(pl.col("return_code") != ReturnCode.NOERROR.value)
            # Drop bare single-label names (e.g. "localhost").
            .filter(pl.col("query").str.split(".").list.len() != 1)
        )
        # BUG FIX: the original computed `df` and discarded it, making the
        # analyzer a no-op. Return the filtered frame so callers can use it.
        return df

0 comments on commit ae13666

Please sign in to comment.