
Commit b582da5

Add dataset

stefanDeveloper committed Apr 2, 2024
1 parent 13ad76b commit b582da5
Showing 15 changed files with 241 additions and 217 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -76,12 +76,15 @@ Currently, we support the data format scheme provided by the [DNS-Collector](htt
 
 For training our models, we rely on the following data sets:
 
-- [CICBellDNS2021]()
+- [CICBellDNS2021](https://www.unb.ca/cic/datasets/dns-2021.html)
 - [DGTA Benchmark](https://data.mendeley.com/datasets/2wzf9bz7xr/1)
 - [DNS Tunneling Queries for Binary Classification](https://data.mendeley.com/datasets/mzn9hvdcxg/1)
 - [UMUDGA - University of Murcia Domain Generation Algorithm Dataset](https://data.mendeley.com/datasets/y8ph45msv8/1)
 - [Majestic Million](https://de.majestic.com/reports/majestic-million)
 
+However, we compute all features separately and rely only on the `domain` and `class` columns.
+Currently, we are only interested in binary classification; thus, the `class` is either `benign` or `malicious`.
+
 ### Exploratory Data Analysis (EDA)
 
 In the folder `./example` we conducted an Exploratory Data Analysis (EDA) to verify the features of interest for our application.
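As a quick illustration of that binary setup (the file name and the 0/1 encoding are assumptions for illustration, not part of this commit), the `class` column can be turned into a binary target with polars:

```python
import polars as pl

# Hypothetical combined dataset with `query` and `class` columns.
df = pl.read_csv("data/domains.csv")

# Map the two classes onto a binary label: benign -> 0, malicious -> 1.
df = df.with_columns(
    (pl.col("class") == "malicious").cast(pl.Int8).alias("label")
)
```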
214 changes: 97 additions & 117 deletions example/DGA_dgta_EDA.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions heidgaf/cli.py
@@ -5,7 +5,8 @@
 
 from heidgaf import CONTEXT_SETTINGS
 from heidgaf.main import DNSAnalyzerPipeline
-from heidgaf.train import DNSAnalyzerTraining, ModelType
+from heidgaf.models.lr import LogisticRegression
+from heidgaf.train import DNSAnalyzerTraining
 from heidgaf.version import __version__
 
 try:
@@ -45,7 +46,7 @@ def training_model():
 
 @training_model.command(name="start")
 def training_start():
-    trainer = DNSAnalyzerTraining(model=ModelType.LOGISTIC_REGRESSION)
+    trainer = DNSAnalyzerTraining(model=LogisticRegression(input_dim=9, output_dim=1, epochs=5000))
     trainer.train()
 
 @cli.group(name="process", context_settings={"show_default": True})
108 changes: 92 additions & 16 deletions heidgaf/dataset/__init__.py
@@ -1,33 +1,109 @@
-import string
-from dataclasses import dataclass, field
-from typing import Callable, Tuple
+import logging
+from dataclasses import dataclass
+from typing import Any, Callable, List
 
 import polars as pl
 import torch
+import sklearn.model_selection
 from torch.utils.data.dataset import Dataset
 
 
+def cast_cic(data_path: List[str]):
+    dataframes = []
+    for data in data_path:
+        # The class label is encoded in the file name, e.g. "..._benign.csv" -> "benign".
+        y = data.split("_")[-1].split(".")[0]
+        df = pl.read_csv(data, has_header=False)
+        df = df.with_columns(
+            [
+                pl.lit(y).alias("class")
+            ]
+        )
+        df = df.rename(
+            {
+                "column_1": "query"
+            }
+        )
+        dataframes.append(df)
+    return pl.concat(dataframes)
+
+def cast_dgta(data_path: str) -> pl.DataFrame:
+    def __custom_decode(data):
+        # Re-encode latin-1 bytes as UTF-8 strings so polars can handle the queries.
+        retL = [None] * len(data)
+        for i, datum in enumerate(data):
+            retL[i] = str(datum.decode('latin-1').encode('utf-8').decode('utf-8'))
+
+        return pl.Series(retL)
+
+    df = pl.read_parquet(data_path)
+    df = df.rename({"domain": "query"})
+    # Drop unnecessary column
+    df = df.drop("__index_level_0__")
+    df = df.with_columns(
+        [
+            pl.col('query').map(__custom_decode)
+        ]
+    )
+    return df
+
 @dataclass
 class Dataset:
-    train_path: str
-    val_path: str
-    test_path: str
-    cast_dataset: Callable
-    binary: bool = field(default=True)
+    def __init__(self, data_path: Any, cast_dataset: Callable = None) -> None:
+        if cast_dataset is not None:
+            self.data = cast_dataset(data_path)
+        else:
+            self.data = pl.read_csv(data_path)
+
+        logging.info(self.data)
+
+        self.X_train, self.X_val, self.X_test, self.Y_train, self.Y_val, self.Y_test = self.__train_test_val_split()
+
+    def __len__(self):
+        return len(self.data)
+
+    def __train_test_val_split(self, train_frac=0.8, random_state=None):
+        # 80% train; the remaining 20% is split in half into validation and test.
+        X_train, X_tmp, Y_train, Y_tmp = sklearn.model_selection.train_test_split(
+            self.data.drop("class"),
+            self.data.select("class"),
+            train_size=train_frac,
+            random_state=random_state
+        )
+
+        X_val, X_test, Y_val, Y_test = sklearn.model_selection.train_test_split(
+            X_tmp,
+            Y_tmp,
+            train_size=0.5,
+            random_state=random_state
+        )
+
+        return X_train, X_val, X_test, Y_train, Y_val, Y_test
 
     @property
     def train(self):
         return {
-            "train_path": self.train_path,
-            "val_path": self.val_path,
-            "cast_dataset": self.cast_dataset,
-            "binary": self.binary,
+            "X": self.X_train,
+            "Y": self.Y_train
         }
 
     @property
     def test(self):
         return {
-            "test_path": self.test_path,
-            "cast_dataset": self.cast_dataset,
-            "binary": self.binary,
+            "X": self.X_test,
+            "Y": self.Y_test
         }
 
+    @property
+    def val(self):
+        return {
+            "X": self.X_val,
+            "Y": self.Y_val
+        }
+
+dgta_dataset = Dataset(
+    data_path="/home/smachmeier/projects/heiDGA/data/dgta/dgta-benchmark.parquet",
+    cast_dataset=cast_dgta
+)
+
+cic_dataset = Dataset(
+    data_path=["/home/smachmeier/projects/heiDGA/example/CICBellDNS2021_CSV_benign.csv"],
+    cast_dataset=cast_cic
+)
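A minimal usage sketch of the new `Dataset` wrapper (paths are hypothetical; the split in `__train_test_val_split` holds out 80% for training and halves the rest into validation and test):

```python
from heidgaf.dataset import Dataset, cast_cic, cast_dgta

# CIC: the class label is parsed from the file name, e.g. "..._benign.csv" -> "benign".
cic = Dataset(
    data_path=["data/CICBellDNS2021_CSV_benign.csv"],
    cast_dataset=cast_cic,
)

# DGTA: parquet input; cast_dgta re-encodes the `query` column as valid UTF-8.
dgta = Dataset(data_path="data/dgta-benchmark.parquet", cast_dataset=cast_dgta)

X_train, Y_train = dgta.train["X"], dgta.train["Y"]  # 80%
X_val, Y_val = dgta.val["X"], dgta.val["Y"]          # 10%
X_test, Y_test = dgta.test["X"], dgta.test["Y"]      # 10%
```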

10 changes: 0 additions & 10 deletions heidgaf/dataset/cic.py

This file was deleted.

13 changes: 0 additions & 13 deletions heidgaf/dataset/dgta.py

This file was deleted.

14 changes: 0 additions & 14 deletions heidgaf/dataset/heicloud.py

This file was deleted.

11 changes: 7 additions & 4 deletions heidgaf/dataset/majestic.py
@@ -1,4 +1,5 @@
 import os
+import string
 
 import polars as pl
 from torch.utils.data.dataset import Dataset
@@ -7,12 +8,14 @@
 
 
 class MajesticMillionDataset(Dataset):
-    def __init__(self, csv_file, redis_cache: DataFrameRedisCache) -> None:
+    def __init__(self, csv_file: str = "/home/smachmeier/projects/heiDGA/data/majestic_million/majestic_million.csv") -> None:
         self.data = pl.read_csv(csv_file)
-        self.redis_cache = redis_cache
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.data)
 
-    def __getitem__(self, idx: int) -> any:
+    def __getitem__(self, idx: int) -> pl.DataFrame:
         return self.data[idx, 0]
 
+    def __call__(self, name: str, key: str) -> pl.DataFrame:
+        return self.data.filter(pl.col(key) == name)
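A short usage sketch of the reworked dataset (the path and the "Domain" column name are assumptions about the Majestic Million CSV, not shown in this diff):

```python
from heidgaf.dataset.majestic import MajesticMillionDataset

ds = MajesticMillionDataset(csv_file="data/majestic_million.csv")

print(len(ds))   # number of ranked domains
print(ds[0])     # first row, first column, via __getitem__
# __call__ filters rows where the given column equals the given value:
hits = ds("example.com", key="Domain")
```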
8 changes: 4 additions & 4 deletions heidgaf/models/__init__.py
@@ -18,13 +18,13 @@ def __init__(self,
 
     def fit(self, x_train, y_train):
         x_train = self.preprocessor.transform(x=x_train)
-        x_train = self.target_encoder.fit_transform(x=x_train, y=y_train)
-        x_train = self.mean_imputer.fit_transform(x=x_train)
+        # x_train = self.target_encoder.fit_transform(x=x_train, y=y_train)
+        # x_train = self.mean_imputer.fit_transform(x=x_train)
         self.clf.fit(x=x_train.to_numpy(), y=y_train)
 
     def predict(self, x):
         x = self.preprocessor.transform(x=x)
-        x = self.target_encoder.transform(x=x)
-        x = self.mean_imputer.transform(x=x)
+        # x = self.target_encoder.transform(x=x)
+        # x = self.mean_imputer.transform(x=x)
         return self.clf.predict(x=x.to_numpy())
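With the target-encoding and imputation steps commented out, `fit` reduces to preprocess-then-fit and `predict` to preprocess-then-predict. A sketch of driving the `Pipeline`, mirroring the arguments used in `heidgaf/train.py` below (`X_train`/`Y_train` as produced by the `Dataset` splits):

```python
from fe_polars.encoding.target_encoding import TargetEncoder
from fe_polars.imputing.base_imputing import Imputer

from heidgaf.models import Pipeline
from heidgaf.models.lr import LogisticRegression
from heidgaf.post.feature import Preprocessor

pipeline = Pipeline(
    preprocessor=Preprocessor(features_to_drop=["query"]),
    mean_imputer=Imputer(features_to_impute=["FQDN_full_count"], strategy="mean"),
    target_encoder=TargetEncoder(smoothing=100, features_to_encode=[]),
    clf=LogisticRegression(input_dim=9, output_dim=1, epochs=5000),
)
pipeline.fit(x_train=X_train, y_train=Y_train)
y_pred = pipeline.predict(X_test)
```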

2 changes: 1 addition & 1 deletion heidgaf/post/feature.py
@@ -16,7 +16,7 @@ def __init__(self, features_to_drop: List):
         """
         self.features_to_drop = features_to_drop
         # TODO Set majestic million score
-        # self.majesticmillion = MajesticMillionDataset()
+        self.majesticmillion = MajesticMillionDataset()
 
     def transform(self, x: pl.DataFrame) -> pl.DataFrame:
         """Transform our dataset with new features
9 changes: 7 additions & 2 deletions heidgaf/pre/__init__.py
@@ -1,4 +1,5 @@
 from abc import ABCMeta, abstractmethod
+from typing import Any
 
 import polars as pl
 
@@ -13,5 +14,9 @@ def __init__(self) -> None:
     @classmethod
     @abstractmethod
     def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
-        # Filter data with no errors
-        df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)
+        pass
+
+    @classmethod
+    @abstractmethod
+    def set_warning(self, data: Any, redis_cache: DataFrameRedisCache):
+        pass
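The contract above (a `run` hook plus the new `set_warning` hook) is what the concrete analyzers below implement. A minimal subclass sketch (the base-class name `Analyzer` and the filter are assumptions for illustration):

```python
import polars as pl

from heidgaf.cache import DataFrameRedisCache
from heidgaf.pre import Analyzer  # assumed name of the abstract base in this module


class ErrorAnalyzer(Analyzer):
    @classmethod
    def run(cls, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
        # Keep only queries that did not resolve cleanly (sketch only).
        return data.filter(pl.col("return_code") != "NOERROR")

    @classmethod
    def set_warning(cls, data, redis_cache: DataFrameRedisCache):
        # Record a warning for suspicious clients (sketch only).
        pass
```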
2 changes: 1 addition & 1 deletion heidgaf/pre/domain_analyzer.py
@@ -15,4 +15,4 @@ def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
 
         # Filter data with no errors
         df = data.filter(pl.col("query") != "|").filter(pl.col("return_code") != ReturnCode.NOERROR.value).filter(pl.col("query").str.split(".").list.len() != 1)
-
+
4 changes: 4 additions & 0 deletions heidgaf/pre/ip_analyzer.py
@@ -28,12 +28,16 @@ def run(self, data: pl.DataFrame, redis_cache: DataFrameRedisCache):
         self.__update_count(df, "SLD", self.KEY_SLD, redis_cache)
 
         # TODO: Process frequency and return values
+
+        # TODO: Check if IP has more than threshold error request -> if yes, check distribution.
 
 
 def __update_count(df: pl.DataFrame, id: str, key: str, redis_cache: DataFrameRedisCache) -> None:
     frequency = df.group_by(id).count()
 
+    # TODO Dividing highest and lowest timestamp
+    df.select(pl.col("timestamp").max() - pl.col("timestamp").min())
+
     # Check if dns_server_frequency exists in redis cache
     if key in redis_cache:
         frequency = pl.concat([redis_cache[key], frequency]).groupby(id).agg(pl.sum('count'))
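The cache merge in `__update_count` follows a simple pattern: compute fresh per-key counts, then sum them with whatever is already cached. A standalone polars sketch of that merge (toy data; `cached` stands in for `redis_cache[key]`):

```python
import polars as pl

new_counts = pl.DataFrame(
    {"client_ip": ["10.0.0.1", "10.0.0.2", "10.0.0.1"]}
).group_by("client_ip").count()

cached = pl.DataFrame(
    {"client_ip": ["10.0.0.1"], "count": [5]},
    schema={"client_ip": pl.Utf8, "count": pl.UInt32},  # match the count dtype for concat
)

# Sum cached and fresh counts per key, as __update_count does.
merged = pl.concat([cached, new_counts]).group_by("client_ip").agg(pl.sum("count"))
```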
32 changes: 7 additions & 25 deletions heidgaf/train.py
@@ -5,23 +5,16 @@
 
 import torch
 from fe_polars.encoding.target_encoding import TargetEncoder
 from fe_polars.imputing.base_imputing import Imputer
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split
-from torch.utils.data import DataLoader
 
+from heidgaf import dataset
 from heidgaf.cache import DataFrameRedisCache
-from heidgaf.dataset.dgta import DGTA
-from heidgaf.dataset.majestic import MajesticMillionDataset
 from heidgaf.models import Pipeline
 from heidgaf.models.lr import LogisticRegression
 from heidgaf.post.feature import Preprocessor
 
 
-class ModelType(Enum):
-    LOGISTIC_REGRESSION = LogisticRegression
-
 class DNSAnalyzerTraining:
-    def __init__(self, model: ModelType, redis_host="localhost", redis_port=6379, redis_db=0, redis_max_connections=20) -> None:
+    def __init__(self, model: torch.nn.Module, redis_host="localhost", redis_port=6379, redis_db=0, redis_max_connections=20) -> None:
         self.redis_cache = DataFrameRedisCache(redis_host, redis_port, redis_db, redis_max_connections)
 
     def train(self, seed=42):
@@ -43,32 +36,21 @@ def train(self, seed=42):
         logging.info(f"\tAllocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB")
         logging.info(f"\tCached: {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB")
 
-        # TODO Load data set
-        logging.info(f'Loading data sets')
-        majestic_dataset = MajesticMillionDataset(self.redis_cache)
-        dgta_dataset = DGTA()
-
-        # TODO Handle Data loader
-        train_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
-        test_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
-        # train_features, train_labels = next(iter(train_dataloader))
-
         # Training model
         model_pipeline = Pipeline(
             preprocessor=Preprocessor(
-                features_to_drop=[]),
+                features_to_drop=["query"]),
             mean_imputer=Imputer(
-                features_to_impute=["", ""], strategy="mean"),
+                features_to_impute=["FQDN_full_count"], strategy="mean"),
             target_encoder=TargetEncoder(
                 smoothing=100,
-                features_to_encode=["", "", "", "", "",""]),
+                features_to_encode=[]),
             clf=LogisticRegression(input_dim=9, output_dim=1, epochs=5000)
         )
 
-        # train, target, test = data_loader()
-        # x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.33, random_state=seed)
-
-        # model_pipeline.fit(x_train=x_train, y_train=y_train)
-
+        model_pipeline.fit(x_train=dataset.dgta_dataset.X_train, y_train=dataset.dgta_dataset.Y_train)
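End to end, this is the same flow the CLI now triggers (assuming a Redis instance is reachable on localhost, which the constructor connects to by default):

```python
from heidgaf.models.lr import LogisticRegression
from heidgaf.train import DNSAnalyzerTraining

trainer = DNSAnalyzerTraining(
    model=LogisticRegression(input_dim=9, output_dim=1, epochs=5000)
)
trainer.train()
```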



