Current state

stefanDeveloper · Mar 22, 2024 · 238992e · 238992e
1 parent 630cd25
commit 238992e
Show file tree

Hide file tree

Showing 10 changed files with 73 additions and 26 deletions.
diff --git a/heidgaf/__init__.py b/heidgaf/__init__.py
@@ -1,18 +1,19 @@
 from enum import Enum
 import logging
+import sys
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"], show_default=True)
 
 
 class ReturnCode(Enum):
     NOERROR = "NOERROR"
 
-
 # set up logging to both the log file and the console
 logging.basicConfig(level=logging.DEBUG,
                     format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                     datefmt='%y-%m-%d %H:%M:%S',
                     handlers=[  
                         logging.FileHandler("heidgaf.log"),
                         logging.StreamHandler()
-                    ])
+                    ]
+                    )
diff --git a/heidgaf/cache.py b/heidgaf/cache.py
@@ -20,7 +20,7 @@ def __contains__(self, key):
         return self.redis_client.exists(key)
 
     def __str__(self):
-        pass
+        return f'Redis has stored following keys: {self.redis_client.keys}'
 
     def __setitem__(self, key: str, df: pl.DataFrame) -> pl.DataFrame:
         self.redis_client.set(key, df.write_ipc(file=None, compression="lz4").getvalue())

diff --git a/heidgaf/cli.py b/heidgaf/cli.py
@@ -23,16 +23,16 @@ def cli():
 def check_gpu():
     # select the CUDA GPU as the device if one is available, otherwise the CPU
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    logging.info(f'Using device: {device}')
 
-    # Additional Info when using cuda
+    logging.info(f'Using device: {device}')
+    if torch.cuda.is_available():
+        logging.info("GPU detected")
+        logging.info(f"\t{torch.cuda.get_device_name(0)}")
+
     if device.type == 'cuda':
-        logging.info(torch.cuda.get_device_name(0))
-        logging.info('Memory Usage:')
-        logging.info(
-            f'Allocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB')
-        logging.info(
-            f'Cached:    {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB')
+        logging.info("Memory Usage:")
+        logging.info(f"\tAllocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB")
+        logging.info(f"\tCached:    {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB")
 
 
 @cli.group(name="train", context_settings={"show_default": True})

diff --git a/heidgaf/dataset/__init__.py b/heidgaf/dataset/__init__.py
@@ -1,17 +1,43 @@
 from torch.utils.data.dataset import Dataset
 import torch
 import string
-import pandas as pd
+import polars as pl
+from dataclasses import dataclass, field
+from typing import Callable, Tuple
 
+@dataclass
+class Dataset:
+    train_path: str
+    val_path: str
+    test_path: str
+    cast_dataset: Callable
+    binary: bool = field(default=True)
 
-class DomainDataset(Dataset):
+    @property
+    def train(self):
+        return {
+            "train_path": self.train_path,
+            "val_path": self.val_path,
+            "cast_dataset": self.cast_dataset,
+            "binary": self.binary,
+        }
+
+    @property
+    def test(self):
+        return {
+            "test_path": self.test_path,
+            "cast_dataset": self.cast_dataset,
+            "binary": self.binary,
+        }
+
+class DomainDataset():
     def __init__(self, csv_path, train=True):
         """
         Args:
             csv_path (string): path to csv file
             train (bool): flag selecting train or test mode, i.e. whether the data is labeled or not
         """
-        self.data_df = pd.read_csv(csv_path, header=None)
+        self.data_df = pl.read_csv(csv_path, header=None)
         self.all_chars = self.__build__chars__()
         self.inputs = self.data_df.iloc[:, 0]
         self.train = train

diff --git a/heidgaf/dataset/dgta.py b/heidgaf/dataset/dgta.py
@@ -1,5 +1,8 @@
 
-class DGTA():
+from heidgaf.dataset import DomainDataset
+
+
+class DGTA(DomainDataset):
     def __init__(self) -> None:
         pass
 

diff --git a/heidgaf/dataset/heicloud.py b/heidgaf/dataset/heicloud.py
@@ -1,8 +1,10 @@
-from torch.utils.data.dataset import Dataset
 
 #2023-10-15T00:00:00.050782194Z NOERROR 129.206.5.241 129.206.100.126 heigitsv02.heigit.org AAAA - 123b
 
-class HeiCLOUD(Dataset):
+from heidgaf.dataset import DomainDataset
+
+
+class HeiCLOUD(DomainDataset):
     def __init__(self, logs_path: str) -> None:
         pass
 

diff --git a/heidgaf/main.py b/heidgaf/main.py
@@ -1,7 +1,6 @@
 
 import os
 import polars as pl
-import redis
 import logging
 from enum import Enum
 from click import Path

diff --git a/heidgaf/models/xgboost.py b/heidgaf/models/xgboost.py
@@ -1,13 +1,19 @@
 
 from heidgaf.models import Model
-
+import xgboost as xgb
+import polars as pl
 
 class XGBoost(Model):
-    def __init__(self) -> None:
+    def __init__(self, pre_trained_model: str, data: pl.DataFrame, train=True) -> None:
         super().__init__()
+        self.model = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)
+
+        if train:
+            self.__train()
 
-    def train():
-        pass
+    def __train(self, df: pl.DataFrame) -> None:
+        self.model.fit()
+        self.model.save_model("clf.json")
 
-    def evaluate():
+    def __evaluate(self) -> None:
         pass
diff --git a/heidgaf/post/feature.py b/heidgaf/post/feature.py
@@ -15,9 +15,9 @@ def lexical_features(self, dataframes: pl.DataFrame) -> pl.DataFrame:
                 (pl.col("query").str.strip_chars(".").str.len_chars().alias("label_average")),
             ]
         )
-        
+
         dataframes = dataframes.with_columns(
-            [   
+            [
                 # FQDN
                 (pl.when(pl.col("labels").list.len() > 2)
                     .then(
@@ -66,6 +66,9 @@ def lexical_features(self, dataframes: pl.DataFrame) -> pl.DataFrame:
         dataframes = dataframes.with_columns([
             (pl.col("query").entropy(base=2).alias("FQDN_entropy")),
         ])
+
+        # TODO Add features
 
     def majesticmillion_rank_feature():
+        # TODO Implement feature rank
         pass
diff --git a/heidgaf/train.py b/heidgaf/train.py
@@ -2,6 +2,7 @@
 from torch.utils.data import DataLoader
 
 import logging
+from heidgaf.dataset.dgta import DGTA
 from heidgaf.dataset.majestic import MajesticMillionDataset
 # from heidgaf.metrics
 
@@ -19,10 +20,16 @@ def train():
         logging.info(f"\tAllocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB")
         logging.info(f"\tCached:    {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB")
 
+    # TODO Load data set
     logging.info(f'Loading data sets')
-    majestic_dataset = MajesticMillionDataset("./data/majestic_million.csv")
+    majestic_dataset = MajesticMillionDataset()
+    dgta_dataset = DGTA()
+
+    # TODO Handle Data loader
     train_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
     test_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
     train_features, train_labels = next(iter(train_dataloader))
+
+    # TODO Train models