Skip to content

Commit

Permalink
Current state
Browse files Browse the repository at this point in the history
  • Loading branch information
stefanDeveloper committed Mar 22, 2024
1 parent 630cd25 commit 238992e
Show file tree
Hide file tree
Showing 10 changed files with 73 additions and 26 deletions.
5 changes: 3 additions & 2 deletions heidgaf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
from enum import Enum
import logging
import sys

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"], show_default=True)


class ReturnCode(Enum):
    """Closed set of return-code values, each stored as its string name."""

    NOERROR = "NOERROR"


# Set up logging once at import time: every record at DEBUG level or above is
# written both to the "heidgaf.log" file and to the console stream.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler("heidgaf.log"),
        logging.StreamHandler(),
    ],
)
2 changes: 1 addition & 1 deletion heidgaf/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __contains__(self, key):
return self.redis_client.exists(key)

def __str__(self) -> str:
    """Return a human-readable summary listing the keys held by the Redis client."""
    # ``keys`` is a method on the client, so it has to be called; formatting
    # the attribute itself would embed the bound-method repr instead of the
    # stored keys.
    return f'Redis has stored following keys: {self.redis_client.keys()}'

def __setitem__(self, key: str, df: pl.DataFrame) -> pl.DataFrame:
self.redis_client.set(key, df.write_ipc(file=None, compression="lz4").getvalue())
Expand Down
16 changes: 8 additions & 8 deletions heidgaf/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@ def cli():
def check_gpu():
    """Log which torch device will be used and, on CUDA, its name and memory use.

    The selected device is always logged once. The GPU name and the
    allocated/reserved memory (in GB, for device index 0) are logged only
    when CUDA is available.
    """
    # Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logging.info(f'Using device: {device}')

    # Nothing more to report on a CPU-only machine.
    if device.type != 'cuda':
        return

    logging.info("GPU detected")
    logging.info(f"\t{torch.cuda.get_device_name(0)}")

    # Byte counts are converted to GB (1024**3) and rounded to one decimal.
    logging.info("Memory Usage:")
    logging.info(f"\tAllocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB")
    logging.info(f"\tCached: {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB")


@cli.group(name="train", context_settings={"show_default": True})
Expand Down
32 changes: 29 additions & 3 deletions heidgaf/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,43 @@
from torch.utils.data.dataset import Dataset
import torch
import string
import pandas as pd
import polars as pl
from dataclasses import dataclass, field
from typing import Callable, Tuple

@dataclass
class Dataset:
    """Bundle of dataset file paths and the options passed along with them.

    The ``train`` and ``test`` properties repackage the fields as
    dictionaries keyed by field name.

    NOTE(review): this class has the same name as the ``Dataset`` imported
    from ``torch.utils.data.dataset`` at the top of the module and therefore
    shadows it from here on -- confirm that this is intended.
    """

    train_path: str  # presumably the training split location -- confirm
    val_path: str  # presumably the validation split location -- confirm
    test_path: str  # presumably the test split location -- confirm
    cast_dataset: Callable  # callable forwarded unchanged via train/test
    binary: bool = field(default=True)

    @property
    def train(self) -> dict:
        """Return the training-related fields as a dictionary."""
        return {
            "train_path": self.train_path,
            "val_path": self.val_path,
            "cast_dataset": self.cast_dataset,
            "binary": self.binary,
        }

    @property
    def test(self) -> dict:
        """Return the test-related fields as a dictionary."""
        return {
            "test_path": self.test_path,
            "cast_dataset": self.cast_dataset,
            "binary": self.binary,
        }

class DomainDataset():
def __init__(self, csv_path, train=True):
"""
Args:
csv_path (string): path to csv file
train (string): flag train or test mode i.e. labeled or not
"""
self.data_df = pd.read_csv(csv_path, header=None)
self.data_df = pl.read_csv(csv_path, header=None)
self.all_chars = self.__build__chars__()
self.inputs = self.data_df.iloc[:, 0]
self.train = train
Expand Down
5 changes: 4 additions & 1 deletion heidgaf/dataset/dgta.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@

class DGTA():
from heidgaf.dataset import DomainDataset


class DGTA(DomainDataset):
def __init__(self) -> None:
pass

Expand Down
6 changes: 4 additions & 2 deletions heidgaf/dataset/heicloud.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from torch.utils.data.dataset import Dataset

#2023-10-15T00:00:00.050782194Z NOERROR 129.206.5.241 129.206.100.126 heigitsv02.heigit.org AAAA - 123b

class HeiCLOUD(Dataset):
from heidgaf.dataset import DomainDataset


class HeiCLOUD(DomainDataset):
def __init__(self, logs_path: str) -> None:
pass

Expand Down
1 change: 0 additions & 1 deletion heidgaf/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@

import os
import polars as pl
import redis
import logging
from enum import Enum
from click import Path
Expand Down
16 changes: 11 additions & 5 deletions heidgaf/models/xgboost.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@

from heidgaf.models import Model

import xgboost as xgb
import polars as pl

class XGBoost(Model):
    """Wrapper around ``xgb.XGBClassifier`` built on the project ``Model`` base."""

    def __init__(self, pre_trained_model: str, data: pl.DataFrame, train=True) -> None:
        """Create the classifier and, when ``train`` is true, fit it on ``data``.

        Args:
            pre_trained_model: Not used by the visible code.
                NOTE(review): presumably a path to a saved model -- confirm.
            data: Data frame handed to the training step.
            train: When true, training runs immediately during construction.
        """
        super().__init__()
        self.model = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)

        if train:
            # The training step requires the data frame; calling it without
            # an argument raised a TypeError.
            self.__train(data)

    def __train(self, df: pl.DataFrame) -> None:
        """Fit the classifier and save it to ``clf.json``."""
        # NOTE(review): ``fit`` is still called without features/labels and
        # will fail as written. Which columns of ``df`` are the features and
        # which is the label cannot be determined from this file -- TODO pass
        # X and y (and an ``eval_set``, since early stopping is enabled).
        self.model.fit()
        self.model.save_model("clf.json")

    def __evaluate(self) -> None:
        """Placeholder for model evaluation; not implemented yet."""
        pass
7 changes: 5 additions & 2 deletions heidgaf/post/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ def lexical_features(self, dataframes: pl.DataFrame) -> pl.DataFrame:
(pl.col("query").str.strip_chars(".").str.len_chars().alias("label_average")),
]
)

dataframes = dataframes.with_columns(
[
[
# FQDN
(pl.when(pl.col("labels").list.len() > 2)
.then(
Expand Down Expand Up @@ -66,6 +66,9 @@ def lexical_features(self, dataframes: pl.DataFrame) -> pl.DataFrame:
dataframes = dataframes.with_columns([
(pl.col("query").entropy(base=2).alias("FQDN_entropy")),
])

# TODO Add features

def majesticmillion_rank_feature():
    """Placeholder for the Majestic Million rank feature; currently does nothing."""
    # TODO Implement feature rank
    pass
9 changes: 8 additions & 1 deletion heidgaf/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from torch.utils.data import DataLoader

import logging
from heidgaf.dataset.dgta import DGTA
from heidgaf.dataset.majestic import MajesticMillionDataset
# from heidgaf.metrics

Expand All @@ -19,10 +20,16 @@ def train():
logging.info(f"\tAllocated: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB")
logging.info(f"\tCached: {round(torch.cuda.memory_reserved(0)/1024**3,1)} GB")

# TODO Load data set
logging.info(f'Loading data sets')
majestic_dataset = MajesticMillionDataset("./data/majestic_million.csv")
majestic_dataset = MajesticMillionDataset()
dgta_dataset = DGTA()

# TODO Handle Data loader
train_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(majestic_dataset, batch_size=64, shuffle=True)
train_features, train_labels = next(iter(train_dataloader))

# TODO Train models


0 comments on commit 238992e

Please sign in to comment.