From 1118ad12f77a4ab9e1a1774cfea4df36c1d71305 Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Thu, 4 Jul 2024 09:04:52 +0200
Subject: [PATCH 1/9] added complex synthetic feature generators
Added a suite of functions related to synthetic feature generation.
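
A minimal usage sketch (illustrative; parameter values are arbitrary):

    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()
    X = cc.generate_data(n_features=10, n_samples=1000, cardinality=5)
    y = cc.generate_labels(X, n=2)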
---
.idea/.gitignore | 8 +
.../inspectionProfiles/profiles_settings.xml | 6 +
.idea/misc.xml | 7 +
.idea/modules.xml | 8 +
.idea/outrank.iml | 14 +
.idea/vcs.xml | 6 +
.../synthetic_data_generators/cc_generator.py | 702 ++++++++++++++++++
tests/cc_generator_tests.py | 157 ++++
8 files changed, 908 insertions(+)
create mode 100644 .idea/.gitignore
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/outrank.iml
create mode 100644 .idea/vcs.xml
create mode 100644 outrank/algorithms/synthetic_data_generators/cc_generator.py
create mode 100644 tests/cc_generator_tests.py
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..877d184
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..3c2f566
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/outrank.iml b/.idea/outrank.iml
new file mode 100644
index 0000000..8e5446a
--- /dev/null
+++ b/.idea/outrank.iml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py
new file mode 100644
index 0000000..2f00f7e
--- /dev/null
+++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py
@@ -0,0 +1,702 @@
+import numpy as np
+from scipy.linalg import qr
+from scipy.stats import norm
+from sklearn.cluster import KMeans
+from sklearn.utils import resample
+from typing import List, Union, Optional, Tuple
+class CategoricalClassification:
+
+ def __init__(self):
+ self.dataset_info = {
+ 'general': {},
+ 'combinations': [],
+ 'correlations': [],
+ 'duplicates': [],
+ 'labels': [],
+ 'noise': []
+ }
+
+ def __repr__(self):
+ return f"CategoricalClassification(dataset_info={self.dataset_info})"
+
+ def generate_data(self,
+ n_features: int,
+ n_samples: int,
+ cardinality: int = 5,
+ structure: Optional = None,
+ ensure_rep: bool = False,
+ seed: int = 42) -> np.ndarray:
+
+ """
+ Generates dataset based on parameters
+ :param n_features: number of generated features
+ :param n_samples: number of generated samples
+ :param cardinality: default cardinality of the dataset
+ :param structure: structure of the dataset
+ :param ensure_rep: flag, ensures all given values represented
+ :param seed: sets seed of numpy random
+ :return: X, 2D dataset
+ """
+
+ self.dataset_info.update({
+ 'general': {
+ 'n_features': n_features,
+ 'n_samples': n_samples,
+ 'cardinality': cardinality,
+ 'structure': structure,
+ 'ensure_rep': ensure_rep,
+ 'seed': seed
+ }
+ })
+
+ np.random.seed(seed)
+ X = np.empty([n_features, n_samples])
+
+ if structure == None:
+
+ for i in range(n_features):
+ x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ X[i] = x
+
+ else:
+
+ ix = 0
+ for data in structure:
+
+ if not isinstance(data[0], (list, np.ndarray)):
+ feature_ix = data[0]
+ feature_cardinality = data[1]
+
+ if ix < feature_ix:
+ for i in range(ix, feature_ix):
+ x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ X[ix] = x
+ ix += 1
+
+ if not isinstance(feature_cardinality, (list, np.ndarray)):
+ x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep)
+ else:
+ if isinstance(feature_cardinality[0], (list, np.ndarray)):
+ value_domain = feature_cardinality[0]
+ value_frequencies = feature_cardinality[1]
+ x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies)
+ else:
+ value_domain = feature_cardinality
+ x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep)
+ X[ix] = x
+ ix += 1
+
+ else:
+ feature_ixs = data[0]
+ feature_cardinality = data[1]
+ for feature_ix in feature_ixs:
+ if ix < feature_ix:
+ for i in range(ix, feature_ix):
+ x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ X[ix] = x
+ ix += 1
+
+ if not isinstance(feature_cardinality, (list, np.ndarray)):
+ x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep)
+ else:
+ value_domain = feature_cardinality[0]
+ value_frequencies = feature_cardinality[1]
+ x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies)
+ X[ix] = x
+ ix += 1
+
+ if ix < n_features:
+ for i in range(ix, n_features):
+ x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ X[i] = x
+
+ return X.T
+
+ def _generate_feature(self,
+ v: Union[int, List[int], np.ndarray],
+ size: int,
+ ensure_rep: bool = False,
+ p: Optional[Union[List[float], np.ndarray]] = None) -> np.ndarray:
+ """
+ Generates feature vector of length size. Default probability density distribution is approx. normal, centred around randomly picked value.
+ :param v: either int for cardinality, or list of values
+ :param size: length of feature vector
+ :param ensure_rep: ensures all values are represented at least once in the feature vector
+ :param p: list of probabilities of each value
+ :return:
+ """
+ if not isinstance(v, (list, np.ndarray)):
+ v = np.arange(0, v, 1)
+ else:
+ v = np.array(v)
+
+ if p is None:
+ v_shift = v - v[np.random.randint(len(v))]
+ p = norm.pdf(v_shift, scale=3)
+ else:
+ p = np.array(p)
+
+ p = p / p.sum()
+
+ if ensure_rep and len(v) < size:
+ sampled_values = np.random.choice(v, size=(size - len(v)), p=p)
+ sampled_values = np.append(sampled_values, v)
+ else:
+ sampled_values = np.random.choice(v, size=size, p=p)
+
+ np.random.shuffle(sampled_values)
+ return sampled_values
+
+ def generate_combinations(self,
+ X: np.ndarray,
+ feature_indices: Union[List[int], np.ndarray],
+ combination_function: Optional = None,
+ combination_type: str ='linear') -> np.ndarray:
+ """
+ Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
+ :param X: dataset
+ :param feature_indices: indexes of features to be in combination
+ :param combination_function: optional custom function for combining feature vectors
+ :param combination_type: string flag, either 'linear' or 'nonlinear', defining the combination type
+ :return: X with added resultant feature
+ """
+
+
+ selected_features = X[:, feature_indices]
+
+ if combination_function is None:
+ if combination_type == 'linear':
+ combination_function = lambda x: np.sum(x, axis=1)
+ elif combination_type == 'nonlinear':
+ combination_function = lambda x: np.sin(np.sum(x, axis=1))
+ else:
+ combination_type = str(combination_function.__name__)
+
+ combination_result = combination_function(selected_features)
+
+ combination_ix = len(X[0])
+
+ self.dataset_info['combinations'].append({
+ 'feature_indices': feature_indices,
+ 'combination_type': combination_type,
+ 'combination_ix': combination_ix
+ })
+
+ return np.column_stack((X, combination_result))
+
+ def _xor(self, arr):
+ """
+ Performs a bitwise XOR operation across two or more integer arrays
+ :param arr: array of integer feature vectors
+ :return: bitwise XOR result
+ """
+ arrT = arr.T
+ arrT = arrT.astype(int)
+ out = np.bitwise_xor(arrT[0], arrT[1])
+ if len(arrT) > 2:
+ for i in range(2, len(arrT)):
+ out = np.bitwise_xor(out, arrT[i])
+
+ return out.T
+
+ def _and(self, arr):
+ """
+ Performs a bitwise AND operation across two or more integer arrays
+ :param arr: array of integer feature vectors
+ :return: bitwise AND result
+ """
+ arrT = arr.T
+ arrT = arrT.astype(int)
+ out = np.bitwise_and(arrT[0], arrT[1])
+ if len(arrT) > 2:
+ for i in range(2, len(arrT)):
+ out = np.bitwise_and(out, arrT[i])
+
+ return out.T
+
+ def _or(self, arr):
+ """
+ Performs a bitwise OR operation across two or more integer arrays
+ :param arr: array of integer feature vectors
+ :return: bitwise OR result
+ """
+ arrT = arr.T
+ arrT = arrT.astype(int)
+ out = np.bitwise_or(arrT[0], arrT[1])
+ if len(arrT) > 2:
+ for i in range(2, len(arrT)):
+ out = np.bitwise_or(out, arrT[i])
+
+ return out.T
+ def generate_correlated(self,
+ X: np.ndarray,
+ feature_indices: Union[List[int], np.ndarray],
+ r: float = 0.8) -> np.ndarray:
+
+ """
+ Generates correlated features using the given feature indices. Correlation is based on the cosine of the angle between mean-centred vectors.
+ :param X: dataset
+ :param feature_indices: indices of features to generate correlated feature to
+ :param r: (Pearson) correlation factor
+ :return: X with generated correlated features
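+
+ Example (illustrative; assumes cc and X from generate_data):
+ >>> X = cc.generate_correlated(X, [0], r=0.8) # appends one feature correlated with column 0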
+ """
+
+ if not isinstance(feature_indices, (list, np.ndarray)):
+ feature_indices = np.array([feature_indices])
+
+ if len(feature_indices) > 1:
+ correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
+ else:
+ correlated_ixs = len(X[0])
+
+ selected_features = X[:, feature_indices]
+ transposed = np.transpose(selected_features)
+ correlated_features = []
+
+ for t in transposed:
+ theta = np.arccos(r)
+ t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
+
+ rand = np.random.normal(0, 1, len(t_standard))
+ rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
+
+ M = np.column_stack((t_standard, rand))
+ M_centred = (M - np.mean(M, axis=0))
+
+ Id = np.eye(len(t))
+ Q = qr(M_centred[:, [0]], mode='economic')[0]
+ P = np.dot(Q, Q.T)
+ orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
+ M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
+
+ Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
+ corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
+
+ correlated_features.append(corr)
+
+ correlated_features = np.transpose(correlated_features)
+
+ self.dataset_info['correlations'].append({
+ 'feature_indices': feature_indices,
+ 'correlated_indices': correlated_ixs,
+ 'correlation_factor': r
+ })
+
+ return np.column_stack((X, correlated_features))
+
+ def generate_duplicates(self,
+ X: np.ndarray,
+ feature_indices: Union[List[int], np.ndarray]) -> np.ndarray:
+ """
+ Generates duplicate features
+ :param X: dataset
+ :param feature_indices: indices of features to duplicate
+ :return: dataset with duplicated features
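+
+ Example (illustrative; assumes cc and X from generate_data):
+ >>> X = cc.generate_duplicates(X, [0]) # appends a copy of column 0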
+ """
+ if not isinstance(feature_indices, (list, np.ndarray)):
+ feature_indices = np.array([feature_indices])
+
+ duplicated_ixs = np.arange(len(X[0]), len(X[0]) + len(feature_indices), 1)
+
+ selected_features = X[:, feature_indices]
+
+ self.dataset_info['duplicates'].append({
+ 'feature_indices': feature_indices,
+ 'duplicate_indices': duplicated_ixs
+ })
+
+ return np.column_stack((X, selected_features))
+
+ def generate_labels(self,
+ X: np.ndarray,
+ n: int = 2,
+ p: Union[float, list[float], np.ndarray] = 0.5,
+ k: Union[int, float] = 2,
+ decision_function: Optional = None,
+ class_relation: str ='linear',
+ balance: bool = False):
+ """
+ Generates labels for dataset X
+ :param X: dataset
+ :param n: number of class labels
+ :param p: class distribution
+ :param k: scaling constant used by the nonlinear decision function
+ :param decision_function: optional user-defined decision function
+ :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
+ :param balance: boolean, whether to balance clustering class labels
+ :return: array of labels, corresponding to dataset X
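+
+ Example (illustrative; assumes cc and X from generate_data):
+ >>> y = cc.generate_labels(X, n=3, p=[0.2, 0.3, 0.5], class_relation='nonlinear')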
+ """
+
+ if isinstance(p, (list, np.ndarray)):
+ if sum(p) > 1: raise ValueError('sum of values in p must not exceed 1.0')
+ if len(p) != n: raise ValueError('length of p must equal n')
+ elif p > 1: raise ValueError('p must be less than 1.0')
+
+ n_samples, n_features = X.shape
+
+ if decision_function is None:
+ if class_relation == 'linear':
+ decision_function = lambda x: np.sum(2 * x + 3, axis=1)
+ elif class_relation == 'nonlinear':
+ decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
+ elif class_relation == 'cluster':
+ decision_function = None
+ else:
+ class_relation = str(decision_function.__name__)
+
+ y = []
+ if decision_function is not None:
+ if n > 2:
+ if type(p) != list:
+ p = 1 / n
+ percentiles = [p * 100]
+ for i in range(1, n - 1):
+ percentiles.append(percentiles[i - 1] + (p * 100))
+
+ decision_boundary = decision_function(X)
+ p_points = np.percentile(decision_boundary, percentiles)
+
+ y = np.zeros_like(decision_boundary, dtype=int)
+ for p_point in p_points:
+ y += (decision_boundary > p_point)
+ else:
+ decision_boundary = decision_function(X)
+ percentiles = [x * 100 for x in p]
+
+ for i in range(1, len(percentiles) - 1):
+ percentiles[i] += percentiles[i - 1]
+
+ percentiles.insert(0, 0)
+ percentiles.pop()
+
+ p_points = np.percentile(decision_boundary, percentiles)
+
+ y = np.zeros_like(decision_boundary, dtype=int)
+ for i in range(1, n):
+ p_point = p_points[i]
+ for j in range(len(decision_boundary)):
+ if decision_boundary[j] > p_point:
+ y[j] += 1
+ else:
+ decision_boundary = decision_function(X)
+ p_point = np.percentile(decision_boundary, p * 100)
+ y = np.where(decision_boundary > p_point, 1, 0)
+ else:
+ if not isinstance(p, (list, np.ndarray)):
+ # Scalar p: 0.5 means fully balanced clusters, otherwise a two-way split
+ p = 1.0 if p == 0.5 else [p, 1 - p]
+ y = self._cluster_data(X, n, p=p, balance=balance)
+
+ self.dataset_info.update({
+ 'labels': {
+ 'class_relation': class_relation,
+ 'n_class': n
+ }
+ })
+
+ return y
+
+ def _cluster_data(self,
+ X: np.ndarray,
+ n: int,
+ p: Optional[Union[float, List[float], np.ndarray]] = 1.0,
+ balance: bool = False) -> np.ndarray:
+ """
+ Cluster data using kmeans
+ :param X: dataset
+ :param n: number of clusters
+ :param p: class distribution
+ :param balance: balance the clusters according to p
+ :return: array of labels, corresponding to dataset X
+ """
+
+ kmeans = KMeans(n_clusters=n)
+
+ kmeans.fit(X)
+
+ cluster_labels = kmeans.labels_
+
+ if not isinstance(p, (list, np.ndarray)): # Fully balanced clusters
+ samples_per_cluster = [len(X) // n] * n
+ else:
+ samples = len(X)
+ samples_per_cluster = []
+ if len(p) == n:
+ for val in p:
+ samples_per_cluster.append(int(samples * val))
+ else:
+ raise Exception("Length of balance parameter must equal number of clusters.")
+
+ # Adjust cluster sizes
+ if balance:
+ adjustments = []
+ overflow_samples = []
+ overflow_indices = []
+ for i in range(n):
+ cluster_size = np.sum(cluster_labels == i)
+
+ adjustment = samples_per_cluster[i] - cluster_size
+ adjustments.append(adjustment)
+
+ if adjustment < 0: # Cluster is too large
+
+ centroid = kmeans.cluster_centers_[i]
+ dataset_indices = np.where(cluster_labels == i)[0] # Indices of samples in dataset
+ cluster_samples = np.copy(X[dataset_indices])
+
+ distances = np.linalg.norm(cluster_samples - centroid,
+ axis=1) # Distances of cluster samples to cluster centroid
+ cluster_sample_indices = np.argsort(distances)
+ dataset_indices_sorted = dataset_indices[
+ cluster_sample_indices] # Indices of samples sorted by sample distance to cluster centroid
+
+ overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:] # Overflow samples
+ dataset_indices_sorted = dataset_indices_sorted[
+ samples_per_cluster[i]:] # Dataset indices of overflow samples
+
+ for i in range(len(overflow_sample_indices)):
+ overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
+ overflow_indices.append(dataset_indices_sorted[i])
+
+ overflow_samples = np.array(overflow_samples)
+ overflow_indices = np.array(overflow_indices)
+
+ # Making adjustments
+ for i in range(n):
+
+ if adjustments[i] > 0:
+ centroid = kmeans.cluster_centers_[i]
+ distances = np.linalg.norm(overflow_samples - centroid, axis=1)
+
+ closest_sample_indices = np.argsort(distances)
+
+ overflow_indices_sorted = overflow_indices[closest_sample_indices]
+
+ sample_indices_slice = closest_sample_indices[:adjustments[i]]
+ overflow_indices_slice = overflow_indices_sorted[:adjustments[i]]
+
+ cluster_labels[overflow_indices_slice] = i
+
+ overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0)
+ overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0)
+
+ return np.array(cluster_labels)
+
+ def generate_noise(self,
+ X: np.ndarray,
+ y: Union[List[int], np.ndarray],
+ p: float = 0.2,
+ type: str = "categorical",
+ missing_val: Union[str, int, float] = float('-inf')) -> np.ndarray:
+
+ """
+ Simulates noise on given dataset X
+ :param X: dataset to apply noise to
+ :param y: required target labels for categorical noise generation
+ :param p: amount of noise to apply. Defaults to 0.2
+ :param type: type of noise to apply, either categorical or missing
+ :param missing_val: value to simulate missing values. Defaults to float('-inf')
+ :return: X with noise applied
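+
+ Example (illustrative; assumes X and y from earlier calls):
+ >>> X_noisy = cc.generate_noise(X, y, p=0.1, type='missing')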
+ """
+
+ self.dataset_info['noise'].append({
+ 'type': type,
+ 'amount': p
+ })
+
+ if type == "categorical":
+ label_values, label_count = np.unique(y, return_counts=True)
+ n_labels = len(label_values)
+
+ inds = y.argsort()
+ y_sort = y[inds]
+ X_sort = X[inds]
+
+ Xs_T = X_sort.T
+ n = Xs_T.shape[1]
+ n_flip = int(n * p)
+
+ for feature in Xs_T:
+ unique_per_label = {}
+
+ label_offsets = np.concatenate(([0], np.cumsum(label_count)))
+ for i in range(n_labels):
+ # Slice out the samples of label i (y_sort groups samples by label)
+ unique = np.unique(feature[label_offsets[i]:label_offsets[i + 1]])
+ unique_per_label[label_values[i]] = set(unique)
+
+ ixs = np.random.choice(n, n_flip, replace=False)
+
+ for ix in ixs:
+ current_label = y_sort[ix]
+ possible_labels = label_values[label_values != current_label]
+
+ # find all unique values from labels != current label
+ values = set()
+ for key in possible_labels:
+ values = values.union(unique_per_label[key])
+
+ # remove any overlapping values, ensuring replacement values are unique & from a target label !=
+ # current label
+ for val in unique_per_label[current_label] & values:
+ values.remove(val)
+
+ if len(values) > 0:
+ val = np.random.choice(list(values))
+
+ else:
+ key = possible_labels[np.random.randint(len(possible_labels))]
+ values = unique_per_label[key]
+ val = np.random.choice(list(values))
+
+ feature[ix] = val
+
+ rev_ind = inds.argsort()
+ X_noise = Xs_T.T
+ X_noise = X_noise[rev_ind]
+
+ return X_noise
+
+ elif type == "missing":
+ X_noise = np.copy(X)
+ Xn_T = X_noise.T
+ n = Xn_T.shape[1]
+ n_missing = int(n * p)
+ #print("n to delete:", n_missing)
+
+ for feature in Xn_T:
+ ixs = np.random.choice(n, n_missing, replace=False)
+
+ for ix in ixs:
+ feature[ix] = missing_val
+
+ return Xn_T.T
+
+ def downsample_dataset(self,
+ X: np.array,
+ y: Union[List[int], np.ndarray],
+ N: Optional[Union[int, None]] = None,
+ seed: int = 42,
+ reshuffle: bool=False) -> Tuple[np.array, np.ndarray]:
+
+ """
+ Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
+ :param X: Dataset to downsample
+ :param y: Labels corresponding to X
+ :param N: Optional number of samples per class to downsample to
+ :param seed: Seed for random state of resample function
+ :param reshuffle: Reshuffle the dataset after downsampling
+ :return: Balanced X and y after downsampling
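+
+ Example (illustrative; assumes X and y from earlier calls):
+ >>> X_balanced, y_balanced = cc.downsample_dataset(X, y, reshuffle=True)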
+ """
+
+ original_shape = X.shape
+
+ values, counts = np.unique(y, return_counts=True)
+ if N is None:
+ N = min(counts)
+
+ if N > min(counts):
+ raise ValueError("N must be equal to or less than the number of samples in minority class")
+
+ X_arrays_list = []
+ y_downsampled = []
+ for label in values:
+ X_label = [X[i] for i in range(len(y)) if y[i] == label]
+ X_label_downsample = resample(X_label,
+ replace=True,
+ n_samples=N,
+ random_state=seed)
+ X_arrays_list.append(X_label_downsample)
+ ys = [label] * N
+ y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
+
+ X_downsampled = np.concatenate(X_arrays_list, axis=0)
+
+ if reshuffle:
+ indices = np.arange(len(X_downsampled))
+ np.random.shuffle(indices)
+ X_downsampled = X_downsampled[indices]
+ y_downsampled = y_downsampled[indices]
+
+ downsampled_shape = X_downsampled.shape
+
+ self.dataset_info.update({
+ 'downsampling': {
+ 'original_shape': original_shape,
+ 'downsampled_shape': downsampled_shape
+ }
+ })
+
+ return X_downsampled, y_downsampled
+
+ def print_dataset(self, X, y):
+ """
+ Prints given dataset
+ :param X: dataset
+ :param y: labels
+ :return:
+ """
+
+ n_samples, n_features = X.shape
+ n = 0
+ for arr in X:
+ print('[', end='')
+ for i in range(n_features):
+ if i == n_features - 1:
+ print(arr[i], end='')
+ else:
+ print(arr[i], end=', ')
+ print("], Label: {}".format(y[n]))
+ n += 1
+
+
+ def summarize(self):
+
+ print(f"Number of features: {self.dataset_info['general']['n_features']}")
+ print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
+ if 'downsampling' in self.dataset_info:
+ print(f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}")
+ print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
+ print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
+
+
+ print('-------------------------------------')
+
+ if len(self.dataset_info['combinations']) > 0:
+ print("Combinations:")
+ for comb in self.dataset_info['combinations']:
+ print(f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}")
+ print('-------------------------------------')
+
+ if len(self.dataset_info['correlations']) > 0:
+ print("Correlations:")
+ for corr in self.dataset_info['correlations']:
+ print(f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}")
+ print('-------------------------------------')
+
+ if len(self.dataset_info['duplicates']) > 0:
+ print("Duplicates:")
+ for dup in self.dataset_info['duplicates']:
+ print(f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}")
+ print('-------------------------------------')
+
+ if len(self.dataset_info['noise']) > 0:
+ print("Simulated noise:")
+ for noise in self.dataset_info['noise']:
+ print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}")
+ print('-------------------------------------')
+
+ print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
\ No newline at end of file
diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py
new file mode 100644
index 0000000..e5665f6
--- /dev/null
+++ b/tests/cc_generator_tests.py
@@ -0,0 +1,157 @@
+import pytest
+import numpy as np
+from scipy.stats import pearsonr
+from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
+
+@pytest.fixture
+def cc_instance():
+ return CategoricalClassification()
+
+def test_init(cc_instance):
+ assert cc_instance.dataset_info == {'general': {}, 'combinations': [], 'correlations': [], 'duplicates': [], 'labels': [], 'noise': []}
+
+def test_generate_data_shape_and_type(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ assert isinstance(X, np.ndarray), "Output should be a numpy array"
+ assert X.shape == (100, 5), "Shape should be (n_samples, n_features)"
+
+def test_generate_data_cardinality(cc_instance):
+ n_features = 5
+ cardinality = 3
+ X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality)
+ unique_values = np.unique(X)
+ assert len(unique_values) <= cardinality, "Cardinality not respected for all features"
+
+def test_generate_data_ensure_rep(cc_instance):
+ n_features = 5
+ cardinality = 50
+ X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality, ensure_rep=True)
+ unique_values = np.unique(X)
+ assert len(unique_values) == cardinality, "Not all values represented when 'ensure_rep=True'"
+
+def test_generate_feature_shape_and_type(cc_instance):
+ feature = cc_instance._generate_feature(5, size=100)
+ assert isinstance(feature, np.ndarray), "Output should be a numpy array"
+ assert feature.shape == (100,), "Shape should be (size,)"
+
+def test_generate_feature_cardinality(cc_instance):
+ feature = cc_instance._generate_feature(5, size=100)
+ unique_values = np.unique(feature)
+ assert len(unique_values) <= 5, "Feature cardinality not respected for all features"
+
+def test_generate_feature_ensure_rep(cc_instance):
+ feature = cc_instance._generate_feature(50, size=100, ensure_rep=True)
+ unique_values = np.unique(feature)
+ assert len(unique_values) == 50, "Not all values represented when using 'ensure_rep=True'"
+
+def test_generate_feature_values(cc_instance):
+ values = [5, 6, 7, 8, 9, 10]
+ feature = cc_instance._generate_feature(values, size=100)
+ unique_values = np.unique(feature)
+ assert any(f in feature for f in values), "Feature values not in input list"
+def test_generate_feature_values_ensure_rep(cc_instance):
+ values = [5, 6, 7, 8, 9, 10]
+ feature = cc_instance._generate_feature(values, size=100, ensure_rep=True)
+ unique_values = np.unique(feature)
+ assert (values == unique_values).all(), "Feature values should match input list when 'ensure_rep=True'"
+
+def test_generate_feature_density(cc_instance):
+ values = [0, 1, 2]
+ p = [0.2, 0.4, 0.4]
+ feature = cc_instance._generate_feature(values, size=10000, ensure_rep=True, p=p)
+ values, counts = np.unique(feature, return_counts=True)
+ generated_p = np.round(counts/10000, decimals=1)
+ assert (generated_p == p).all(), "Feature values should have density roughly equal to 'p'"
+
+def test_generate_combinations_shape_and_type(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = [0,1]
+ X = cc_instance.generate_combinations(X, indices, combination_type='linear')
+ assert isinstance(X, np.ndarray), "Output should be a numpy array"
+ assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)"
+
+def test_generate_correlated_shape_and_type(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = cc_instance.generate_correlated(X, indices, r=0.8)
+ assert isinstance(X, np.ndarray), "Output should be a numpy array"
+ assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)"
+
+def test_generate_correlated_correlation(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = cc_instance.generate_correlated(X, indices, r=0.8)
+ Xt = X.T
+ corr, _ = pearsonr(Xt[0], Xt[5])
+ assert np.round(corr, decimals=1) == 0.8, "Resultant correlation should be equal to the 'r' parameter"
+
+
+def test_generate_duplicates_shape_and_type(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = cc_instance.generate_duplicates(X, indices)
+ assert isinstance(X, np.ndarray), "Output should be a numpy array"
+ assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)"
+
+def test_generate_duplicates_duplication(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = cc_instance.generate_duplicates(X, indices)
+ Xt = X.T
+ assert (Xt[0] == Xt[-1]).all()
+
+def test_xor_operation(cc_instance):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = [a, b]
+ result = cc_instance._xor(arr)
+ expected = np.array([1, 1, 0])
+ assert np.array_equal(result, expected), "XOR operation did not produce expected result"
+
+def test_and_operation(cc_instance):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = [a, b]
+ result = cc_instance._and(arr)
+ expected = np.array([0, 0, 1])
+ assert np.array_equal(result, expected), "AND operation did not produce expected result"
+
+def test_or_operation(cc_instance):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = [a, b]
+ result = cc_instance._or(arr)
+ expected = np.array([1, 1, 1])
+ assert np.array_equal(result, expected), "OR operation did not produce expected result"
+
+def test_generate_labels_shape_and_type(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = cc_instance.generate_labels(X)
+ assert isinstance(labels, np.ndarray), "Output should be a numpy array"
+ assert labels.shape == (100,), "Shape should be (n_samples,)"
+
+def test_generate_labels_distribution(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5])
+ unique, counts = np.unique(labels, return_counts=True)
+ distribution = counts / 100
+ expected_distribution = np.array([0.2, 0.3, 0.5])
+ assert np.allclose(distribution, expected_distribution, atol=0.1), "Label distribution does not match expected distribution"
+
+def test_generate_labels_class_relation_linear(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = cc_instance.generate_labels(X, class_relation='linear')
+ assert isinstance(labels, np.ndarray), "Output should be a numpy array"
+ assert labels.shape == (100,), "Shape should be (n_samples,)"
+
+def test_generate_labels_class_relation_nonlinear(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = cc_instance.generate_labels(X, class_relation='nonlinear')
+ assert isinstance(labels, np.ndarray), "Output should be a numpy array"
+ assert labels.shape == (100,), "Shape should be (n_samples,)"
+
+def test_generate_labels_class_relation_cluster(cc_instance):
+ X = cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = cc_instance.generate_labels(X, class_relation='cluster', balance=True)
+ assert isinstance(labels, np.ndarray), "Output should be a numpy array"
+ assert labels.shape == (100,), "Shape should be (n_samples,)"
\ No newline at end of file
From cb04d4da1f5378d656720839f0ae31c70cd598ec Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Fri, 5 Jul 2024 09:31:42 +0200
Subject: [PATCH 2/9] removed .idea
---
.idea/.gitignore | 8 --------
.idea/inspectionProfiles/profiles_settings.xml | 6 ------
.idea/misc.xml | 7 -------
.idea/modules.xml | 8 --------
.idea/outrank.iml | 14 --------------
.idea/vcs.xml | 6 ------
6 files changed, 49 deletions(-)
delete mode 100644 .idea/.gitignore
delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml
delete mode 100644 .idea/misc.xml
delete mode 100644 .idea/modules.xml
delete mode 100644 .idea/outrank.iml
delete mode 100644 .idea/vcs.xml
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 13566b8..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index 105ce2d..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index 877d184..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 3c2f566..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/outrank.iml b/.idea/outrank.iml
deleted file mode 100644
index 8e5446a..0000000
--- a/.idea/outrank.iml
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 35eb1dd..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
From 30549a4c6635fca147ab122a2413451a994d6b5c Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Mon, 8 Jul 2024 13:37:30 +0200
Subject: [PATCH 3/9] pre-commit, code review changes
Applied pre-commit formatting and code review changes:
- added _feature_builder method to avoid duplicate code blocks
- added some new parameters to enable random value domains for features
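
Illustrative example of the new random value domains (parameter values are arbitrary):

    cc.generate_data(n_features=5, n_samples=100, cardinality=5,
                     random_values=True, low=0, high=1000)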
---
.../synthetic_data_generators/cc_generator.py | 391 ++++++++++++------
tests/cc_generator_tests.py | 67 +--
2 files changed, 301 insertions(+), 157 deletions(-)
diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py
index 2f00f7e..dd148a5 100644
--- a/outrank/algorithms/synthetic_data_generators/cc_generator.py
+++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py
@@ -1,9 +1,17 @@
+from __future__ import annotations
+
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
import numpy as np
from scipy.linalg import qr
from scipy.stats import norm
from sklearn.cluster import KMeans
from sklearn.utils import resample
-from typing import List, Union, Optional, Tuple
+
+
class CategoricalClassification:
def __init__(self):
@@ -13,19 +21,24 @@ def __init__(self):
'correlations': [],
'duplicates': [],
'labels': [],
- 'noise': []
+ 'noise': [],
}
def __repr__(self):
return f"CategoricalClassification(dataset_info={self.dataset_info})"
- def generate_data(self,
- n_features: int,
- n_samples: int,
- cardinality: int = 5,
- structure: Optional = None,
- ensure_rep: bool = False,
- seed: int = 42) -> np.ndarray:
+ def generate_data(
+ self,
+ n_features: int,
+ n_samples: int,
+ cardinality: int = 5,
+ structure: list | np.ndarray | None = None,
+ ensure_rep: bool = False,
+ random_values: bool | None = False,
+ low: int | None = 0,
+ high: int | None = 1000,
+ seed: int = 42,
+ ) -> np.ndarray:
"""
Generates dataset based on parameters
@@ -34,6 +47,9 @@ def generate_data(self,
:param cardinality: default cardinality of the dataset
:param structure: structure of the dataset
:param ensure_rep: flag, ensures all given values represented
+ :param random_values: flag, enables random (integer) feature values from set [low, high]
+ :param low: sets lower bound of random feature values
+ :param high: sets upper bound of random feature values
:param seed: sets seed of numpy random
:return: X, 2D dataset
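+
+ Example (illustrative; a structure entry pairs a feature index with a cardinality, a value domain, or a [domain, frequencies] pair):
+ >>> cc = CategoricalClassification()
+ >>> X = cc.generate_data(n_features=5, n_samples=100, structure=[(0, 3), (2, [10, 20, 30])])
+ >>> X.shape
+ (100, 5)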
"""
@@ -45,113 +61,214 @@ def generate_data(self,
'cardinality': cardinality,
'structure': structure,
'ensure_rep': ensure_rep,
- 'seed': seed
- }
+ 'seed': seed,
+ },
})
np.random.seed(seed)
X = np.empty([n_features, n_samples])
- if structure == None:
-
+ if structure is None:
+ # No specific structure parameter passed
for i in range(n_features):
- x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ x = self._generate_feature(
+ n_samples,
+ cardinality=cardinality,
+ ensure_rep=ensure_rep,
+ random_values=random_values,
+ low=low,
+ high=high,
+ )
X[i] = x
-
else:
-
+ # Structure parameter passed, building based on structure
ix = 0
for data in structure:
-
if not isinstance(data[0], (list, np.ndarray)):
- feature_ix = data[0]
- feature_cardinality = data[1]
+ # Data in structure is a tuple of (feature index (integer), feature attributes)
+ feature_ix, feature_attributes = data
if ix < feature_ix:
+ # Filling out the dataset up to column index feature_ix
for i in range(ix, feature_ix):
- x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ x = self._generate_feature(
+ n_samples,
+ cardinality=cardinality,
+ ensure_rep=ensure_rep,
+ random_values=random_values,
+ low=low,
+ high=high,
+ )
X[ix] = x
ix += 1
- if not isinstance(feature_cardinality, (list, np.ndarray)):
- x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep)
- else:
- if isinstance(feature_cardinality[0], (list, np.ndarray)):
- value_domain = feature_cardinality[0]
- value_frequencies = feature_cardinality[1]
- x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies)
- else:
- value_domain = feature_cardinality
- x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep)
+ x = self._feature_builder(
+ feature_attributes,
+ n_samples,
+ ensure_rep=ensure_rep,
+ random_values=random_values,
+ low=low,
+ high=high,
+ )
X[ix] = x
ix += 1
else:
+ # Data in structure is a tuple of (list of feature indexes, feature attributes)
feature_ixs = data[0]
- feature_cardinality = data[1]
+ feature_attributes = data[1]
+
for feature_ix in feature_ixs:
+ # Filling out the dataset up to feature_ix
if ix < feature_ix:
for i in range(ix, feature_ix):
- x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ x = self._generate_feature(
+ n_samples,
+ cardinality=cardinality,
+ ensure_rep=ensure_rep,
+ random_values=random_values,
+ low=low,
+ high=high,
+ )
X[ix] = x
ix += 1
- if not isinstance(feature_cardinality, (list, np.ndarray)):
- x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep)
- else:
- value_domain = feature_cardinality[0]
- value_frequencies = feature_cardinality[1]
- x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies)
+ x = self._feature_builder(
+ feature_attributes,
+ n_samples,
+ ensure_rep=ensure_rep,
+ random_values=random_values,
+ low=low,
+ high=high,
+ )
+
X[ix] = x
ix += 1
if ix < n_features:
+ # Fill out the rest of the dataset
for i in range(ix, n_features):
- x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep)
+ x = self._generate_feature(
+ n_samples,
+ cardinality=cardinality,
+ ensure_rep=ensure_rep,
+ random_values=random_values,
+ low=low,
+ high=high,
+ )
X[i] = x
return X.T
- def _generate_feature(self,
- v: Union[int, List[int], np.ndarray],
- size: int,
- ensure_rep: bool = False,
- p: Optional[Union[List[float], np.ndarray]] = None) -> np.ndarray:
+ def _feature_builder(
+ self,
+ feature_attributes: int | list | np.ndarray,
+ n_samples: int,
+ ensure_rep: bool = False,
+ random_values: bool | None = False,
+ low: int | None = 0,
+ high: int | None = 1000,
+ ) -> np.ndarray:
+
"""
- Generates feature vector of length size. Default probability density distribution is approx. normal, centred around randomly picked value.
- :param v: either int for cardinality, or list of values
+ Helper function that builds a single feature vector from its attribute specification (avoids duplicated code in generate_data)
+ :param feature_attributes: either integer (cardinality) or list of feature attributes
+ :param n_samples: number of samples in dataset
+ :param ensure_rep: ensures all values are represented at least once in the feature vector
+ :param random_values: if True, the value domain is sampled at random from [low, high]; otherwise it is the range [low, low + cardinality) with step 1
+ :param low: lower bound of random feature vector values
+ :param high: upper bound of random feature vector values
+ :return: feature vector
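+
+ Example attribute specs (illustrative): 4 -> cardinality of 4; [1, 2, 3] -> a value domain; [[1, 2, 3], [0.6, 0.3, 0.1]] -> a value domain with sampling frequencies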
+ """
+
+ if not isinstance(feature_attributes, (list, np.ndarray)):
+ # feature_attributes is just an integer (cardinality); generate the feature with either random values or
+ # values from [low, low + cardinality)
+ x = self._generate_feature(
+ n_samples,
+ cardinality=feature_attributes,
+ ensure_rep=ensure_rep,
+ random_values=random_values,
+ low=low,
+ high=high,
+ )
+ else:
+ # feature_attributes is a list of [value_domain, value_frequencies]
+ if isinstance(feature_attributes[0], (list, np.ndarray)):
+ value_domain, value_frequencies = feature_attributes
+ x = self._generate_feature(
+ n_samples,
+ vec=value_domain,
+ ensure_rep=ensure_rep,
+ p=value_frequencies,
+ )
+ else:
+ # feature_attributes is the value_domain (list of values for the feature)
+ value_domain = feature_attributes
+ x = self._generate_feature(
+ n_samples,
+ vec=value_domain,
+ ensure_rep=ensure_rep,
+ )
+
+ return x
+
+ def _generate_feature(
+ self,
+ size: int,
+ vec: list[int] | np.ndarray | None = None,
+ cardinality: int = 5,
+ ensure_rep: bool = False,
+ random_values: bool | None = False,
+ low: int | None = 0,
+ high: int | None = 1000,
+ p: list[float] | np.ndarray | None = None,
+ ) -> np.ndarray:
+ """
+ Generates feature vector of length size. Default probability density distribution is approx. normal, centred around a randomly picked value.
+ :param vec: list of feature values
+ :param cardinality: single value cardinality
:param size: length of feature vector
:param ensure_rep: ensures all values are represented at least once in the feature vector
+ :param random_values: if True, the value domain is sampled at random from [low, high]; otherwise it is the range [low, low + cardinality) with step 1
+ :param low: lower bound of random feature vector values
+ :param high: upper bound of random feature vector values
:param p: list of probabilities of each value
- :return:
+ :return: feature vector x
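+
+ Example (illustrative; internal helper, normally invoked through generate_data):
+ >>> x = cc._generate_feature(100, vec=[1, 2, 3], p=[0.2, 0.3, 0.5])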
"""
- if not isinstance(v, (list, np.ndarray)):
- v = np.arange(0, v, 1)
+
+ if vec is None:
+ if random_values:
+ vec = np.random.choice(range(low, high + 1), cardinality, replace=False)
+ else:
+ vec = np.arange(low, low + cardinality, 1)
else:
- v = np.array(v)
+ vec = np.array(vec)
if p is None:
- v_shift = v - v[np.random.randint(len(v))]
+ v_shift = vec - vec[np.random.randint(len(vec))]
p = norm.pdf(v_shift, scale=3)
else:
p = np.array(p)
p = p / p.sum()
- if ensure_rep and len(v) < size:
- sampled_values = np.random.choice(v, size=(size - len(v)), p=p)
- sampled_values = np.append(sampled_values, v)
+ if ensure_rep and len(vec) < size:
+ sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p)
+ sampled_values = np.append(sampled_values, vec)
else:
- sampled_values = np.random.choice(v, size=size, p=p)
+ sampled_values = np.random.choice(vec, size=size, p=p)
np.random.shuffle(sampled_values)
return sampled_values
- def generate_combinations(self,
- X: np.ndarray,
- feature_indices: Union[List[int], np.ndarray],
- combination_function: Optional = None,
- combination_type: str ='linear') -> np.ndarray:
+ def generate_combinations(
+ self,
+ X: np.ndarray,
+ feature_indices: list[int] | np.ndarray,
+ combination_function: Optional = None,
+ combination_type: str = 'linear',
+ ) -> np.ndarray:
"""
Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
:param X: dataset
@@ -161,7 +278,6 @@ def generate_combinations(self,
:return: X with added resultant feature
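+
+ Example (illustrative; assumes cc and X from generate_data; the lambda is an arbitrary custom combination):
+ >>> X = cc.generate_combinations(X, [0, 1], combination_type='nonlinear')
+ >>> X = cc.generate_combinations(X, [0, 1], combination_function=lambda x: np.prod(x, axis=1))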
"""
-
selected_features = X[:, feature_indices]
if combination_function is None:
@@ -179,7 +295,7 @@ def generate_combinations(self,
self.dataset_info['combinations'].append({
'feature_indices': feature_indices,
'combination_type': combination_type,
- 'combination_ix': combination_ix
+ 'combination_ix': combination_ix,
})
return np.column_stack((X, combination_result))
@@ -231,10 +347,13 @@ def _or(self, arr):
out = np.bitwise_or(out, arrT[i])
return out.T
- def generate_correlated(self,
- X: np.ndarray,
- feature_indices: Union[List[int], np.ndarray],
- r: float = 0.8) -> np.ndarray:
+
+ def generate_correlated(
+ self,
+ X: np.ndarray,
+ feature_indices: list[int] | np.ndarray,
+ r: float = 0.8,
+ ) -> np.ndarray:
"""
Generates correlated features using the given feature indices. Correlation is based on the cosine of the angle between mean-centred vectors.
@@ -282,14 +401,16 @@ def generate_correlated(self,
self.dataset_info['correlations'].append({
'feature_indices': feature_indices,
'correlated_indices': correlated_ixs,
- 'correlation_factor': r
+ 'correlation_factor': r,
})
return np.column_stack((X, correlated_features))
- def generate_duplicates(self,
- X: np.ndarray,
- feature_indices: Union[List[int], np.ndarray]) -> np.ndarray:
+ def generate_duplicates(
+ self,
+ X: np.ndarray,
+ feature_indices: list[int] | np.ndarray,
+ ) -> np.ndarray:
"""
Generates duplicate features
:param X: dataset
@@ -305,19 +426,21 @@ def generate_duplicates(self,
self.dataset_info['duplicates'].append({
'feature_indices': feature_indices,
- 'duplicate_indices': duplicated_ixs
+ 'duplicate_indices': duplicated_ixs,
})
return np.column_stack((X, selected_features))
- def generate_labels(self,
- X: np.ndarray,
- n: int = 2,
- p: Union[float, list[float], np.ndarray] = 0.5,
- k: Union[int, float] = 2,
- decision_function: Optional = None,
- class_relation: str ='linear',
- balance: bool = False):
+ def generate_labels(
+ self,
+ X: np.ndarray,
+ n: int = 2,
+ p: float | list[float] | np.ndarray = 0.5,
+ k: int | float = 2,
+ decision_function: Optional = None,
+ class_relation: str = 'linear',
+ balance: bool = False,
+ ):
"""
Generates labels for dataset X
:param X: dataset
@@ -397,17 +520,19 @@ def generate_labels(self,
self.dataset_info.update({
'labels': {
'class_relation': class_relation,
- 'n_class': n
- }
+ 'n_class': n,
+ },
})
return y
- def _cluster_data(self,
- X: np.ndarray,
- n: int,
- p: Optional[Union[float, List[float], np.ndarray]] = 1.0,
- balance: bool = False) -> np.ndarray:
+ def _cluster_data(
+ self,
+ X: np.ndarray,
+ n: int,
+ p: float | list[float] | np.ndarray | None = 1.0,
+ balance: bool = False,
+ ) -> np.ndarray:
"""
Cluster data using kmeans
:param X: dataset
@@ -436,7 +561,7 @@ def _cluster_data(self,
for val in p:
samples_per_cluster.append(int(samples * val))
else:
- raise Exception("Length of balance parameter must equal number of clusters.")
+ raise Exception('Length of balance parameter must equal number of clusters.')
# Adjust cluster sizes
if balance:
@@ -455,15 +580,19 @@ def _cluster_data(self,
dataset_indices = np.where(cluster_labels == i)[0] # Indices of samples in dataset
cluster_samples = np.copy(X[dataset_indices])
- distances = np.linalg.norm(cluster_samples - centroid,
- axis=1) # Distances of cluster samples to cluster centroid
+ distances = np.linalg.norm(
+ cluster_samples - centroid,
+ axis=1,
+ ) # Distances of cluster samples to cluster centroid
cluster_sample_indices = np.argsort(distances)
dataset_indices_sorted = dataset_indices[
- cluster_sample_indices] # Indices of samples sorted by sample distance to cluster centroid
+ cluster_sample_indices
+ ] # Indices of samples sorted by sample distance to cluster centroid
overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:] # Overflow samples
dataset_indices_sorted = dataset_indices_sorted[
- samples_per_cluster[i]:] # Dataset indices of overflow samples
+ samples_per_cluster[i]:
+ ] # Dataset indices of overflow samples
for i in range(len(overflow_sample_indices)):
overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
@@ -493,12 +622,14 @@ def _cluster_data(self,
return np.array(cluster_labels)
- def generate_noise(self,
- X: np.ndarray,
- y: Union[List[int], np.ndarray],
- p: float = 0.2,
- type: str = "categorical",
- missing_val: Union[str, int, float] = float('-inf')) -> np.ndarray:
+ def generate_noise(
+ self,
+ X: np.ndarray,
+ y: list[int] | np.ndarray,
+ p: float = 0.2,
+ type: str = 'categorical',
+ missing_val: str | int | float = float('-inf'),
+ ) -> np.ndarray:
"""
Simulates noise on given dataset X
@@ -512,10 +643,10 @@ def generate_noise(self,
self.dataset_info['noise'].append({
'type': type,
- 'amount': p
+ 'amount': p,
})
- if type == "categorical":
+ if type == 'categorical':
label_values, label_count = np.unique(y, return_counts=True)
n_labels = len(label_values)
@@ -570,7 +701,7 @@ def generate_noise(self,
return X_noise
- elif type == "missing":
+ elif type == 'missing':
X_noise = np.copy(X)
Xn_T = X_noise.T
n = Xn_T.shape[1]
@@ -585,12 +716,14 @@ def generate_noise(self,
return Xn_T.T
- def downsample_dataset(self,
- X: np.array,
- y: Union[List[int], np.ndarray],
- N: Optional[Union[int, None]] = None,
- seed: int = 42,
- reshuffle: bool=False) -> Tuple[np.array, np.ndarray]:
+ def downsample_dataset(
+ self,
+ X: np.array,
+ y: list[int] | np.ndarray,
+ N: int | None = None,
+ seed: int = 42,
+ reshuffle: bool = False,
+ ) -> tuple[np.array, np.ndarray]:
"""
Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
@@ -609,16 +742,18 @@ def downsample_dataset(self,
N = min(counts)
if N > min(counts):
- raise ValueError("N must be equal to or less than the number of samples in minority class")
+ raise ValueError('N must be equal to or less than the number of samples in minority class')
X_arrays_list = []
y_downsampled = []
for label in values:
X_label = [X[i] for i in range(len(y)) if y[i] == label]
- X_label_downsample = resample(X_label,
- replace=True,
- n_samples=N,
- random_state=seed)
+ X_label_downsample = resample(
+ X_label,
+ replace=True,
+ n_samples=N,
+ random_state=seed,
+ )
X_arrays_list.append(X_label_downsample)
ys = [label] * N
y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
@@ -636,8 +771,8 @@ def downsample_dataset(self,
self.dataset_info.update({
'downsampling': {
'original_shape': original_shape,
- 'downsampled_shape': downsampled_shape
- }
+ 'downsampled_shape': downsampled_shape,
+ },
})
return X_downsampled, y_downsampled
@@ -659,44 +794,50 @@ def print_dataset(self, X, y):
print(arr[i], end='')
else:
print(arr[i], end=', ')
- print("], Label: {}".format(y[n]))
+ print(f'], Label: {y[n]}')
n += 1
-
def summarize(self):
print(f"Number of features: {self.dataset_info['general']['n_features']}")
print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
if 'downsampling' in self.dataset_info:
- print(f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}")
+ print(
+ f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}",
+ )
print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
-
print('-------------------------------------')
if len(self.dataset_info['combinations']) > 0:
- print("Combinations:")
+ print('Combinations:')
for comb in self.dataset_info['combinations']:
- print(f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}")
+ print(
+ f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}",
+ )
print('-------------------------------------')
if len(self.dataset_info['correlations']) > 0:
- print("Correlations:")
+ print('Correlations:')
for corr in self.dataset_info['correlations']:
- print(f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}")
+ print(
+ f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
+ )
print('-------------------------------------')
if len(self.dataset_info['duplicates']) > 0:
- print("Duplicates:")
+ print('Duplicates:')
for dup in self.dataset_info['duplicates']:
- print(f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}")
+ print(
+ f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
+ )
print('-------------------------------------')
if len(self.dataset_info['noise']) > 0:
- print("Simulated noise:")
+ print('Simulated noise:')
for noise in self.dataset_info['noise']:
print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}")
print('-------------------------------------')
- print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
\ No newline at end of file
+ print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py
index e5665f6..46ca8c4 100644
--- a/tests/cc_generator_tests.py
+++ b/tests/cc_generator_tests.py
@@ -1,6 +1,9 @@
-import pytest
+from __future__ import annotations
+
import numpy as np
+import pytest
from scipy.stats import pearsonr
+
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
@pytest.fixture
@@ -12,15 +15,15 @@ def test_init(cc_instance):
def test_generate_data_shape_and_type(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
- assert isinstance(X, np.ndarray), "Output should be a numpy array"
- assert X.shape == (100, 5), "Shape should be (n_samples, n_features)"
+ assert isinstance(X, np.ndarray), 'Output should be a numpy array'
+ assert X.shape == (100, 5), 'Shape should be (n_samples, n_features)'
def test_generate_data_cardinality(cc_instance):
n_features = 5
cardinality = 3
X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality)
unique_values = np.unique(X)
- assert len(unique_values) <= cardinality, "Cardinality not respected for all features"
+ assert len(unique_values) <= cardinality, 'Cardinality not respected for all features'
def test_generate_data_ensure_rep(cc_instance):
n_features = 5
@@ -30,35 +33,35 @@ def test_generate_data_ensure_rep(cc_instance):
assert len(unique_values) == cardinality, "Not all values represented when 'ensure_rep=True'"
def test_generate_feature_shape_and_type(cc_instance):
- feature = cc_instance._generate_feature(5, size=100)
- assert isinstance(feature, np.ndarray), "Output should be a numpy array"
- assert feature.shape == (100,), "Shape should be (size,)"
+ feature = cc_instance._generate_feature(100, cardinality=5)
+ assert isinstance(feature, np.ndarray), 'Output should be a numpy array'
+ assert feature.shape == (100,), 'Shape should be (size,)'
def test_generate_feature_cardinality(cc_instance):
- feature = cc_instance._generate_feature(5, size=100)
+ feature = cc_instance._generate_feature(100, cardinality=5)
unique_values = np.unique(feature)
- assert len(unique_values) <= 5, "Feature cardinality not respected for all features"
+ assert len(unique_values) <= 5, 'Feature cardinality not respected for all features'
def test_generate_feature_ensure_rep(cc_instance):
- feature = cc_instance._generate_feature(50, size=100, ensure_rep=True)
+ feature = cc_instance._generate_feature(100, cardinality=50, ensure_rep=True)
unique_values = np.unique(feature)
assert len(unique_values) == 50, "Not all values represented when using 'ensure_rep=True'"
def test_generate_feature_values(cc_instance):
values = [5, 6, 7, 8, 9, 10]
- feature = cc_instance._generate_feature(values, size=100)
+ feature = cc_instance._generate_feature(100, vec=values)
unique_values = np.unique(feature)
- assert any(f in feature for f in values), "Feature values not in input list"
+ assert any(f in feature for f in values), 'Feature values not in input list'
def test_generate_feature_values_ensure_rep(cc_instance):
values = [5, 6, 7, 8, 9, 10]
- feature = cc_instance._generate_feature(values, size=100, ensure_rep=True)
+ feature = cc_instance._generate_feature(100, vec=values, ensure_rep=True)
unique_values = np.unique(feature)
assert (values == unique_values).all(), "Feature values should match input list when 'ensure_rep=True'"
def test_generate_feature_density(cc_instance):
values = [0, 1, 2]
p = [0.2, 0.4, 0.4]
- feature = cc_instance._generate_feature(values, size=10000, ensure_rep=True, p=p)
+ feature = cc_instance._generate_feature(10000, vec=values, ensure_rep=True, p=p)
values, counts = np.unique(feature, return_counts=True)
generated_p = np.round(counts/10000, decimals=1)
assert (generated_p == p).all(), "Feature values should have density roughly equal to 'p'"
@@ -67,15 +70,15 @@ def test_generate_combinations_shape_and_type(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
indices = [0,1]
X = cc_instance.generate_combinations(X, indices, combination_type='linear')
- assert isinstance(X, np.ndarray), "Output should be a numpy array"
- assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)"
+ assert isinstance(X, np.ndarray), 'Output should be a numpy array'
+ assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)'
def test_generate_correlated_shape_and_type(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
indices = 0
X = cc_instance.generate_correlated(X, indices, r=0.8)
- assert isinstance(X, np.ndarray), "Output should be a numpy array"
- assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)"
+ assert isinstance(X, np.ndarray), 'Output should be a numpy array'
+ assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)'
def test_generate_correlated_correlaton(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
@@ -90,8 +93,8 @@ def test_generate_duplicates_shape_and_type(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
indices = 0
X = cc_instance.generate_duplicates(X, indices)
- assert isinstance(X, np.ndarray), "Output should be a numpy array"
- assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)"
+ assert isinstance(X, np.ndarray), 'Output should be a numpy array'
+ assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)'
def test_generate_duplicates_duplication(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
@@ -106,7 +109,7 @@ def test_xor_operation(cc_instance):
arr = [a, b]
result = cc_instance._xor(arr)
expected = np.array([1, 1, 0])
- assert np.array_equal(result, expected), "XOR operation did not produce expected result"
+ assert np.array_equal(result, expected), 'XOR operation did not produce expected result'
def test_and_operation(cc_instance):
a = np.array([1, 0, 1])
@@ -114,7 +117,7 @@ def test_and_operation(cc_instance):
arr = [a, b]
result = cc_instance._and(arr)
expected = np.array([0, 0, 1])
- assert np.array_equal(result, expected), "AND operation did not produce expected result"
+ assert np.array_equal(result, expected), 'AND operation did not produce expected result'
def test_or_operation(cc_instance):
a = np.array([1, 0, 1])
@@ -122,13 +125,13 @@ def test_or_operation(cc_instance):
arr = [a, b]
result = cc_instance._or(arr)
expected = np.array([1, 1, 1])
- assert np.array_equal(result, expected), "OR operation did not produce expected result"
+ assert np.array_equal(result, expected), 'OR operation did not produce expected result'
def test_generate_labels_shape_and_type(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
labels = cc_instance.generate_labels(X)
- assert isinstance(labels, np.ndarray), "Output should be a numpy array"
- assert labels.shape == (100,), "Shape should be (n_samples,)"
+ assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
+ assert labels.shape == (100,), 'Shape should be (n_samples,)'
def test_generate_labels_distribution(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
@@ -136,22 +139,22 @@ def test_generate_labels_distribution(cc_instance):
unique, counts = np.unique(labels, return_counts=True)
distribution = counts / 100
expected_distribution = np.array([0.2, 0.3, 0.5])
- assert np.allclose(distribution, expected_distribution, atol=0.1), "Label distribution does not match expected distribution"
+ assert np.allclose(distribution, expected_distribution, atol=0.1), 'Label distribution does not match expected distribution'
def test_generate_labels_class_relation_linear(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
labels = cc_instance.generate_labels(X, class_relation='linear')
- assert isinstance(labels, np.ndarray), "Output should be a numpy array"
- assert labels.shape == (100,), "Shape should be (n_samples,)"
+ assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
+ assert labels.shape == (100,), 'Shape should be (n_samples,)'
def test_generate_labels_class_relation_nonlinear(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
labels = cc_instance.generate_labels(X, class_relation='nonlinear')
- assert isinstance(labels, np.ndarray), "Output should be a numpy array"
- assert labels.shape == (100,), "Shape should be (n_samples,)"
+ assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
+ assert labels.shape == (100,), 'Shape should be (n_samples,)'
def test_generate_labels_class_relation_cluster(cc_instance):
X = cc_instance.generate_data(n_features=5, n_samples=100)
labels = cc_instance.generate_labels(X, class_relation='cluster', balance=True)
- assert isinstance(labels, np.ndarray), "Output should be a numpy array"
- assert labels.shape == (100,), "Shape should be (n_samples,)"
\ No newline at end of file
+ assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
+ assert labels.shape == (100,), 'Shape should be (n_samples,)'
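
Note: the hunks above move `_generate_feature` to a size-first calling convention, with value sets, cardinality and densities passed as keywords. A minimal sketch of the updated calls, mirroring the tests above (illustrative only, not part of the patch):

```python
import numpy as np

from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()

# size is the first positional argument; everything else is a keyword
f1 = cc._generate_feature(100, cardinality=5)                       # default ~normal density
f2 = cc._generate_feature(100, vec=[5, 6, 7, 8], ensure_rep=True)   # every listed value appears
f3 = cc._generate_feature(10000, vec=[0, 1, 2], p=[0.2, 0.4, 0.4])  # explicit densities

assert isinstance(f1, np.ndarray) and f1.shape == (100,)
```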
From d0d50976eb6116e1efb6f0a8519cf5da52b11925 Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Tue, 9 Jul 2024 10:22:59 +0200
Subject: [PATCH 4/9] Rewrote tests with unittest instead of pytest
---
tests/cc_generator_tests.py | 311 ++++++++++++++++++------------------
1 file changed, 158 insertions(+), 153 deletions(-)
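
The rewrite below applies one mechanical pattern throughout: the module-level pytest fixture becomes `unittest.TestCase.setUp`, and bare `assert` statements become `assert*` methods. An illustrative sketch with placeholder names (not part of the patch):

```python
import unittest


class ExampleMigration(unittest.TestCase):
    # replaces the @pytest.fixture that returned cc_instance
    def setUp(self):
        self.value = 42

    # replaces: def test_value(cc_instance): assert cc_instance.value == 42, 'message'
    def test_value(self):
        self.assertEqual(self.value, 42, 'message')
```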
diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py
index 46ca8c4..16cb7b2 100644
--- a/tests/cc_generator_tests.py
+++ b/tests/cc_generator_tests.py
@@ -1,160 +1,165 @@
from __future__ import annotations
+import unittest
+
import numpy as np
-import pytest
from scipy.stats import pearsonr
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
-@pytest.fixture
-def cc_instance():
- return CategoricalClassification()
-
-def test_init(cc_instance):
- assert cc_instance.dataset_info == ''
-
-def test_generate_data_shape_and_type(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- assert isinstance(X, np.ndarray), 'Output should be a numpy array'
- assert X.shape == (100, 5), 'Shape should be (n_samples, n_features)'
-
-def test_generate_data_cardinality(cc_instance):
- n_features = 5
- cardinality = 3
- X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality)
- unique_values = np.unique(X)
- assert len(unique_values) <= cardinality, 'Cardinality not respected for all features'
-
-def test_generate_data_ensure_rep(cc_instance):
- n_features = 5
- cardinality = 50
- X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality, ensure_rep=True)
- unique_values = np.unique(X)
- assert len(unique_values) == cardinality, "Not all values represented when 'ensure_rep=True'"
-
-def test_generate_feature_shape_and_type(cc_instance):
- feature = cc_instance._generate_feature(100, cardinality=5)
- assert isinstance(feature, np.ndarray), 'Output should be a numpy array'
- assert feature.shape == (100,), 'Shape should be (size,)'
-
-def test_generate_feature_cardinality(cc_instance):
- feature = cc_instance._generate_feature(100, cardinality=5)
- unique_values = np.unique(feature)
- assert len(unique_values) <= 5, 'Feature cardinality not respected for all features'
-
-def test_generate_feature_ensure_rep(cc_instance):
- feature = cc_instance._generate_feature(100, cardinality=50, ensure_rep=True)
- unique_values = np.unique(feature)
- assert len(unique_values) == 50, "Not all values represented when using 'ensure_rep=True'"
-
-def test_generate_feature_values(cc_instance):
- values = [5, 6, 7, 8, 9, 10]
- feature = cc_instance._generate_feature(100, vec=values)
- unique_values = np.unique(feature)
- assert any(f in feature for f in values), 'Feature values not in input list'
-def test_generate_feature_values_ensure_rep(cc_instance):
- values = [5, 6, 7, 8, 9, 10]
- feature = cc_instance._generate_feature(100, vec=values, ensure_rep=True)
- unique_values = np.unique(feature)
- assert (values == unique_values).all(), "Feature values should match input list when 'ensure_rep=True'"
-
-def test_generate_feature_density(cc_instance):
- values = [0, 1, 2]
- p = [0.2, 0.4, 0.4]
- feature = cc_instance._generate_feature(10000, vec=values, ensure_rep=True, p=p)
- values, counts = np.unique(feature, return_counts=True)
- generated_p = np.round(counts/10000, decimals=1)
- assert (generated_p == p).all(), "Feature values should have density roughly equal to 'p'"
-
-def test_generate_combinations_shape_and_type(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- indices = [0,1]
- X = cc_instance.generate_combinations(X, indices, combination_type='linear')
- assert isinstance(X, np.ndarray), 'Output should be a numpy array'
- assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)'
-
-def test_generate_correlated_shape_and_type(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- indices = 0
- X = cc_instance.generate_correlated(X, indices, r=0.8)
- assert isinstance(X, np.ndarray), 'Output should be a numpy array'
- assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)'
-
-def test_generate_correlated_correlaton(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- indices = 0
- X = cc_instance.generate_correlated(X, indices, r=0.8)
- Xt = X.T
- corr, _ = pearsonr(Xt[0], Xt[5])
- assert np.round(corr, decimals=1) == 0.8, "Resultant correlation should be equal to the 'r' parameter"
-
-
-def test_generate_duplicates_shape_and_type(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- indices = 0
- X = cc_instance.generate_duplicates(X, indices)
- assert isinstance(X, np.ndarray), 'Output should be a numpy array'
- assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)'
-
-def test_generate_duplicates_duplication(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- indices = 0
- X = cc_instance.generate_duplicates(X, indices)
- Xt = X.T
- assert (Xt[0] == Xt[-1]).all()
-
-def test_xor_operation(cc_instance):
- a = np.array([1, 0, 1])
- b = np.array([0, 1, 1])
- arr = [a, b]
- result = cc_instance._xor(arr)
- expected = np.array([1, 1, 0])
- assert np.array_equal(result, expected), 'XOR operation did not produce expected result'
-
-def test_and_operation(cc_instance):
- a = np.array([1, 0, 1])
- b = np.array([0, 1, 1])
- arr = [a, b]
- result = cc_instance._and(arr)
- expected = np.array([0, 0, 1])
- assert np.array_equal(result, expected), 'AND operation did not produce expected result'
-
-def test_or_operation(cc_instance):
- a = np.array([1, 0, 1])
- b = np.array([0, 1, 1])
- arr = [a, b]
- result = cc_instance._or(arr)
- expected = np.array([1, 1, 1])
- assert np.array_equal(result, expected), 'OR operation did not produce expected result'
-
-def test_generate_labels_shape_and_type(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- labels = cc_instance.generate_labels(X)
- assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
- assert labels.shape == (100,), 'Shape should be (n_samples,)'
-
-def test_generate_labels_distribution(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- labels = cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5])
- unique, counts = np.unique(labels, return_counts=True)
- distribution = counts / 100
- expected_distribution = np.array([0.2, 0.3, 0.5])
- assert np.allclose(distribution, expected_distribution, atol=0.1), 'Label distribution does not match expected distribution'
-
-def test_generate_labels_class_relation_linear(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- labels = cc_instance.generate_labels(X, class_relation='linear')
- assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
- assert labels.shape == (100,), 'Shape should be (n_samples,)'
-
-def test_generate_labels_class_relation_nonlinear(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- labels = cc_instance.generate_labels(X, class_relation='nonlinear')
- assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
- assert labels.shape == (100,), 'Shape should be (n_samples,)'
-
-def test_generate_labels_class_relation_cluster(cc_instance):
- X = cc_instance.generate_data(n_features=5, n_samples=100)
- labels = cc_instance.generate_labels(X, class_relation='cluster', balance=True)
- assert isinstance(labels, np.ndarray), 'Output should be a numpy array'
- assert labels.shape == (100,), 'Shape should be (n_samples,)'
+class TestCategoricalClassification(unittest.TestCase):
+
+ def setUp(self):
+ self.cc_instance = CategoricalClassification()
+
+ def test_init(self):
+        self.assertIsInstance(self.cc_instance.dataset_info, dict, 'dataset_info should initialise to a dict')
+
+ def test_generate_data_shape_and_type(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(X.shape, (100, 5), 'Shape should be (n_samples, n_features)')
+
+ def test_generate_data_cardinality(self):
+ n_features = 5
+ cardinality = 3
+ X = self.cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality)
+ unique_values = np.unique(X)
+ self.assertLessEqual(len(unique_values), cardinality, 'Cardinality not respected for all features')
+
+ def test_generate_data_ensure_rep(self):
+ n_features = 5
+ cardinality = 50
+ X = self.cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality, ensure_rep=True)
+ unique_values = np.unique(X)
+ self.assertEqual(len(unique_values), cardinality, "Not all values represented when 'ensure_rep=True'")
+
+ def test_generate_feature_shape_and_type(self):
+ feature = self.cc_instance._generate_feature(100, cardinality=5)
+ self.assertIsInstance(feature, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(feature.shape, (100,), 'Shape should be (size,)')
+
+ def test_generate_feature_cardinality(self):
+ feature = self.cc_instance._generate_feature(100, cardinality=5)
+ unique_values = np.unique(feature)
+ self.assertLessEqual(len(unique_values), 5, 'Feature cardinality not respected for all features')
+
+ def test_generate_feature_ensure_rep(self):
+ feature = self.cc_instance._generate_feature(100, cardinality=50, ensure_rep=True)
+ unique_values = np.unique(feature)
+ self.assertEqual(len(unique_values), 50, "Not all values represented when using 'ensure_rep=True'")
+
+ def test_generate_feature_values(self):
+ values = [5, 6, 7, 8, 9, 10]
+ feature = self.cc_instance._generate_feature(100, vec=values)
+ unique_values = np.unique(feature)
+        self.assertTrue(np.isin(unique_values, values).all(), 'Feature values not in input list')
+
+ def test_generate_feature_values_ensure_rep(self):
+ values = [5, 6, 7, 8, 9, 10]
+ feature = self.cc_instance._generate_feature(100, vec=values, ensure_rep=True)
+ unique_values = np.unique(feature)
+ self.assertTrue(np.array_equal(values, unique_values), "Feature values should match input list when 'ensure_rep=True'")
+
+ def test_generate_feature_density(self):
+ values = [0, 1, 2]
+ p = [0.2, 0.4, 0.4]
+ feature = self.cc_instance._generate_feature(10000, vec=values, ensure_rep=True, p=p)
+ values, counts = np.unique(feature, return_counts=True)
+ generated_p = np.round(counts/10000, decimals=1)
+ self.assertTrue(np.array_equal(generated_p, p), "Feature values should have density roughly equal to 'p'")
+
+ def test_generate_combinations_shape_and_type(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = [0,1]
+ X = self.cc_instance.generate_combinations(X, indices, combination_type='linear')
+ self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(X.shape, (100, 6), 'Shape should be (n_samples, n_features + 1)')
+
+ def test_generate_correlated_shape_and_type(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = self.cc_instance.generate_correlated(X, indices, r=0.8)
+ self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(X.shape, (100, 6), 'Shape should be (n_samples, n_features + 1)')
+
+    def test_generate_correlated_correlation(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = self.cc_instance.generate_correlated(X, indices, r=0.8)
+ Xt = X.T
+ corr, _ = pearsonr(Xt[0], Xt[5])
+        self.assertAlmostEqual(np.round(corr, decimals=1), 0.8, msg="Resultant correlation should be equal to the 'r' parameter")
+
+ def test_generate_duplicates_shape_and_type(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = self.cc_instance.generate_duplicates(X, indices)
+ self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(X.shape, (100, 6), 'Shape should be (n_samples, n_features + 1)')
+
+ def test_generate_duplicates_duplication(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ indices = 0
+ X = self.cc_instance.generate_duplicates(X, indices)
+ Xt = X.T
+ self.assertTrue((Xt[0] == Xt[-1]).all())
+
+ def test_xor_operation(self):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = [a, b]
+ result = self.cc_instance._xor(arr)
+ expected = np.array([1, 1, 0])
+ self.assertTrue(np.array_equal(result, expected), 'XOR operation did not produce expected result')
+
+ def test_and_operation(self):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = [a, b]
+ result = self.cc_instance._and(arr)
+ expected = np.array([0, 0, 1])
+ self.assertTrue(np.array_equal(result, expected), 'AND operation did not produce expected result')
+
+ def test_or_operation(self):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = [a, b]
+ result = self.cc_instance._or(arr)
+ expected = np.array([1, 1, 1])
+ self.assertTrue(np.array_equal(result, expected), 'OR operation did not produce expected result')
+
+ def test_generate_labels_shape_and_type(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = self.cc_instance.generate_labels(X)
+ self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
+
+ def test_generate_labels_distribution(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = self.cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5])
+ unique, counts = np.unique(labels, return_counts=True)
+ distribution = counts / 100
+ expected_distribution = np.array([0.2, 0.3, 0.5])
+ self.assertTrue(np.allclose(distribution, expected_distribution, atol=0.1), 'Label distribution does not match expected distribution')
+
+ def test_generate_labels_class_relation_linear(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = self.cc_instance.generate_labels(X, class_relation='linear')
+ self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
+
+ def test_generate_labels_class_relation_nonlinear(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = self.cc_instance.generate_labels(X, class_relation='nonlinear')
+ self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
+
+ def test_generate_labels_class_relation_cluster(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = self.cc_instance.generate_labels(X, class_relation='cluster', balance=True)
+ self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array')
+ self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
+
+if __name__ == '__main__':
+ unittest.main()
From 79eb4de39200efc1548411ed0a0d6921eb5404f5 Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Tue, 9 Jul 2024 10:43:40 +0200
Subject: [PATCH 5/9] Removed the if __name__ == '__main__' block from the test
 file; small fix in the cluster test
---
tests/cc_generator_tests.py | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
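
With the `__main__` block removed, the suite is meant to be driven by a test runner. One standard-library way to discover and run it programmatically, assuming the file stays under `tests/` (illustrative only):

```python
import unittest

# discover tests/cc_generator_tests.py and run it with a plain text runner
suite = unittest.defaultTestLoader.discover('tests', pattern='cc_generator_tests.py')
unittest.TextTestRunner(verbosity=2).run(suite)
```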
diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py
index 16cb7b2..860b079 100644
--- a/tests/cc_generator_tests.py
+++ b/tests/cc_generator_tests.py
@@ -157,9 +157,6 @@ def test_generate_labels_class_relation_nonlinear(self):
def test_generate_labels_class_relation_cluster(self):
X = self.cc_instance.generate_data(n_features=5, n_samples=100)
- labels = self.cc_instance.generate_labels(X, class_relation='cluster', balance=True)
+ labels = self.cc_instance.generate_labels(X, class_relation='cluster')
self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array')
- self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
-
-if __name__ == '__main__':
- unittest.main()
+ self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
\ No newline at end of file
From 1e50ee70541fdefb42999d9bbf873a67fa4b302c Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Thu, 11 Jul 2024 20:10:08 +0200
Subject: [PATCH 6/9] Code review fixes
Renamed _feature_builder -> _configure_generate_feature.
Replaced np.ndarray input typing with ArrayLike from numpy.typing; other typing fixes (see the sketch below).
---
.../synthetic_data_generators/cc_generator.py | 82 ++++++++++---------
tests/cc_generator_tests.py | 2 +-
2 files changed, 43 insertions(+), 41 deletions(-)
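
A minimal sketch of the typing convention adopted below; the function and its names are hypothetical, purely to show the pattern: accept `ArrayLike` for inputs, return a concrete `np.ndarray`, and constrain string options with a parameterised `Literal`:

```python
from __future__ import annotations

from typing import Literal

import numpy as np
from numpy.typing import ArrayLike


# hypothetical helper, not part of cc_generator: ArrayLike in, ndarray out
def scale(X: ArrayLike, mode: Literal['double', 'halve'] = 'double') -> np.ndarray:
    arr = np.asarray(X)  # ArrayLike accepts lists, tuples and ndarrays alike
    return arr * 2 if mode == 'double' else arr / 2


scale([1, 2, 3])                   # a plain list is a valid input
scale(np.arange(3), mode='halve')  # an ndarray works the same way
```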
diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py
index dd148a5..f370580 100644
--- a/outrank/algorithms/synthetic_data_generators/cc_generator.py
+++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py
@@ -1,11 +1,13 @@
from __future__ import annotations
from typing import List
+from typing import Literal
from typing import Optional
from typing import Tuple
from typing import Union
import numpy as np
+from numpy.typing import ArrayLike
from scipy.linalg import qr
from scipy.stats import norm
from sklearn.cluster import KMeans
@@ -32,7 +34,7 @@ def generate_data(
n_features: int,
n_samples: int,
cardinality: int = 5,
- structure: list | np.ndarray | None = None,
+ structure: list | ArrayLike | None = None,
ensure_rep: bool = False,
random_values: bool | None = False,
low: int | None = 0,
@@ -41,7 +43,7 @@ def generate_data(
) -> np.ndarray:
"""
- Generates dataset based on parameters
+ Generates dataset based on given parameters
:param n_features: number of generated features
:param n_samples: number of generated samples
:param cardinality: default cardinality of the dataset
@@ -102,7 +104,7 @@ def generate_data(
X[ix] = x
ix += 1
- x = self._feature_builder(
+ x = self._configure_generate_feature(
feature_attributes,
n_samples,
ensure_rep=ensure_rep,
@@ -115,8 +117,7 @@ def generate_data(
else:
# Data in structure is a tuple of (list of feature indexes, feature attributes)
- feature_ixs = data[0]
- feature_attributes = data[1]
+ feature_ixs, feature_attributes = data
for feature_ix in feature_ixs:
# Filling out the dataset up to feature_ix
@@ -133,7 +134,7 @@ def generate_data(
X[ix] = x
ix += 1
- x = self._feature_builder(
+ x = self._configure_generate_feature(
feature_attributes,
n_samples,
ensure_rep=ensure_rep,
@@ -160,9 +161,9 @@ def generate_data(
return X.T
- def _feature_builder(
+ def _configure_generate_feature(
self,
- feature_attributes: int | list | np.ndarray,
+ feature_attributes: int | list | ArrayLike,
n_samples: int,
ensure_rep: bool = False,
random_values: bool | None = False,
@@ -171,7 +172,7 @@ def _feature_builder(
) -> np.ndarray:
"""
- Helper function to avoid duplicate code, builds feature
+ Helper function, calls _generate_feature with appropriate parameters based on feature_attributes
:param feature_attributes: either integer (cardinality) or list of feature attributes
:param n_samples: number of samples in dataset
:param ensure_rep: ensures all values are represented at least once in the feature vector
@@ -216,7 +217,7 @@ def _feature_builder(
def _generate_feature(
self,
size: int,
- vec: list[int] | np.ndarray | None = None,
+ vec: list[int] | ArrayLike | None = None,
cardinality: int = 5,
ensure_rep: bool = False,
random_values: bool | None = False,
@@ -225,7 +226,7 @@ def _generate_feature(
p: list[float] | np.ndarray | None = None,
) -> np.ndarray:
"""
- Generates feature vector of length size. Default probability density distribution is approx. normal, centred around a randomly picked value.
+ Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value.
:param vec: list of feature values
:param cardinality: single value cardinality
:param size: length of feature vector
@@ -264,10 +265,10 @@ def _generate_feature(
def generate_combinations(
self,
- X: np.ndarray,
- feature_indices: list[int] | np.ndarray,
+ X: ArrayLike,
+ feature_indices: list[int] | ArrayLike,
combination_function: Optional = None,
- combination_type: str = 'linear',
+        combination_type: Literal['linear', 'nonlinear'] = 'linear',
) -> np.ndarray:
"""
Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
@@ -300,11 +301,10 @@ def generate_combinations(
return np.column_stack((X, combination_result))
- def _xor(self, arr):
+ def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray:
"""
Performs bitwise XOR operation on two integer arrays
- :param a: array
- :param b: array
+ :param arr: features to perform XOR operation on
:return: bitwise XOR result
"""
arrT = arr.T
@@ -316,11 +316,10 @@ def _xor(self, arr):
return out.T
- def _and(self, arr):
+ def _and(self, arr: list[int] | ArrayLike) -> np.ndarray:
"""
Performs bitwise AND operation on two integer arrays
- :param a: array
- :param b: array
+ :param arr: features to perform AND operation on
:return: bitwise AND result
"""
arrT = arr.T
@@ -332,11 +331,10 @@ def _and(self, arr):
return out.T
- def _or(self, arr):
+ def _or(self, arr: list[int] | ArrayLike) -> np.ndarray:
"""
Performs bitwise OR operation on two integer arrays
- :param a: array
- :param b: array
+ :param arr: features to perform OR operation on
:return: bitwise OR result
"""
arrT = arr.T
@@ -350,8 +348,8 @@ def _or(self, arr):
def generate_correlated(
self,
- X: np.ndarray,
- feature_indices: list[int] | np.ndarray,
+ X: ArrayLike,
+ feature_indices: list[int] | ArrayLike,
r: float = 0.8,
) -> np.ndarray:
@@ -408,8 +406,8 @@ def generate_correlated(
def generate_duplicates(
self,
- X: np.ndarray,
- feature_indices: list[int] | np.ndarray,
+ X: ArrayLike,
+ feature_indices: list[int] | ArrayLike,
) -> np.ndarray:
"""
Generates duplicate features
@@ -433,9 +431,9 @@ def generate_duplicates(
def generate_labels(
self,
- X: np.ndarray,
+ X: ArrayLike,
n: int = 2,
- p: float | list[float] | np.ndarray = 0.5,
+ p: float | list[float] | ArrayLike = 0.5,
k: int | float = 2,
decision_function: Optional = None,
class_relation: str = 'linear',
@@ -528,9 +526,9 @@ def generate_labels(
def _cluster_data(
self,
- X: np.ndarray,
+ X: ArrayLike,
n: int,
- p: float | list[float] | np.ndarray | None = 1.0,
+ p: float | list[float] | ArrayLike | None = 1.0,
balance: bool = False,
) -> np.ndarray:
"""
@@ -624,10 +622,10 @@ def _cluster_data(
def generate_noise(
self,
- X: np.ndarray,
- y: list[int] | np.ndarray,
+ X: ArrayLike,
+ y: list[int] | ArrayLike,
p: float = 0.2,
- type: str = 'categorical',
+        type: Literal['categorical', 'missing'] = 'categorical',
missing_val: str | int | float = float('-inf'),
) -> np.ndarray:
@@ -718,12 +716,12 @@ def generate_noise(
def downsample_dataset(
self,
- X: np.array,
- y: list[int] | np.ndarray,
- N: int | None | None = None,
+ X: ArrayLike,
+ y: list[int] | ArrayLike,
+ N: int | None = None,
seed: int = 42,
reshuffle: bool = False,
- ) -> tuple[np.array, np.ndarray]:
+ ) -> tuple[np.ndarray, np.ndarray]:
"""
Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
@@ -777,7 +775,11 @@ def downsample_dataset(
return X_downsampled, y_downsampled
- def print_dataset(self, X, y):
+ def print_dataset(
+ self,
+ X: ArrayLike,
+ y: ArrayLike,
+ ):
"""
Prints given dataset
:param X: dataset
@@ -803,7 +805,7 @@ def summarize(self):
print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
if self.dataset_info['downsampling']:
print(
- f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}",
+ f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
)
print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py
index 860b079..1cc0796 100644
--- a/tests/cc_generator_tests.py
+++ b/tests/cc_generator_tests.py
@@ -159,4 +159,4 @@ def test_generate_labels_class_relation_cluster(self):
X = self.cc_instance.generate_data(n_features=5, n_samples=100)
labels = self.cc_instance.generate_labels(X, class_relation='cluster')
self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array')
- self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
\ No newline at end of file
+ self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')
From fc029a26ec59e226e53d34e653e0d1c2305c9510 Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Fri, 12 Jul 2024 10:41:45 +0200
Subject: [PATCH 7/9] Added documentation for feature generation
Small demo code in DOCSMAIN as well as pdoc entry
---
docs/DOCSMAIN.md | 20 +
docs/outrank.html | 38 +-
docs/outrank/algorithms.html | 12 +-
docs/outrank/algorithms/feature_ranking.html | 12 +-
.../feature_ranking/ranking_mi_numba.html | 10 +-
.../algorithms/importance_estimator.html | 624 ++--
docs/outrank/algorithms/sketches.html | 12 +-
.../algorithms/sketches/counting_cms.html | 554 ++++
.../sketches/counting_counters_ordinary.html | 413 +++
.../sketches/counting_ultiloglog.html | 52 +-
.../algorithms/synthetic_data_generators.html | 13 +-
.../cc_generator.html | 2832 ++++++++++++++++
.../generator_naive.html | 128 +-
docs/outrank/core_ranking.html | 2837 ++++++++---------
docs/outrank/core_selftest.html | 6 +-
docs/outrank/core_utils.html | 1196 +++----
docs/outrank/feature_transformations.html | 12 +-
.../feature_transformer_vault.html | 23 +-
.../default_transformers.html | 93 +-
.../fw_transformers.html | 89 +-
.../ranking_transformers.html | 62 +-
docs/outrank/task_generators.html | 14 +-
docs/outrank/task_instance_ranking.html | 521 +++
docs/outrank/task_ranking.html | 10 +-
docs/outrank/task_selftest.html | 32 +-
docs/outrank/task_summary.html | 160 +-
docs/outrank/task_visualization.html | 10 +-
docs/outrank/visualizations.html | 12 +-
.../visualizations/ranking_visualization.html | 18 +-
docs/search.js | 4 +-
30 files changed, 7161 insertions(+), 2658 deletions(-)
create mode 100644 docs/outrank/algorithms/sketches/counting_cms.html
create mode 100644 docs/outrank/algorithms/sketches/counting_counters_ordinary.html
create mode 100644 docs/outrank/algorithms/synthetic_data_generators/cc_generator.html
create mode 100644 docs/outrank/task_instance_ranking.html
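
For context alongside the demo added below, a slightly fuller end-to-end sketch composed from the public API touched in this series (parameter values are illustrative only):

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()

X = cc.generate_data(10, 1000, cardinality=5, ensure_rep=True)
X = cc.generate_combinations(X, [0, 1], combination_type='linear')  # appends a combined feature
X = cc.generate_correlated(X, 0, r=0.8)   # appends a column correlated with column 0
X = cc.generate_duplicates(X, 0)          # appends an exact copy of column 0
y = cc.generate_labels(X, n=2, class_relation='cluster')
X_noisy = cc.generate_noise(X, y, p=0.2)  # categorical noise by default
X_small, y_small = cc.downsample_dataset(X_noisy, y, seed=42)
```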
diff --git a/docs/DOCSMAIN.md b/docs/DOCSMAIN.md
index 1b6681a..fae8ec4 100644
--- a/docs/DOCSMAIN.md
+++ b/docs/DOCSMAIN.md
@@ -64,3 +64,23 @@ scores = [lowest_score, medium_score, high_score]
sorted_score_indices = np.argsort(scores)
assert np.sum(np.array([0, 1, 2]) - sorted_score_indices) == 0
```
+---
+## Creating a simple dataset
+```python
+from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
+
+cc = CategoricalClassification()
+
+# Creates a simple dataset of 9 features and 10k samples, where every feature has a cardinality of 35
+X = cc.generate_data(9,
+ 10000,
+ cardinality=35,
+ ensure_rep=True,
+ random_values=True,
+ low=0,
+ high=40)
+
+# Creates target labels via clustering
+y = cc.generate_labels(X, n=2, class_relation='cluster')
+
+```
\ No newline at end of file
diff --git a/docs/outrank.html b/docs/outrank.html
index 461d09d..6af2d42 100644
--- a/docs/outrank.html
+++ b/docs/outrank.html
@@ -3,7 +3,7 @@
-
+
outrank API documentation
@@ -26,7 +26,10 @@ Contents
Welcome to OutRank's documentation!
Setup
Example use cases
- OutRank as a Python library
+ OutRank as a Python library
+
@@ -38,6 +41,7 @@ Submodules
core_utils
feature_transformations
task_generators
+ task_instance_ranking
task_ranking
task_selftest
task_summary
@@ -129,6 +133,29 @@ OutRank as a Python library
assert np.sum(np.array([0, 1, 2]) - sorted_score_indices) == 0
+
+
+
+Creating a simple dataset
+
+
+
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
+
+cc = CategoricalClassification()
+
+# Creates a simple dataset of 9 features and 10k samples, where every feature has a cardinality of 35
+X = cc.generate_data(9,
+ 10000,
+ cardinality=35,
+ ensure_rep=True,
+ random_values=True,
+ low=0,
+ high=40)
+
+# Creates target labels via clustering
+y = cc.generate_labels(X, n=2, class_relation='cluster')
+
+
@@ -137,8 +164,9 @@ OutRank as a Python library
1"""
2.. include:: ../docs/DOCSMAIN.md
-3"""
-4from __future__ import annotations
+3.. include:: ../docs/generator_docs.md
+4"""
+5from __future__ import annotations
@@ -326,4 +354,4 @@ OutRank as a Python library
}
});
-