From 1118ad12f77a4ab9e1a1774cfea4df36c1d71305 Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Thu, 4 Jul 2024 09:04:52 +0200 Subject: [PATCH 1/9] added complex synthetic feature generators Added a suite of functions related to synthetic feature generation. --- .idea/.gitignore | 8 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 7 + .idea/modules.xml | 8 + .idea/outrank.iml | 14 + .idea/vcs.xml | 6 + .../synthetic_data_generators/cc_generator.py | 702 ++++++++++++++++++ tests/cc_generator_tests.py | 157 ++++ 8 files changed, 908 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/outrank.iml create mode 100644 .idea/vcs.xml create mode 100644 outrank/algorithms/synthetic_data_generators/cc_generator.py create mode 100644 tests/cc_generator_tests.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..877d184 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..3c2f566 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/outrank.iml b/.idea/outrank.iml new file mode 100644 index 0000000..8e5446a --- /dev/null +++ b/.idea/outrank.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py new file mode 100644 index 0000000..2f00f7e --- /dev/null +++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py @@ -0,0 +1,702 @@ +import numpy as np +from scipy.linalg import qr +from scipy.stats import norm +from sklearn.cluster import KMeans +from sklearn.utils import resample +from typing import List, Union, Optional, Tuple +class CategoricalClassification: + + def __init__(self): + self.dataset_info = { + 'general': {}, + 'combinations': [], + 'correlations': [], + 'duplicates': [], + 'labels': [], + 'noise': [] + } + + def __repr__(self): + return f"CategoricalClassification(dataset_info={self.dataset_info})" + + def generate_data(self, + n_features: int, + n_samples: int, + cardinality: int = 5, + structure: Optional = None, + ensure_rep: bool = False, + seed: int = 42) -> np.ndarray: + + """ + Generates dataset based on parameters + :param n_features: number of generated features + :param n_samples: number of generated samples + :param cardinality: default cardinality of the dataset + :param structure: structure of the dataset + :param ensure_rep: 
flag, ensures all given values represented + :param seed: sets seed of numpy random + :return: X, 2D dataset + """ + + self.dataset_info.update({ + 'general': { + 'n_features': n_features, + 'n_samples': n_samples, + 'cardinality': cardinality, + 'structure': structure, + 'ensure_rep': ensure_rep, + 'seed': seed + } + }) + + np.random.seed(seed) + X = np.empty([n_features, n_samples]) + + if structure == None: + + for i in range(n_features): + x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + X[i] = x + + else: + + ix = 0 + for data in structure: + + if not isinstance(data[0], (list, np.ndarray)): + feature_ix = data[0] + feature_cardinality = data[1] + + if ix < feature_ix: + for i in range(ix, feature_ix): + x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + X[ix] = x + ix += 1 + + if not isinstance(feature_cardinality, (list, np.ndarray)): + x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep) + else: + if isinstance(feature_cardinality[0], (list, np.ndarray)): + value_domain = feature_cardinality[0] + value_frequencies = feature_cardinality[1] + x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies) + else: + value_domain = feature_cardinality + x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep) + X[ix] = x + ix += 1 + + else: + feature_ixs = data[0] + feature_cardinality = data[1] + for feature_ix in feature_ixs: + if ix < feature_ix: + for i in range(ix, feature_ix): + x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + X[ix] = x + ix += 1 + + if not isinstance(feature_cardinality, (list, np.ndarray)): + x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep) + else: + value_domain = feature_cardinality[0] + value_frequencies = feature_cardinality[1] + x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies) + X[ix] = x + ix += 1 + + if ix < n_features: + for i in range(ix, n_features): + x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + X[i] = x + + return X.T + + def _generate_feature(self, + v: Union[int, List[int], np.ndarray], + size: int, + ensure_rep: bool = False, + p: Optional[Union[List[float], np.ndarray]] = None) -> np.ndarray: + """ + Generates feature vector of length size. Default probability density distribution is approx. normal, centred around randomly picked value. 
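+
+        Illustrative sketch (assumes an instance `cc`; exact draws vary with
+        numpy's random state, so only the shape and value domain are stable):
+
+            x = cc._generate_feature([1, 2, 3], size=100, ensure_rep=True)
+            # x.shape == (100,); set(x) == {1, 2, 3}
+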
+        :param v: either int for cardinality, or list of values
+        :param size: length of feature vector
+        :param ensure_rep: ensures all values are represented at least once in the feature vector
+        :param p: list of probabilities of each value
+        :return:
+        """
+        if not isinstance(v, (list, np.ndarray)):
+            v = np.arange(0, v, 1)
+        else:
+            v = np.array(v)
+
+        if p is None:
+            v_shift = v - v[np.random.randint(len(v))]
+            p = norm.pdf(v_shift, scale=3)
+        else:
+            p = np.array(p)
+
+        p = p / p.sum()
+
+        if ensure_rep and len(v) < size:
+            sampled_values = np.random.choice(v, size=(size - len(v)), p=p)
+            sampled_values = np.append(sampled_values, v)
+        else:
+            sampled_values = np.random.choice(v, size=size, p=p)
+
+        np.random.shuffle(sampled_values)
+        return sampled_values
+
+    def generate_combinations(self,
+                              X: np.ndarray,
+                              feature_indices: Union[List[int], np.ndarray],
+                              combination_function: Optional = None,
+                              combination_type: str ='linear') -> np.ndarray:
+        """
+        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
+        :param X: dataset
+        :param feature_indices: indices of features to be in combination
+        :param combination_function: optional custom function for combining feature vectors
+        :param combination_type: string flag, either 'linear' or 'nonlinear', defining combination type
+        :return: X with added resultant feature
+        """
+
+
+        selected_features = X[:, feature_indices]
+
+        if combination_function is None:
+            if combination_type == 'linear':
+                combination_function = lambda x: np.sum(x, axis=1)
+            elif combination_type == 'nonlinear':
+                combination_function = lambda x: np.sin(np.sum(x, axis=1))
+        else:
+            combination_type = str(combination_function.__name__)
+
+        combination_result = combination_function(selected_features)
+
+        combination_ix = len(X[0])
+
+        self.dataset_info['combinations'].append({
+            'feature_indices': feature_indices,
+            'combination_type': combination_type,
+            'combination_ix': combination_ix
+        })
+
+        return np.column_stack((X, combination_result))
+
+    def _xor(self, arr):
+        """
+        Performs bitwise XOR across the columns of an integer array
+        :param arr: 2D array whose columns are combined
+        :return: bitwise XOR result
+        """
+        arrT = arr.T
+        arrT = arrT.astype(int)
+        out = np.bitwise_xor(arrT[0], arrT[1])
+        if len(arrT) > 2:
+            for i in range(2, len(arrT)):
+                out = np.bitwise_xor(out, arrT[i])
+
+        return out.T
+
+    def _and(self, arr):
+        """
+        Performs bitwise AND across the columns of an integer array
+        :param arr: 2D array whose columns are combined
+        :return: bitwise AND result
+        """
+        arrT = arr.T
+        arrT = arrT.astype(int)
+        out = np.bitwise_and(arrT[0], arrT[1])
+        if len(arrT) > 2:
+            for i in range(2, len(arrT)):
+                out = np.bitwise_and(out, arrT[i])
+
+        return out.T
+
+    def _or(self, arr):
+        """
+        Performs bitwise OR across the columns of an integer array
+        :param arr: 2D array whose columns are combined
+        :return: bitwise OR result
+        """
+        arrT = arr.T
+        arrT = arrT.astype(int)
+        out = np.bitwise_or(arrT[0], arrT[1])
+        if len(arrT) > 2:
+            for i in range(2, len(arrT)):
+                out = np.bitwise_or(out, arrT[i])
+
+        return out.T
+    def generate_correlated(self,
+                            X: np.ndarray,
+                            feature_indices: Union[List[int], np.ndarray],
+                            r: float = 0.8) -> np.ndarray:
+
+        """
+        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
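+
+        Illustrative sketch (hypothetical call; the achieved Pearson r is
+        approximate and tightens with sample size):
+
+            X = cc.generate_data(n_features=5, n_samples=1000)
+            X = cc.generate_correlated(X, 0, r=0.8)
+            # column 5 now correlates with column 0 at roughly r = 0.8
+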
+ :param X: dataset + :param feature_indices: indices of features to generate correlated feature to + :param r: (Pearson) correlation factor + :return: X with generated correlated features + """ + + if not isinstance(feature_indices, (list, np.ndarray)): + feature_indices = np.array([feature_indices]) + + if len(feature_indices) > 1: + correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1) + else: + correlated_ixs = len(X[0]) + + selected_features = X[:, feature_indices] + transposed = np.transpose(selected_features) + correlated_features = [] + + for t in transposed: + theta = np.arccos(r) + t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10) + + rand = np.random.normal(0, 1, len(t_standard)) + rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10) + + M = np.column_stack((t_standard, rand)) + M_centred = (M - np.mean(M, axis=0)) + + Id = np.eye(len(t)) + Q = qr(M_centred[:, [0]], mode='economic')[0] + P = np.dot(Q, Q.T) + orthogonal_projection = np.dot(Id - P, M_centred[:, 1]) + M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection)) + + Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0)))) + corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0] + + correlated_features.append(corr) + + correlated_features = np.transpose(correlated_features) + + self.dataset_info['correlations'].append({ + 'feature_indices': feature_indices, + 'correlated_indices': correlated_ixs, + 'correlation_factor': r + }) + + return np.column_stack((X, correlated_features)) + + def generate_duplicates(self, + X: np.ndarray, + feature_indices: Union[List[int], np.ndarray]) -> np.ndarray: + """ + Generates duplicate features + :param X: dataset + :param feature_indices: indices of features to duplicate + :return: dataset with duplicated features + """ + if not isinstance(feature_indices, (list, np.ndarray)): + feature_indices = np.array([feature_indices]) + + duplicated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices) - 1), 1) + + selected_features = X[:, feature_indices] + + self.dataset_info['duplicates'].append({ + 'feature_indices': feature_indices, + 'duplicate_indices': duplicated_ixs + }) + + return np.column_stack((X, selected_features)) + + def generate_labels(self, + X: np.ndarray, + n: int = 2, + p: Union[float, list[float], np.ndarray] = 0.5, + k: Union[int, float] = 2, + decision_function: Optional = None, + class_relation: str ='linear', + balance: bool = False): + """ + Generates labels for dataset X + :param X: dataset + :param n: number of class labels + :param p: class distribution + :param k: constant + :param decision_function: optional user-defined decision function + :param class_relation: string, either 'linear', 'nonlinear', or 'cluster' + :param balance: boolean, whether to balance clustering class labels + :return: array of labels, corresponding to dataset X + """ + + if isinstance(p, (list, np.ndarray)): + if sum(p) > 1: raise ValueError('sum of values in must be less than 1.0') + if len(p) > n: raise ValueError('length of p must equal n') + + if p > 1: raise ValueError('p must be less than 1.0') + + n_samples, n_features = X.shape + + if decision_function is None: + if class_relation == 'linear': + decision_function = lambda x: np.sum(2 * x + 3, axis=1) + elif class_relation == 'nonlinear': + decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1) + elif class_relation == 'cluster': + decision_function = None + else: + class_relation = str(decision_function.__name__) + + y = [] + if decision_function 
is not None:
+            if n > 2:
+                if not isinstance(p, list):
+                    p = 1 / n
+                    percentiles = [p * 100]
+                    for i in range(1, n - 1):
+                        percentiles.append(percentiles[i - 1] + (p * 100))
+
+                    decision_boundary = decision_function(X)
+                    p_points = np.percentile(decision_boundary, percentiles)
+
+                    y = np.zeros_like(decision_boundary, dtype=int)
+                    for p_point in p_points:
+                        y += (decision_boundary > p_point)
+                else:
+                    decision_boundary = decision_function(X)
+                    percentiles = [x * 100 for x in p]
+
+                    for i in range(1, len(percentiles) - 1):
+                        percentiles[i] += percentiles[i - 1]
+
+                    percentiles.insert(0, 0)
+                    percentiles.pop()
+
+                    p_points = np.percentile(decision_boundary, percentiles)
+
+                    y = np.zeros_like(decision_boundary, dtype=int)
+                    for i in range(1, n):
+                        p_point = p_points[i]
+                        for j in range(len(decision_boundary)):
+                            if decision_boundary[j] > p_point:
+                                y[j] += 1
+            else:
+                decision_boundary = decision_function(X)
+                p_point = np.percentile(decision_boundary, p * 100)
+                y = np.where(decision_boundary > p_point, 1, 0)
+        else:
+            if not isinstance(p, (list, np.ndarray)):
+                if p == 0.5:
+                    p = 1.0
+                else:
+                    p = [p, 1 - p]
+            y = self._cluster_data(X, n, p=p, balance=balance)
+
+        self.dataset_info.update({
+            'labels': {
+                'class_relation': class_relation,
+                'n_class': n
+            }
+        })
+
+        return y
+
+    def _cluster_data(self,
+                      X: np.ndarray,
+                      n: int,
+                      p: Optional[Union[float, List[float], np.ndarray]] = 1.0,
+                      balance: bool = False) -> np.ndarray:
+        """
+        Cluster data using kmeans
+        :param X: dataset
+        :param n: number of clusters
+        :param p: class distribution
+        :param balance: balance the clusters according to p
+        :return: array of labels, corresponding to dataset X
+        """
+
+        kmeans = KMeans(n_clusters=n)
+
+        kmeans.fit(X)
+
+        cluster_labels = kmeans.labels_
+
+        if not isinstance(p, (list, np.ndarray)):  # Fully balanced clusters
+            samples_per_cluster = [len(X) // n] * n
+        else:
+            samples = len(X)
+            samples_per_cluster = []
+            if len(p) == n:
+                for val in p:
+                    samples_per_cluster.append(int(samples * val))
+            else:
+                raise Exception("Length of balance parameter must equal number of clusters.")
+
+        # Adjust cluster sizes
+        if balance:
+            adjustments = []
+            overflow_samples = []
+            overflow_indices = []
+            for i in range(n):
+                cluster_size = np.sum(cluster_labels == i)
+
+                adjustment = samples_per_cluster[i] - cluster_size
+                adjustments.append(adjustment)
+
+                if adjustment < 0:  # Cluster is too large
+
+                    centroid = kmeans.cluster_centers_[i]
+                    dataset_indices = np.where(cluster_labels == i)[0]  # Indices of samples in dataset
+                    cluster_samples = np.copy(X[dataset_indices])
+
+                    distances = np.linalg.norm(cluster_samples - centroid,
+                                               axis=1)  # Distances of cluster samples to cluster centroid
+                    cluster_sample_indices = np.argsort(distances)
+                    dataset_indices_sorted = dataset_indices[
+                        cluster_sample_indices]  # Indices of samples sorted by sample distance to cluster centroid
+
+                    overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:]  # Overflow samples
+                    dataset_indices_sorted = dataset_indices_sorted[
+                        samples_per_cluster[i]:]  # Dataset indices of overflow samples
+
+                    for i in range(len(overflow_sample_indices)):
+                        overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
+                        overflow_indices.append(dataset_indices_sorted[i])
+
+            overflow_samples = np.array(overflow_samples)
+            overflow_indices = np.array(overflow_indices)
+
+            # Making adjustments
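+            # Each undersized cluster (positive adjustment) claims the
+            # overflow samples nearest to its own centroid; claimed samples
+            # are deleted from the overflow pool so each one is reassigned
+            # exactly once.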
+ for i in range(n): + + if adjustments[i] > 0: + centroid = kmeans.cluster_centers_[i] + distances = np.linalg.norm(overflow_samples - centroid, axis=1) + + closest_sample_indices = np.argsort(distances) + + overflow_indices_sorted = overflow_indices[closest_sample_indices] + + sample_indices_slice = closest_sample_indices[:adjustments[i]] + overflow_indices_slice = overflow_indices_sorted[:adjustments[i]] + + cluster_labels[overflow_indices_slice] = i + + overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0) + overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0) + + return np.array(cluster_labels) + + def generate_noise(self, + X: np.ndarray, + y: Union[List[int], np.ndarray], + p: float = 0.2, + type: str = "categorical", + missing_val: Union[str, int, float] = float('-inf')) -> np.ndarray: + + """ + Simulates noise on given dataset X + :param X: dataset to apply noise to + :param y: required target labels for categorical noise generation + :param p: amount of noise to apply. Defaults to 0.2 + :param type: type of noise to apply, either categorical or missing + :param missing_val: value to simulate missing values. Defaults to float('-inf') + :return: X with noise applied + """ + + self.dataset_info['noise'].append({ + 'type': type, + 'amount': p + }) + + if type == "categorical": + label_values, label_count = np.unique(y, return_counts=True) + n_labels = len(label_values) + + inds = y.argsort() + y_sort = y[inds] + X_sort = X[inds] + + Xs_T = X_sort.T + n = Xs_T.shape[1] + n_flip = int(n * p) + + for feature in Xs_T: + unique_per_label = {} + + for i in range(n_labels): + if i == 0: + unique = np.unique(feature[:label_count[i]]) + unique_per_label[label_values[i]] = set(unique) + else: + unique = np.unique(feature[label_count[i - 1]:label_count[i - 1] + label_count[i] - 1]) + unique_per_label[label_values[i]] = set(unique) + + ixs = np.random.choice(n, n_flip, replace=False) + + for ix in ixs: + current_label = y_sort[ix] + possible_labels = np.where(label_values != current_label)[0] + + # find all unique values from labels != current label + values = set() + for key in possible_labels: + values = values.union(unique_per_label[key]) + + # remove any overlapping values, ensuring replacement values are unique & from a target label != + # current label + for val in unique_per_label[current_label] & values: + values.remove(val) + + if len(values) > 0: + val = np.random.choice(list(values)) + + else: + key = possible_labels[np.random.randint(len(possible_labels))] + values = unique_per_label[key] + val = np.random.choice(list(values)) + + feature[ix] = val + + rev_ind = inds.argsort() + X_noise = Xs_T.T + X_noise = X_noise[rev_ind] + + return X_noise + + elif type == "missing": + X_noise = np.copy(X) + Xn_T = X_noise.T + n = Xn_T.shape[1] + n_missing = int(n * p) + #print("n to delete:", n_missing) + + for feature in Xn_T: + ixs = np.random.choice(n, n_missing, replace=False) + + for ix in ixs: + feature[ix] = missing_val + + return Xn_T.T + + def downsample_dataset(self, + X: np.array, + y: Union[List[int], np.ndarray], + N: Optional[Union[int, None]] = None, + seed: int = 42, + reshuffle: bool=False) -> Tuple[np.array, np.ndarray]: + + """ + Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset. 
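+
+        Illustrative sketch (hypothetical shapes; with imbalanced binary labels,
+        both classes are resampled down to the minority count):
+
+            X_bal, y_bal = cc.downsample_dataset(X, y, reshuffle=True)
+            # np.unique(y_bal, return_counts=True) now reports equal counts
+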
+ :param X: Dataset to downsample + :param y: Labels corresponding to X + :param N: Optional number of samples per class to downsample to + :param seed: Seed for random state of resample function + :param reshuffle: Reshuffle the dataset after downsampling + :return: Balanced X and y after downsampling + """ + + original_shape = X.shape + + values, counts = np.unique(y, return_counts=True) + if N is None: + N = min(counts) + + if N > min(counts): + raise ValueError("N must be equal to or less than the number of samples in minority class") + + X_arrays_list = [] + y_downsampled = [] + for label in values: + X_label = [X[i] for i in range(len(y)) if y[i] == label] + X_label_downsample = resample(X_label, + replace=True, + n_samples=N, + random_state=seed) + X_arrays_list.append(X_label_downsample) + ys = [label] * N + y_downsampled = np.concatenate((y_downsampled, ys), axis=0) + + X_downsampled = np.concatenate(X_arrays_list, axis=0) + + if reshuffle: + indices = np.arange(len(X_downsampled)) + np.random.shuffle(indices) + X_downsampled = X_downsampled[indices] + y_downsampled = y_downsampled[indices] + + downsampled_shape = X_downsampled.shape + + self.dataset_info.update({ + 'downsampling': { + 'original_shape': original_shape, + 'downsampled_shape': downsampled_shape + } + }) + + return X_downsampled, y_downsampled + + def print_dataset(self, X, y): + """ + Prints given dataset + :param X: dataset + :param y: labels + :return: + """ + + n_samples, n_features = X.shape + n = 0 + for arr in X: + print('[', end='') + for i in range(n_features): + if i == n_features - 1: + print(arr[i], end='') + else: + print(arr[i], end=', ') + print("], Label: {}".format(y[n])) + n += 1 + + + def summarize(self): + + print(f"Number of features: {self.dataset_info['general']['n_features']}") + print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}") + if self.dataset_info['downsampling']: + print(f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}") + print(f"Number of classes: {self.dataset_info['labels']['n_class']}") + print(f"Class relation: {self.dataset_info['labels']['class_relation']}") + + + print('-------------------------------------') + + if len(self.dataset_info['combinations']) > 0: + print("Combinations:") + for comb in self.dataset_info['combinations']: + print(f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}") + print('-------------------------------------') + + if len(self.dataset_info['correlations']) > 0: + print("Correlations:") + for corr in self.dataset_info['correlations']: + print(f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}") + print('-------------------------------------') + + if len(self.dataset_info['duplicates']) > 0: + print("Duplicates:") + for dup in self.dataset_info['duplicates']: + print(f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}") + print('-------------------------------------') + + if len(self.dataset_info['noise']) > 0: + print("Simulated noise:") + for noise in self.dataset_info['noise']: + print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}") + print('-------------------------------------') + + print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']") \ No newline at 
end of file diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py new file mode 100644 index 0000000..e5665f6 --- /dev/null +++ b/tests/cc_generator_tests.py @@ -0,0 +1,157 @@ +import pytest +import numpy as np +from scipy.stats import pearsonr +from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification + +@pytest.fixture +def cc_instance(): + return CategoricalClassification() + +def test_init(cc_instance): + assert cc_instance.dataset_info == '' + +def test_generate_data_shape_and_type(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + assert isinstance(X, np.ndarray), "Output should be a numpy array" + assert X.shape == (100, 5), "Shape should be (n_samples, n_features)" + +def test_generate_data_cardinality(cc_instance): + n_features = 5 + cardinality = 3 + X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality) + unique_values = np.unique(X) + assert len(unique_values) <= cardinality, "Cardinality not respected for all features" + +def test_generate_data_ensure_rep(cc_instance): + n_features = 5 + cardinality = 50 + X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality, ensure_rep=True) + unique_values = np.unique(X) + assert len(unique_values) == cardinality, "Not all values represented when 'ensure_rep=True'" + +def test_generate_feature_shape_and_type(cc_instance): + feature = cc_instance._generate_feature(5, size=100) + assert isinstance(feature, np.ndarray), "Output should be a numpy array" + assert feature.shape == (100,), "Shape should be (size,)" + +def test_generate_feature_cardinality(cc_instance): + feature = cc_instance._generate_feature(5, size=100) + unique_values = np.unique(feature) + assert len(unique_values) <= 5, "Feature cardinality not respected for all features" + +def test_generate_feature_ensure_rep(cc_instance): + feature = cc_instance._generate_feature(50, size=100, ensure_rep=True) + unique_values = np.unique(feature) + assert len(unique_values) == 50, "Not all values represented when using 'ensure_rep=True'" + +def test_generate_feature_values(cc_instance): + values = [5, 6, 7, 8, 9, 10] + feature = cc_instance._generate_feature(values, size=100) + unique_values = np.unique(feature) + assert any(f in feature for f in values), "Feature values not in input list" +def test_generate_feature_values_ensure_rep(cc_instance): + values = [5, 6, 7, 8, 9, 10] + feature = cc_instance._generate_feature(values, size=100, ensure_rep=True) + unique_values = np.unique(feature) + assert (values == unique_values).all(), "Feature values should match input list when 'ensure_rep=True'" + +def test_generate_feature_density(cc_instance): + values = [0, 1, 2] + p = [0.2, 0.4, 0.4] + feature = cc_instance._generate_feature(values, size=10000, ensure_rep=True, p=p) + values, counts = np.unique(feature, return_counts=True) + generated_p = np.round(counts/10000, decimals=1) + assert (generated_p == p).all(), "Feature values should have density roughly equal to 'p'" + +def test_generate_combinations_shape_and_type(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + indices = [0,1] + X = cc_instance.generate_combinations(X, indices, combination_type='linear') + assert isinstance(X, np.ndarray), "Output should be a numpy array" + assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)" + +def test_generate_correlated_shape_and_type(cc_instance): + X = cc_instance.generate_data(n_features=5, 
n_samples=100) + indices = 0 + X = cc_instance.generate_correlated(X, indices, r=0.8) + assert isinstance(X, np.ndarray), "Output should be a numpy array" + assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)" + +def test_generate_correlated_correlaton(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + indices = 0 + X = cc_instance.generate_correlated(X, indices, r=0.8) + Xt = X.T + corr, _ = pearsonr(Xt[0], Xt[5]) + assert np.round(corr, decimals=1) == 0.8, "Resultant correlation should be equal to the 'r' parameter" + + +def test_generate_duplicates_shape_and_type(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + indices = 0 + X = cc_instance.generate_duplicates(X, indices) + assert isinstance(X, np.ndarray), "Output should be a numpy array" + assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)" + +def test_generate_duplicates_duplication(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + indices = 0 + X = cc_instance.generate_duplicates(X, indices) + Xt = X.T + assert (Xt[0] == Xt[-1]).all() + +def test_xor_operation(cc_instance): + a = np.array([1, 0, 1]) + b = np.array([0, 1, 1]) + arr = [a, b] + result = cc_instance._xor(arr) + expected = np.array([1, 1, 0]) + assert np.array_equal(result, expected), "XOR operation did not produce expected result" + +def test_and_operation(cc_instance): + a = np.array([1, 0, 1]) + b = np.array([0, 1, 1]) + arr = [a, b] + result = cc_instance._and(arr) + expected = np.array([0, 0, 1]) + assert np.array_equal(result, expected), "AND operation did not produce expected result" + +def test_or_operation(cc_instance): + a = np.array([1, 0, 1]) + b = np.array([0, 1, 1]) + arr = [a, b] + result = cc_instance._or(arr) + expected = np.array([1, 1, 1]) + assert np.array_equal(result, expected), "OR operation did not produce expected result" + +def test_generate_labels_shape_and_type(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + labels = cc_instance.generate_labels(X) + assert isinstance(labels, np.ndarray), "Output should be a numpy array" + assert labels.shape == (100,), "Shape should be (n_samples,)" + +def test_generate_labels_distribution(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + labels = cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5]) + unique, counts = np.unique(labels, return_counts=True) + distribution = counts / 100 + expected_distribution = np.array([0.2, 0.3, 0.5]) + assert np.allclose(distribution, expected_distribution, atol=0.1), "Label distribution does not match expected distribution" + +def test_generate_labels_class_relation_linear(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + labels = cc_instance.generate_labels(X, class_relation='linear') + assert isinstance(labels, np.ndarray), "Output should be a numpy array" + assert labels.shape == (100,), "Shape should be (n_samples,)" + +def test_generate_labels_class_relation_nonlinear(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + labels = cc_instance.generate_labels(X, class_relation='nonlinear') + assert isinstance(labels, np.ndarray), "Output should be a numpy array" + assert labels.shape == (100,), "Shape should be (n_samples,)" + +def test_generate_labels_class_relation_cluster(cc_instance): + X = cc_instance.generate_data(n_features=5, n_samples=100) + labels = cc_instance.generate_labels(X, class_relation='cluster', balance=True) + assert 
isinstance(labels, np.ndarray), "Output should be a numpy array" + assert labels.shape == (100,), "Shape should be (n_samples,)" \ No newline at end of file From cb04d4da1f5378d656720839f0ae31c70cd598ec Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Fri, 5 Jul 2024 09:31:42 +0200 Subject: [PATCH 2/9] removed .idea --- .idea/.gitignore | 8 -------- .idea/inspectionProfiles/profiles_settings.xml | 6 ------ .idea/misc.xml | 7 ------- .idea/modules.xml | 8 -------- .idea/outrank.iml | 14 -------------- .idea/vcs.xml | 6 ------ 6 files changed, 49 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/outrank.iml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 877d184..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 3c2f566..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/outrank.iml b/.idea/outrank.iml deleted file mode 100644 index 8e5446a..0000000 --- a/.idea/outrank.iml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1dd..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 30549a4c6635fca147ab122a2413451a994d6b5c Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:37:30 +0200 Subject: [PATCH 3/9] pre-commit, code review changes pre-commit, code review changes: - added _feature_builder method to avoid duplicate code blocks - added some new parameters to enable random value domains for features --- .../synthetic_data_generators/cc_generator.py | 391 ++++++++++++------ tests/cc_generator_tests.py | 67 +-- 2 files changed, 301 insertions(+), 157 deletions(-) diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py index 2f00f7e..dd148a5 100644 --- a/outrank/algorithms/synthetic_data_generators/cc_generator.py +++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py @@ -1,9 +1,17 @@ +from __future__ import annotations + +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + import numpy as np from scipy.linalg import qr from scipy.stats import norm from sklearn.cluster import KMeans from sklearn.utils import resample -from typing import List, Union, Optional, Tuple + + class CategoricalClassification: def __init__(self): @@ -13,19 +21,24 @@ def __init__(self): 'correlations': [], 'duplicates': 
[], 'labels': [], - 'noise': [] + 'noise': [], } def __repr__(self): return f"CategoricalClassification(dataset_info={self.dataset_info})" - def generate_data(self, - n_features: int, - n_samples: int, - cardinality: int = 5, - structure: Optional = None, - ensure_rep: bool = False, - seed: int = 42) -> np.ndarray: + def generate_data( + self, + n_features: int, + n_samples: int, + cardinality: int = 5, + structure: list | np.ndarray | None = None, + ensure_rep: bool = False, + random_values: bool | None = False, + low: int | None = 0, + high: int | None = 1000, + seed: int = 42, + ) -> np.ndarray: """ Generates dataset based on parameters @@ -34,6 +47,9 @@ def generate_data(self, :param cardinality: default cardinality of the dataset :param structure: structure of the dataset :param ensure_rep: flag, ensures all given values represented + :param random_values: flag, enables random (integer) feature values from set [low, high] + :param low: sets lower bound of random feature values + :param high: sets high bound of random feature values :param seed: sets seed of numpy random :return: X, 2D dataset """ @@ -45,113 +61,214 @@ def generate_data(self, 'cardinality': cardinality, 'structure': structure, 'ensure_rep': ensure_rep, - 'seed': seed - } + 'seed': seed, + }, }) np.random.seed(seed) X = np.empty([n_features, n_samples]) - if structure == None: - + if structure is None: + # No specific structure parameter passed for i in range(n_features): - x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + x = self._generate_feature( + n_samples, + cardinality=cardinality, + ensure_rep=ensure_rep, + random_values=random_values, + low=low, + high=high, + ) X[i] = x - else: - + # Structure parameter passed, building based on structure ix = 0 for data in structure: - if not isinstance(data[0], (list, np.ndarray)): - feature_ix = data[0] - feature_cardinality = data[1] + # Data in structure is a tuple of (feature index (integer), feature attributes) + feature_ix, feature_attributes = data if ix < feature_ix: + # Filling out the dataset up to column index feature_ix for i in range(ix, feature_ix): - x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + x = self._generate_feature( + n_samples, + cardinality=cardinality, + ensure_rep=ensure_rep, + random_values=random_values, + low=low, + high=high, + ) X[ix] = x ix += 1 - if not isinstance(feature_cardinality, (list, np.ndarray)): - x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep) - else: - if isinstance(feature_cardinality[0], (list, np.ndarray)): - value_domain = feature_cardinality[0] - value_frequencies = feature_cardinality[1] - x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies) - else: - value_domain = feature_cardinality - x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep) + x = self._feature_builder( + feature_attributes, + n_samples, + ensure_rep=ensure_rep, + random_values=random_values, + low=low, + high=high, + ) X[ix] = x ix += 1 else: + # Data in structure is a tuple of (list of feature indexes, feature attributes) feature_ixs = data[0] - feature_cardinality = data[1] + feature_attributes = data[1] + for feature_ix in feature_ixs: + # Filling out the dataset up to feature_ix if ix < feature_ix: for i in range(ix, feature_ix): - x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + x = self._generate_feature( + n_samples, + cardinality=cardinality, + ensure_rep=ensure_rep, + 
random_values=random_values, + low=low, + high=high, + ) X[ix] = x ix += 1 - if not isinstance(feature_cardinality, (list, np.ndarray)): - x = self._generate_feature(feature_cardinality, n_samples, ensure_rep=ensure_rep) - else: - value_domain = feature_cardinality[0] - value_frequencies = feature_cardinality[1] - x = self._generate_feature(value_domain, n_samples, ensure_rep=ensure_rep, p=value_frequencies) + x = self._feature_builder( + feature_attributes, + n_samples, + ensure_rep=ensure_rep, + random_values=random_values, + low=low, + high=high, + ) + X[ix] = x ix += 1 if ix < n_features: + # Fill out the rest of the dataset for i in range(ix, n_features): - x = self._generate_feature(cardinality, n_samples, ensure_rep=ensure_rep) + x = self._generate_feature( + n_samples, + cardinality=cardinality, + ensure_rep=ensure_rep, + random_values=random_values, + low=low, + high=high, + ) X[i] = x return X.T - def _generate_feature(self, - v: Union[int, List[int], np.ndarray], - size: int, - ensure_rep: bool = False, - p: Optional[Union[List[float], np.ndarray]] = None) -> np.ndarray: + def _feature_builder( + self, + feature_attributes: int | list | np.ndarray, + n_samples: int, + ensure_rep: bool = False, + random_values: bool | None = False, + low: int | None = 0, + high: int | None = 1000, + ) -> np.ndarray: + """ - Generates feature vector of length size. Default probability density distribution is approx. normal, centred around randomly picked value. - :param v: either int for cardinality, or list of values + Helper function to avoid duplicate code, builds feature + :param feature_attributes: either integer (cardinality) or list of feature attributes + :param n_samples: number of samples in dataset + :param ensure_rep: ensures all values are represented at least once in the feature vector + :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1 + :param low: lower bound of random feature vector values + :param high: upper bound of random feature vector values + :return: feature vector + """ + + if not isinstance(feature_attributes, (list, np.ndarray)): + # feature_cardinality is just an integer, generate feature either with random values or + # [low, low+cardinality] + x = self._generate_feature( + n_samples, + cardinality=feature_attributes, + ensure_rep=ensure_rep, + random_values=random_values, + low=low, + high=high, + ) + else: + # feature_cardinality is a list of [value_domain, value_frequencies] + if isinstance(feature_attributes[0], (list, np.ndarray)): + value_domain, value_frequencies = feature_attributes + x = self._generate_feature( + n_samples, + vec=value_domain, + ensure_rep=ensure_rep, + p=value_frequencies, + ) + else: + # feature_cardinality is value_domain (list of values for feature) + value_domain = feature_attributes + x = self._generate_feature( + n_samples, + vec=value_domain, + ensure_rep=ensure_rep, + ) + + return x + + def _generate_feature( + self, + size: int, + vec: list[int] | np.ndarray | None = None, + cardinality: int = 5, + ensure_rep: bool = False, + random_values: bool | None = False, + low: int | None = 0, + high: int | None = 1000, + p: list[float] | np.ndarray | None = None, + ) -> np.ndarray: + """ + Generates feature vector of length size. Default probability density distribution is approx. normal, centred around a randomly picked value. 
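+
+        Illustrative sketch of the random-value domain (hypothetical call; the
+        five domain values are drawn without replacement from [low, high]):
+
+            x = cc._generate_feature(100, cardinality=5, random_values=True, low=0, high=1000)
+            # len(np.unique(x)) <= 5; all values lie within [0, 1000]
+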
+ :param vec: list of feature values + :param cardinality: single value cardinality :param size: length of feature vector :param ensure_rep: ensures all values are represented at least once in the feature vector + :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1 + :param low: lower bound of random feature vector values + :param high: upper bound of random feature vector values :param p: list of probabilities of each value - :return: + :return: feature vector x """ - if not isinstance(v, (list, np.ndarray)): - v = np.arange(0, v, 1) + + if vec is None: + if random_values: + vec = np.random.choice(range(low, high + 1), cardinality, replace=False) + else: + vec = np.arange(low, low + cardinality, 1) else: - v = np.array(v) + vec = np.array(vec) if p is None: - v_shift = v - v[np.random.randint(len(v))] + v_shift = vec - vec[np.random.randint(len(vec))] p = norm.pdf(v_shift, scale=3) else: p = np.array(p) p = p / p.sum() - if ensure_rep and len(v) < size: - sampled_values = np.random.choice(v, size=(size - len(v)), p=p) - sampled_values = np.append(sampled_values, v) + if ensure_rep and len(vec) < size: + sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p) + sampled_values = np.append(sampled_values, vec) else: - sampled_values = np.random.choice(v, size=size, p=p) + sampled_values = np.random.choice(vec, size=size, p=p) np.random.shuffle(sampled_values) return sampled_values - def generate_combinations(self, - X: np.ndarray, - feature_indices: Union[List[int], np.ndarray], - combination_function: Optional = None, - combination_type: str ='linear') -> np.ndarray: + def generate_combinations( + self, + X: np.ndarray, + feature_indices: list[int] | np.ndarray, + combination_function: Optional = None, + combination_type: str = 'linear', + ) -> np.ndarray: """ Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X :param X: dataset @@ -161,7 +278,6 @@ def generate_combinations(self, :return: X with added resultant feature """ - selected_features = X[:, feature_indices] if combination_function is None: @@ -179,7 +295,7 @@ def generate_combinations(self, self.dataset_info['combinations'].append({ 'feature_indices': feature_indices, 'combination_type': combination_type, - 'combination_ix': combination_ix + 'combination_ix': combination_ix, }) return np.column_stack((X, combination_result)) @@ -231,10 +347,13 @@ def _or(self, arr): out = np.bitwise_or(out, arrT[i]) return out.T - def generate_correlated(self, - X: np.ndarray, - feature_indices: Union[List[int], np.ndarray], - r: float = 0.8) -> np.ndarray: + + def generate_correlated( + self, + X: np.ndarray, + feature_indices: list[int] | np.ndarray, + r: float = 0.8, + ) -> np.ndarray: """ Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0. 
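The construction used here can be sanity-checked in isolation. A minimal standalone sketch of the same idea (orthogonalize Gaussian noise against the standardized source, then mix the two at angle arccos(r); all names are illustrative, not part of the module):

    import numpy as np

    def correlated_vector(t, r, rng=np.random.default_rng(0)):
        u = (t - t.mean()) / (t.std() + 1e-10)   # standardized source, mean 0
        g = rng.standard_normal(len(t))
        g = g - g.mean()
        g = g - u * (u @ g) / (u @ u)            # remove the component along u
        v = g / (np.linalg.norm(g) + 1e-10) * np.linalg.norm(u)
        return r * u + np.sqrt(1 - r ** 2) * v   # cos/sin mix at angle arccos(r)

    t = np.random.default_rng(1).integers(0, 5, 1000).astype(float)
    print(np.corrcoef(t, correlated_vector(t, 0.8))[0, 1])   # ~0.8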
@@ -282,14 +401,16 @@ def generate_correlated(self, self.dataset_info['correlations'].append({ 'feature_indices': feature_indices, 'correlated_indices': correlated_ixs, - 'correlation_factor': r + 'correlation_factor': r, }) return np.column_stack((X, correlated_features)) - def generate_duplicates(self, - X: np.ndarray, - feature_indices: Union[List[int], np.ndarray]) -> np.ndarray: + def generate_duplicates( + self, + X: np.ndarray, + feature_indices: list[int] | np.ndarray, + ) -> np.ndarray: """ Generates duplicate features :param X: dataset @@ -305,19 +426,21 @@ def generate_duplicates(self, self.dataset_info['duplicates'].append({ 'feature_indices': feature_indices, - 'duplicate_indices': duplicated_ixs + 'duplicate_indices': duplicated_ixs, }) return np.column_stack((X, selected_features)) - def generate_labels(self, - X: np.ndarray, - n: int = 2, - p: Union[float, list[float], np.ndarray] = 0.5, - k: Union[int, float] = 2, - decision_function: Optional = None, - class_relation: str ='linear', - balance: bool = False): + def generate_labels( + self, + X: np.ndarray, + n: int = 2, + p: float | list[float] | np.ndarray = 0.5, + k: int | float = 2, + decision_function: Optional = None, + class_relation: str = 'linear', + balance: bool = False, + ): """ Generates labels for dataset X :param X: dataset @@ -397,17 +520,19 @@ def generate_labels(self, self.dataset_info.update({ 'labels': { 'class_relation': class_relation, - 'n_class': n - } + 'n_class': n, + }, }) return y - def _cluster_data(self, - X: np.ndarray, - n: int, - p: Optional[Union[float, List[float], np.ndarray]] = 1.0, - balance: bool = False) -> np.ndarray: + def _cluster_data( + self, + X: np.ndarray, + n: int, + p: float | list[float] | np.ndarray | None = 1.0, + balance: bool = False, + ) -> np.ndarray: """ Cluster data using kmeans :param X: dataset @@ -436,7 +561,7 @@ def _cluster_data(self, for val in p: samples_per_cluster.append(int(samples * val)) else: - raise Exception("Length of balance parameter must equal number of clusters.") + raise Exception('Length of balance parameter must equal number of clusters.') # Adjust cluster sizes if balance: @@ -455,15 +580,19 @@ def _cluster_data(self, dataset_indices = np.where(cluster_labels == i)[0] # Indices of samples in dataset cluster_samples = np.copy(X[dataset_indices]) - distances = np.linalg.norm(cluster_samples - centroid, - axis=1) # Distances of cluster samples to cluster centroid + distances = np.linalg.norm( + cluster_samples - centroid, + axis=1, + ) # Distances of cluster samples to cluster centroid cluster_sample_indices = np.argsort(distances) dataset_indices_sorted = dataset_indices[ - cluster_sample_indices] # Indices of samples sorted by sample distance to cluster centroid + cluster_sample_indices + ] # Indices of samples sorted by sample distance to cluster centroid overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:] # Overflow samples dataset_indices_sorted = dataset_indices_sorted[ - samples_per_cluster[i]:] # Dataset indices of overflow samples + samples_per_cluster[i]: + ] # Dataset indices of overflow samples for i in range(len(overflow_sample_indices)): overflow_samples.append(cluster_samples[overflow_sample_indices[i]]) @@ -493,12 +622,14 @@ def _cluster_data(self, return np.array(cluster_labels) - def generate_noise(self, - X: np.ndarray, - y: Union[List[int], np.ndarray], - p: float = 0.2, - type: str = "categorical", - missing_val: Union[str, int, float] = float('-inf')) -> np.ndarray: + def generate_noise( + self, + X: 
np.ndarray, + y: list[int] | np.ndarray, + p: float = 0.2, + type: str = 'categorical', + missing_val: str | int | float = float('-inf'), + ) -> np.ndarray: """ Simulates noise on given dataset X @@ -512,10 +643,10 @@ def generate_noise(self, self.dataset_info['noise'].append({ 'type': type, - 'amount': p + 'amount': p, }) - if type == "categorical": + if type == 'categorical': label_values, label_count = np.unique(y, return_counts=True) n_labels = len(label_values) @@ -570,7 +701,7 @@ def generate_noise(self, return X_noise - elif type == "missing": + elif type == 'missing': X_noise = np.copy(X) Xn_T = X_noise.T n = Xn_T.shape[1] @@ -585,12 +716,14 @@ def generate_noise(self, return Xn_T.T - def downsample_dataset(self, - X: np.array, - y: Union[List[int], np.ndarray], - N: Optional[Union[int, None]] = None, - seed: int = 42, - reshuffle: bool=False) -> Tuple[np.array, np.ndarray]: + def downsample_dataset( + self, + X: np.array, + y: list[int] | np.ndarray, + N: int | None | None = None, + seed: int = 42, + reshuffle: bool = False, + ) -> tuple[np.array, np.ndarray]: """ Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset. @@ -609,16 +742,18 @@ def downsample_dataset(self, N = min(counts) if N > min(counts): - raise ValueError("N must be equal to or less than the number of samples in minority class") + raise ValueError('N must be equal to or less than the number of samples in minority class') X_arrays_list = [] y_downsampled = [] for label in values: X_label = [X[i] for i in range(len(y)) if y[i] == label] - X_label_downsample = resample(X_label, - replace=True, - n_samples=N, - random_state=seed) + X_label_downsample = resample( + X_label, + replace=True, + n_samples=N, + random_state=seed, + ) X_arrays_list.append(X_label_downsample) ys = [label] * N y_downsampled = np.concatenate((y_downsampled, ys), axis=0) @@ -636,8 +771,8 @@ def downsample_dataset(self, self.dataset_info.update({ 'downsampling': { 'original_shape': original_shape, - 'downsampled_shape': downsampled_shape - } + 'downsampled_shape': downsampled_shape, + }, }) return X_downsampled, y_downsampled @@ -659,44 +794,50 @@ def print_dataset(self, X, y): print(arr[i], end='') else: print(arr[i], end=', ') - print("], Label: {}".format(y[n])) + print(f'], Label: {y[n]}') n += 1 - def summarize(self): print(f"Number of features: {self.dataset_info['general']['n_features']}") print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}") if self.dataset_info['downsampling']: - print(f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}") + print( + f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}", + ) print(f"Number of classes: {self.dataset_info['labels']['n_class']}") print(f"Class relation: {self.dataset_info['labels']['class_relation']}") - print('-------------------------------------') if len(self.dataset_info['combinations']) > 0: - print("Combinations:") + print('Combinations:') for comb in self.dataset_info['combinations']: - print(f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}") + print( + f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}", + ) print('-------------------------------------') if 
len(self.dataset_info['correlations']) > 0: - print("Correlations:") + print('Correlations:') for corr in self.dataset_info['correlations']: - print(f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}") + print( + f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}", + ) print('-------------------------------------') if len(self.dataset_info['duplicates']) > 0: - print("Duplicates:") + print('Duplicates:') for dup in self.dataset_info['duplicates']: - print(f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}") + print( + f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}", + ) print('-------------------------------------') if len(self.dataset_info['noise']) > 0: - print("Simulated noise:") + print('Simulated noise:') for noise in self.dataset_info['noise']: print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}") print('-------------------------------------') - print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']") \ No newline at end of file + print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']") diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py index e5665f6..46ca8c4 100644 --- a/tests/cc_generator_tests.py +++ b/tests/cc_generator_tests.py @@ -1,6 +1,9 @@ -import pytest +from __future__ import annotations + import numpy as np +import pytest from scipy.stats import pearsonr + from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification @pytest.fixture @@ -12,15 +15,15 @@ def test_init(cc_instance): def test_generate_data_shape_and_type(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) - assert isinstance(X, np.ndarray), "Output should be a numpy array" - assert X.shape == (100, 5), "Shape should be (n_samples, n_features)" + assert isinstance(X, np.ndarray), 'Output should be a numpy array' + assert X.shape == (100, 5), 'Shape should be (n_samples, n_features)' def test_generate_data_cardinality(cc_instance): n_features = 5 cardinality = 3 X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality) unique_values = np.unique(X) - assert len(unique_values) <= cardinality, "Cardinality not respected for all features" + assert len(unique_values) <= cardinality, 'Cardinality not respected for all features' def test_generate_data_ensure_rep(cc_instance): n_features = 5 @@ -30,35 +33,35 @@ def test_generate_data_ensure_rep(cc_instance): assert len(unique_values) == cardinality, "Not all values represented when 'ensure_rep=True'" def test_generate_feature_shape_and_type(cc_instance): - feature = cc_instance._generate_feature(5, size=100) - assert isinstance(feature, np.ndarray), "Output should be a numpy array" - assert feature.shape == (100,), "Shape should be (size,)" + feature = cc_instance._generate_feature(100, cardinality=5) + assert isinstance(feature, np.ndarray), 'Output should be a numpy array' + assert feature.shape == (100,), 'Shape should be (size,)' def test_generate_feature_cardinality(cc_instance): - feature = cc_instance._generate_feature(5, size=100) + feature = cc_instance._generate_feature(100, cardinality=5) unique_values = np.unique(feature) - assert len(unique_values) <= 5, "Feature cardinality not 
respected for all features" + assert len(unique_values) <= 5, 'Feature cardinality not respected for all features' def test_generate_feature_ensure_rep(cc_instance): - feature = cc_instance._generate_feature(50, size=100, ensure_rep=True) + feature = cc_instance._generate_feature(100, cardinality=50, ensure_rep=True) unique_values = np.unique(feature) assert len(unique_values) == 50, "Not all values represented when using 'ensure_rep=True'" def test_generate_feature_values(cc_instance): values = [5, 6, 7, 8, 9, 10] - feature = cc_instance._generate_feature(values, size=100) + feature = cc_instance._generate_feature(100, vec=values) unique_values = np.unique(feature) - assert any(f in feature for f in values), "Feature values not in input list" + assert any(f in feature for f in values), 'Feature values not in input list' def test_generate_feature_values_ensure_rep(cc_instance): values = [5, 6, 7, 8, 9, 10] - feature = cc_instance._generate_feature(values, size=100, ensure_rep=True) + feature = cc_instance._generate_feature(100, vec=values, ensure_rep=True) unique_values = np.unique(feature) assert (values == unique_values).all(), "Feature values should match input list when 'ensure_rep=True'" def test_generate_feature_density(cc_instance): values = [0, 1, 2] p = [0.2, 0.4, 0.4] - feature = cc_instance._generate_feature(values, size=10000, ensure_rep=True, p=p) + feature = cc_instance._generate_feature(10000, vec=values, ensure_rep=True, p=p) values, counts = np.unique(feature, return_counts=True) generated_p = np.round(counts/10000, decimals=1) assert (generated_p == p).all(), "Feature values should have density roughly equal to 'p'" @@ -67,15 +70,15 @@ def test_generate_combinations_shape_and_type(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) indices = [0,1] X = cc_instance.generate_combinations(X, indices, combination_type='linear') - assert isinstance(X, np.ndarray), "Output should be a numpy array" - assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)" + assert isinstance(X, np.ndarray), 'Output should be a numpy array' + assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)' def test_generate_correlated_shape_and_type(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) indices = 0 X = cc_instance.generate_correlated(X, indices, r=0.8) - assert isinstance(X, np.ndarray), "Output should be a numpy array" - assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)" + assert isinstance(X, np.ndarray), 'Output should be a numpy array' + assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)' def test_generate_correlated_correlaton(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) @@ -90,8 +93,8 @@ def test_generate_duplicates_shape_and_type(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) indices = 0 X = cc_instance.generate_duplicates(X, indices) - assert isinstance(X, np.ndarray), "Output should be a numpy array" - assert X.shape == (100, 6), "Shape should be (n_samples, n_features + 1)" + assert isinstance(X, np.ndarray), 'Output should be a numpy array' + assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)' def test_generate_duplicates_duplication(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) @@ -106,7 +109,7 @@ def test_xor_operation(cc_instance): arr = [a, b] result = cc_instance._xor(arr) expected = np.array([1, 1, 0]) - assert np.array_equal(result, expected), "XOR 
operation did not produce expected result" + assert np.array_equal(result, expected), 'XOR operation did not produce expected result' def test_and_operation(cc_instance): a = np.array([1, 0, 1]) @@ -114,7 +117,7 @@ def test_and_operation(cc_instance): arr = [a, b] result = cc_instance._and(arr) expected = np.array([0, 0, 1]) - assert np.array_equal(result, expected), "AND operation did not produce expected result" + assert np.array_equal(result, expected), 'AND operation did not produce expected result' def test_or_operation(cc_instance): a = np.array([1, 0, 1]) @@ -122,13 +125,13 @@ def test_or_operation(cc_instance): arr = [a, b] result = cc_instance._or(arr) expected = np.array([1, 1, 1]) - assert np.array_equal(result, expected), "OR operation did not produce expected result" + assert np.array_equal(result, expected), 'OR operation did not produce expected result' def test_generate_labels_shape_and_type(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) labels = cc_instance.generate_labels(X) - assert isinstance(labels, np.ndarray), "Output should be a numpy array" - assert labels.shape == (100,), "Shape should be (n_samples,)" + assert isinstance(labels, np.ndarray), 'Output should be a numpy array' + assert labels.shape == (100,), 'Shape should be (n_samples,)' def test_generate_labels_distribution(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) @@ -136,22 +139,22 @@ def test_generate_labels_distribution(cc_instance): unique, counts = np.unique(labels, return_counts=True) distribution = counts / 100 expected_distribution = np.array([0.2, 0.3, 0.5]) - assert np.allclose(distribution, expected_distribution, atol=0.1), "Label distribution does not match expected distribution" + assert np.allclose(distribution, expected_distribution, atol=0.1), 'Label distribution does not match expected distribution' def test_generate_labels_class_relation_linear(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) labels = cc_instance.generate_labels(X, class_relation='linear') - assert isinstance(labels, np.ndarray), "Output should be a numpy array" - assert labels.shape == (100,), "Shape should be (n_samples,)" + assert isinstance(labels, np.ndarray), 'Output should be a numpy array' + assert labels.shape == (100,), 'Shape should be (n_samples,)' def test_generate_labels_class_relation_nonlinear(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) labels = cc_instance.generate_labels(X, class_relation='nonlinear') - assert isinstance(labels, np.ndarray), "Output should be a numpy array" - assert labels.shape == (100,), "Shape should be (n_samples,)" + assert isinstance(labels, np.ndarray), 'Output should be a numpy array' + assert labels.shape == (100,), 'Shape should be (n_samples,)' def test_generate_labels_class_relation_cluster(cc_instance): X = cc_instance.generate_data(n_features=5, n_samples=100) labels = cc_instance.generate_labels(X, class_relation='cluster', balance=True) - assert isinstance(labels, np.ndarray), "Output should be a numpy array" - assert labels.shape == (100,), "Shape should be (n_samples,)" \ No newline at end of file + assert isinstance(labels, np.ndarray), 'Output should be a numpy array' + assert labels.shape == (100,), 'Shape should be (n_samples,)' From d0d50976eb6116e1efb6f0a8519cf5da52b11925 Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Tue, 9 Jul 2024 10:22:59 +0200 Subject: [PATCH 4/9] Rewrote tests with unittest instead of pytest --- 
tests/cc_generator_tests.py | 311 ++++++++++++++++++------------------ 1 file changed, 158 insertions(+), 153 deletions(-) diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py index 46ca8c4..16cb7b2 100644 --- a/tests/cc_generator_tests.py +++ b/tests/cc_generator_tests.py @@ -1,160 +1,165 @@ from __future__ import annotations +import unittest + import numpy as np -import pytest from scipy.stats import pearsonr from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification -@pytest.fixture -def cc_instance(): - return CategoricalClassification() - -def test_init(cc_instance): - assert cc_instance.dataset_info == '' - -def test_generate_data_shape_and_type(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - assert isinstance(X, np.ndarray), 'Output should be a numpy array' - assert X.shape == (100, 5), 'Shape should be (n_samples, n_features)' - -def test_generate_data_cardinality(cc_instance): - n_features = 5 - cardinality = 3 - X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality) - unique_values = np.unique(X) - assert len(unique_values) <= cardinality, 'Cardinality not respected for all features' - -def test_generate_data_ensure_rep(cc_instance): - n_features = 5 - cardinality = 50 - X = cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality, ensure_rep=True) - unique_values = np.unique(X) - assert len(unique_values) == cardinality, "Not all values represented when 'ensure_rep=True'" - -def test_generate_feature_shape_and_type(cc_instance): - feature = cc_instance._generate_feature(100, cardinality=5) - assert isinstance(feature, np.ndarray), 'Output should be a numpy array' - assert feature.shape == (100,), 'Shape should be (size,)' - -def test_generate_feature_cardinality(cc_instance): - feature = cc_instance._generate_feature(100, cardinality=5) - unique_values = np.unique(feature) - assert len(unique_values) <= 5, 'Feature cardinality not respected for all features' - -def test_generate_feature_ensure_rep(cc_instance): - feature = cc_instance._generate_feature(100, cardinality=50, ensure_rep=True) - unique_values = np.unique(feature) - assert len(unique_values) == 50, "Not all values represented when using 'ensure_rep=True'" - -def test_generate_feature_values(cc_instance): - values = [5, 6, 7, 8, 9, 10] - feature = cc_instance._generate_feature(100, vec=values) - unique_values = np.unique(feature) - assert any(f in feature for f in values), 'Feature values not in input list' -def test_generate_feature_values_ensure_rep(cc_instance): - values = [5, 6, 7, 8, 9, 10] - feature = cc_instance._generate_feature(100, vec=values, ensure_rep=True) - unique_values = np.unique(feature) - assert (values == unique_values).all(), "Feature values should match input list when 'ensure_rep=True'" - -def test_generate_feature_density(cc_instance): - values = [0, 1, 2] - p = [0.2, 0.4, 0.4] - feature = cc_instance._generate_feature(10000, vec=values, ensure_rep=True, p=p) - values, counts = np.unique(feature, return_counts=True) - generated_p = np.round(counts/10000, decimals=1) - assert (generated_p == p).all(), "Feature values should have density roughly equal to 'p'" - -def test_generate_combinations_shape_and_type(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - indices = [0,1] - X = cc_instance.generate_combinations(X, indices, combination_type='linear') - assert isinstance(X, np.ndarray), 'Output should be a numpy array' - 
assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)' - -def test_generate_correlated_shape_and_type(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - indices = 0 - X = cc_instance.generate_correlated(X, indices, r=0.8) - assert isinstance(X, np.ndarray), 'Output should be a numpy array' - assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)' - -def test_generate_correlated_correlaton(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - indices = 0 - X = cc_instance.generate_correlated(X, indices, r=0.8) - Xt = X.T - corr, _ = pearsonr(Xt[0], Xt[5]) - assert np.round(corr, decimals=1) == 0.8, "Resultant correlation should be equal to the 'r' parameter" - - -def test_generate_duplicates_shape_and_type(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - indices = 0 - X = cc_instance.generate_duplicates(X, indices) - assert isinstance(X, np.ndarray), 'Output should be a numpy array' - assert X.shape == (100, 6), 'Shape should be (n_samples, n_features + 1)' - -def test_generate_duplicates_duplication(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - indices = 0 - X = cc_instance.generate_duplicates(X, indices) - Xt = X.T - assert (Xt[0] == Xt[-1]).all() - -def test_xor_operation(cc_instance): - a = np.array([1, 0, 1]) - b = np.array([0, 1, 1]) - arr = [a, b] - result = cc_instance._xor(arr) - expected = np.array([1, 1, 0]) - assert np.array_equal(result, expected), 'XOR operation did not produce expected result' - -def test_and_operation(cc_instance): - a = np.array([1, 0, 1]) - b = np.array([0, 1, 1]) - arr = [a, b] - result = cc_instance._and(arr) - expected = np.array([0, 0, 1]) - assert np.array_equal(result, expected), 'AND operation did not produce expected result' - -def test_or_operation(cc_instance): - a = np.array([1, 0, 1]) - b = np.array([0, 1, 1]) - arr = [a, b] - result = cc_instance._or(arr) - expected = np.array([1, 1, 1]) - assert np.array_equal(result, expected), 'OR operation did not produce expected result' - -def test_generate_labels_shape_and_type(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - labels = cc_instance.generate_labels(X) - assert isinstance(labels, np.ndarray), 'Output should be a numpy array' - assert labels.shape == (100,), 'Shape should be (n_samples,)' - -def test_generate_labels_distribution(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - labels = cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5]) - unique, counts = np.unique(labels, return_counts=True) - distribution = counts / 100 - expected_distribution = np.array([0.2, 0.3, 0.5]) - assert np.allclose(distribution, expected_distribution, atol=0.1), 'Label distribution does not match expected distribution' - -def test_generate_labels_class_relation_linear(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - labels = cc_instance.generate_labels(X, class_relation='linear') - assert isinstance(labels, np.ndarray), 'Output should be a numpy array' - assert labels.shape == (100,), 'Shape should be (n_samples,)' - -def test_generate_labels_class_relation_nonlinear(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - labels = cc_instance.generate_labels(X, class_relation='nonlinear') - assert isinstance(labels, np.ndarray), 'Output should be a numpy array' - assert labels.shape == (100,), 'Shape should be (n_samples,)' - -def 
test_generate_labels_class_relation_cluster(cc_instance): - X = cc_instance.generate_data(n_features=5, n_samples=100) - labels = cc_instance.generate_labels(X, class_relation='cluster', balance=True) - assert isinstance(labels, np.ndarray), 'Output should be a numpy array' - assert labels.shape == (100,), 'Shape should be (n_samples,)' +class TestCategoricalClassification(unittest.TestCase): + + def setUp(self): + self.cc_instance = CategoricalClassification() + + def test_init(self): + self.assertEqual(self.cc_instance.dataset_info, '') + + def test_generate_data_shape_and_type(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array') + self.assertEqual(X.shape, (100, 5), 'Shape should be (n_samples, n_features)') + + def test_generate_data_cardinality(self): + n_features = 5 + cardinality = 3 + X = self.cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality) + unique_values = np.unique(X) + self.assertLessEqual(len(unique_values), cardinality, 'Cardinality not respected for all features') + + def test_generate_data_ensure_rep(self): + n_features = 5 + cardinality = 50 + X = self.cc_instance.generate_data(n_features=n_features, n_samples=100, cardinality=cardinality, ensure_rep=True) + unique_values = np.unique(X) + self.assertEqual(len(unique_values), cardinality, "Not all values represented when 'ensure_rep=True'") + + def test_generate_feature_shape_and_type(self): + feature = self.cc_instance._generate_feature(100, cardinality=5) + self.assertIsInstance(feature, np.ndarray, 'Output should be a numpy array') + self.assertEqual(feature.shape, (100,), 'Shape should be (size,)') + + def test_generate_feature_cardinality(self): + feature = self.cc_instance._generate_feature(100, cardinality=5) + unique_values = np.unique(feature) + self.assertLessEqual(len(unique_values), 5, 'Feature cardinality not respected for all features') + + def test_generate_feature_ensure_rep(self): + feature = self.cc_instance._generate_feature(100, cardinality=50, ensure_rep=True) + unique_values = np.unique(feature) + self.assertEqual(len(unique_values), 50, "Not all values represented when using 'ensure_rep=True'") + + def test_generate_feature_values(self): + values = [5, 6, 7, 8, 9, 10] + feature = self.cc_instance._generate_feature(100, vec=values) + unique_values = np.unique(feature) + self.assertTrue(any(f in feature for f in values), 'Feature values not in input list') + + def test_generate_feature_values_ensure_rep(self): + values = [5, 6, 7, 8, 9, 10] + feature = self.cc_instance._generate_feature(100, vec=values, ensure_rep=True) + unique_values = np.unique(feature) + self.assertTrue(np.array_equal(values, unique_values), "Feature values should match input list when 'ensure_rep=True'") + + def test_generate_feature_density(self): + values = [0, 1, 2] + p = [0.2, 0.4, 0.4] + feature = self.cc_instance._generate_feature(10000, vec=values, ensure_rep=True, p=p) + values, counts = np.unique(feature, return_counts=True) + generated_p = np.round(counts/10000, decimals=1) + self.assertTrue(np.array_equal(generated_p, p), "Feature values should have density roughly equal to 'p'") + + def test_generate_combinations_shape_and_type(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + indices = [0,1] + X = self.cc_instance.generate_combinations(X, indices, combination_type='linear') + self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array') + 
self.assertEqual(X.shape, (100, 6), 'Shape should be (n_samples, n_features + 1)') + + def test_generate_correlated_shape_and_type(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + indices = 0 + X = self.cc_instance.generate_correlated(X, indices, r=0.8) + self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array') + self.assertEqual(X.shape, (100, 6), 'Shape should be (n_samples, n_features + 1)') + + def test_generate_correlated_correlaton(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + indices = 0 + X = self.cc_instance.generate_correlated(X, indices, r=0.8) + Xt = X.T + corr, _ = pearsonr(Xt[0], Xt[5]) + self.assertAlmostEqual(np.round(corr, decimals=1), 0.8, "Resultant correlation should be equal to the 'r' parameter") + + def test_generate_duplicates_shape_and_type(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + indices = 0 + X = self.cc_instance.generate_duplicates(X, indices) + self.assertIsInstance(X, np.ndarray, 'Output should be a numpy array') + self.assertEqual(X.shape, (100, 6), 'Shape should be (n_samples, n_features + 1)') + + def test_generate_duplicates_duplication(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + indices = 0 + X = self.cc_instance.generate_duplicates(X, indices) + Xt = X.T + self.assertTrue((Xt[0] == Xt[-1]).all()) + + def test_xor_operation(self): + a = np.array([1, 0, 1]) + b = np.array([0, 1, 1]) + arr = [a, b] + result = self.cc_instance._xor(arr) + expected = np.array([1, 1, 0]) + self.assertTrue(np.array_equal(result, expected), 'XOR operation did not produce expected result') + + def test_and_operation(self): + a = np.array([1, 0, 1]) + b = np.array([0, 1, 1]) + arr = [a, b] + result = self.cc_instance._and(arr) + expected = np.array([0, 0, 1]) + self.assertTrue(np.array_equal(result, expected), 'AND operation did not produce expected result') + + def test_or_operation(self): + a = np.array([1, 0, 1]) + b = np.array([0, 1, 1]) + arr = [a, b] + result = self.cc_instance._or(arr) + expected = np.array([1, 1, 1]) + self.assertTrue(np.array_equal(result, expected), 'OR operation did not produce expected result') + + def test_generate_labels_shape_and_type(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + labels = self.cc_instance.generate_labels(X) + self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array') + self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') + + def test_generate_labels_distribution(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + labels = self.cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5]) + unique, counts = np.unique(labels, return_counts=True) + distribution = counts / 100 + expected_distribution = np.array([0.2, 0.3, 0.5]) + self.assertTrue(np.allclose(distribution, expected_distribution, atol=0.1), 'Label distribution does not match expected distribution') + + def test_generate_labels_class_relation_linear(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + labels = self.cc_instance.generate_labels(X, class_relation='linear') + self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array') + self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') + + def test_generate_labels_class_relation_nonlinear(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + labels = self.cc_instance.generate_labels(X, class_relation='nonlinear') + 
self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array') + self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') + + def test_generate_labels_class_relation_cluster(self): + X = self.cc_instance.generate_data(n_features=5, n_samples=100) + labels = self.cc_instance.generate_labels(X, class_relation='cluster', balance=True) + self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array') + self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') + +if __name__ == '__main__': + unittest.main() From 79eb4de39200efc1548411ed0a0d6921eb5404f5 Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Tue, 9 Jul 2024 10:43:40 +0200 Subject: [PATCH 5/9] removed if __name__ == '__main__' from file, small fix in cluster test --- tests/cc_generator_tests.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py index 16cb7b2..860b079 100644 --- a/tests/cc_generator_tests.py +++ b/tests/cc_generator_tests.py @@ -157,9 +157,6 @@ def test_generate_labels_class_relation_nonlinear(self): def test_generate_labels_class_relation_cluster(self): X = self.cc_instance.generate_data(n_features=5, n_samples=100) - labels = self.cc_instance.generate_labels(X, class_relation='cluster', balance=True) + labels = self.cc_instance.generate_labels(X, class_relation='cluster') self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array') - self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') - -if __name__ == '__main__': - unittest.main() + self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') \ No newline at end of file From 1e50ee70541fdefb42999d9bbf873a67fa4b302c Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Thu, 11 Jul 2024 20:10:08 +0200 Subject: [PATCH 6/9] code review fixes renamed _feature_builder -> _configure_generate_featuer Replace np.ndarray typing with ArrayLike from numpy typing, other typing fixes --- .../synthetic_data_generators/cc_generator.py | 82 ++++++++++--------- tests/cc_generator_tests.py | 2 +- 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py index dd148a5..f370580 100644 --- a/outrank/algorithms/synthetic_data_generators/cc_generator.py +++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py @@ -1,11 +1,13 @@ from __future__ import annotations from typing import List +from typing import Literal from typing import Optional from typing import Tuple from typing import Union import numpy as np +from numpy.typing import ArrayLike from scipy.linalg import qr from scipy.stats import norm from sklearn.cluster import KMeans @@ -32,7 +34,7 @@ def generate_data( n_features: int, n_samples: int, cardinality: int = 5, - structure: list | np.ndarray | None = None, + structure: list | ArrayLike | None = None, ensure_rep: bool = False, random_values: bool | None = False, low: int | None = 0, @@ -41,7 +43,7 @@ def generate_data( ) -> np.ndarray: """ - Generates dataset based on parameters + Generates dataset based on given parameters :param n_features: number of generated features :param n_samples: number of generated samples :param cardinality: default cardinality of the dataset @@ -102,7 +104,7 @@ def generate_data( X[ix] = x ix += 1 - x = self._feature_builder( + x = self._configure_generate_feature( 
feature_attributes, n_samples, ensure_rep=ensure_rep, @@ -115,8 +117,7 @@ def generate_data( else: # Data in structure is a tuple of (list of feature indexes, feature attributes) - feature_ixs = data[0] - feature_attributes = data[1] + feature_ixs, feature_attributes = data for feature_ix in feature_ixs: # Filling out the dataset up to feature_ix @@ -133,7 +134,7 @@ def generate_data( X[ix] = x ix += 1 - x = self._feature_builder( + x = self._configure_generate_feature( feature_attributes, n_samples, ensure_rep=ensure_rep, @@ -160,9 +161,9 @@ def generate_data( return X.T - def _feature_builder( + def _configure_generate_feature( self, - feature_attributes: int | list | np.ndarray, + feature_attributes: int | list | ArrayLike, n_samples: int, ensure_rep: bool = False, random_values: bool | None = False, @@ -171,7 +172,7 @@ def _feature_builder( ) -> np.ndarray: """ - Helper function to avoid duplicate code, builds feature + Helper function, calls _generate_feature with appropriate parameters based on feature_attributes :param feature_attributes: either integer (cardinality) or list of feature attributes :param n_samples: number of samples in dataset :param ensure_rep: ensures all values are represented at least once in the feature vector @@ -216,7 +217,7 @@ def _feature_builder( def _generate_feature( self, size: int, - vec: list[int] | np.ndarray | None = None, + vec: list[int] | ArrayLike | None = None, cardinality: int = 5, ensure_rep: bool = False, random_values: bool | None = False, @@ -225,7 +226,7 @@ def _generate_feature( p: list[float] | np.ndarray | None = None, ) -> np.ndarray: """ - Generates feature vector of length size. Default probability density distribution is approx. normal, centred around a randomly picked value. + Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value. 
:param vec: list of feature values :param cardinality: single value cardinality :param size: length of feature vector @@ -264,10 +265,10 @@ def _generate_feature( def generate_combinations( self, - X: np.ndarray, - feature_indices: list[int] | np.ndarray, + X: ArrayLike, + feature_indices: list[int] | ArrayLike, combination_function: Optional = None, - combination_type: str = 'linear', + combination_type: Literal = 'linear', ) -> np.ndarray: """ Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X @@ -300,11 +301,10 @@ def generate_combinations( return np.column_stack((X, combination_result)) - def _xor(self, arr): + def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray: """ Performs bitwise XOR operation on two integer arrays - :param a: array - :param b: array + :param arr: features to perform XOR operation on :return: bitwise XOR result """ arrT = arr.T @@ -316,11 +316,10 @@ def _xor(self, arr): return out.T - def _and(self, arr): + def _and(self, arr: list[int] | ArrayLike) -> np.ndarray: """ Performs bitwise AND operation on two integer arrays - :param a: array - :param b: array + :param arr: features to perform AND operation on :return: bitwise AND result """ arrT = arr.T @@ -332,11 +331,10 @@ def _and(self, arr): return out.T - def _or(self, arr): + def _or(self, arr: list[int] | ArrayLike) -> np.ndarray: """ Performs bitwise OR operation on two integer arrays - :param a: array - :param b: array + :param arr: features to perform OR operation on :return: bitwise OR result """ arrT = arr.T @@ -350,8 +348,8 @@ def _or(self, arr): def generate_correlated( self, - X: np.ndarray, - feature_indices: list[int] | np.ndarray, + X: ArrayLike, + feature_indices: list[int] | ArrayLike, r: float = 0.8, ) -> np.ndarray: @@ -408,8 +406,8 @@ def generate_correlated( def generate_duplicates( self, - X: np.ndarray, - feature_indices: list[int] | np.ndarray, + X: ArrayLike, + feature_indices: list[int] | ArrayLike, ) -> np.ndarray: """ Generates duplicate features @@ -433,9 +431,9 @@ def generate_duplicates( def generate_labels( self, - X: np.ndarray, + X: ArrayLike, n: int = 2, - p: float | list[float] | np.ndarray = 0.5, + p: float | list[float] | ArrayLike = 0.5, k: int | float = 2, decision_function: Optional = None, class_relation: str = 'linear', @@ -528,9 +526,9 @@ def generate_labels( def _cluster_data( self, - X: np.ndarray, + X: ArrayLike, n: int, - p: float | list[float] | np.ndarray | None = 1.0, + p: float | list[float] | ArrayLike | None = 1.0, balance: bool = False, ) -> np.ndarray: """ @@ -624,10 +622,10 @@ def _cluster_data( def generate_noise( self, - X: np.ndarray, - y: list[int] | np.ndarray, + X: ArrayLike, + y: list[int] | ArrayLike, p: float = 0.2, - type: str = 'categorical', + type: Literal = 'categorical', missing_val: str | int | float = float('-inf'), ) -> np.ndarray: @@ -718,12 +716,12 @@ def generate_noise( def downsample_dataset( self, - X: np.array, - y: list[int] | np.ndarray, - N: int | None | None = None, + X: ArrayLike, + y: list[int] | ArrayLike, + N: int | None = None, seed: int = 42, reshuffle: bool = False, - ) -> tuple[np.array, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray]: """ Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset. 
@@ -777,7 +775,11 @@ def downsample_dataset( return X_downsampled, y_downsampled - def print_dataset(self, X, y): + def print_dataset( + self, + X: ArrayLike, + y: ArrayLike, + ): """ Prints given dataset :param X: dataset @@ -803,7 +805,7 @@ def summarize(self): print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}") if self.dataset_info['downsampling']: print( - f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']},to shape {self.dataset_info['downsampling']['downsampled_shape']}", + f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}", ) print(f"Number of classes: {self.dataset_info['labels']['n_class']}") print(f"Class relation: {self.dataset_info['labels']['class_relation']}") diff --git a/tests/cc_generator_tests.py b/tests/cc_generator_tests.py index 860b079..1cc0796 100644 --- a/tests/cc_generator_tests.py +++ b/tests/cc_generator_tests.py @@ -159,4 +159,4 @@ def test_generate_labels_class_relation_cluster(self): X = self.cc_instance.generate_data(n_features=5, n_samples=100) labels = self.cc_instance.generate_labels(X, class_relation='cluster') self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array') - self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') \ No newline at end of file + self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)') From fc029a26ec59e226e53d34e653e0d1c2305c9510 Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Fri, 12 Jul 2024 10:41:45 +0200 Subject: [PATCH 7/9] Added documentation for feature generation Small demo code in DOCSMAIN as well as pdoc entry --- docs/DOCSMAIN.md | 20 + docs/outrank.html | 38 +- docs/outrank/algorithms.html | 12 +- docs/outrank/algorithms/feature_ranking.html | 12 +- .../feature_ranking/ranking_mi_numba.html | 10 +- .../algorithms/importance_estimator.html | 624 ++-- docs/outrank/algorithms/sketches.html | 12 +- .../algorithms/sketches/counting_cms.html | 554 ++++ .../sketches/counting_counters_ordinary.html | 413 +++ .../sketches/counting_ultiloglog.html | 52 +- .../algorithms/synthetic_data_generators.html | 13 +- .../cc_generator.html | 2832 ++++++++++++++++ .../generator_naive.html | 128 +- docs/outrank/core_ranking.html | 2837 ++++++++--------- docs/outrank/core_selftest.html | 6 +- docs/outrank/core_utils.html | 1196 +++---- docs/outrank/feature_transformations.html | 12 +- .../feature_transformer_vault.html | 23 +- .../default_transformers.html | 93 +- .../fw_transformers.html | 89 +- .../ranking_transformers.html | 62 +- docs/outrank/task_generators.html | 14 +- docs/outrank/task_instance_ranking.html | 521 +++ docs/outrank/task_ranking.html | 10 +- docs/outrank/task_selftest.html | 32 +- docs/outrank/task_summary.html | 160 +- docs/outrank/task_visualization.html | 10 +- docs/outrank/visualizations.html | 12 +- .../visualizations/ranking_visualization.html | 18 +- docs/search.js | 4 +- 30 files changed, 7161 insertions(+), 2658 deletions(-) create mode 100644 docs/outrank/algorithms/sketches/counting_cms.html create mode 100644 docs/outrank/algorithms/sketches/counting_counters_ordinary.html create mode 100644 docs/outrank/algorithms/synthetic_data_generators/cc_generator.html create mode 100644 docs/outrank/task_instance_ranking.html diff --git a/docs/DOCSMAIN.md b/docs/DOCSMAIN.md index 1b6681a..fae8ec4 100644 --- a/docs/DOCSMAIN.md +++ b/docs/DOCSMAIN.md @@ -64,3 +64,23 @@ 
 scores = [lowest_score, medium_score, high_score]
 sorted_score_indices = np.argsort(scores)
 assert np.sum(np.array([0, 1, 2]) - sorted_score_indices) == 0
 ```
+---
+## Creating a simple dataset
+```python
+from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
+
+cc = CategoricalClassification()
+
+# Creates a simple dataset of 9 features, 10k samples, with feature cardinality of all features being 35
+X = cc.generate_data(9,
+                     10000,
+                     cardinality=35,
+                     ensure_rep=True,
+                     random_values=True,
+                     low=0,
+                     high=40)
+
+# Creates target labels via clustering
+y = cc.generate_labels(X, n=2, class_relation='cluster')
+
+```
\ No newline at end of file
diff --git a/docs/outrank.html b/docs/outrank.html
index 461d09d..6af2d42 100644
--- a/docs/outrank.html
+++ b/docs/outrank.html
@@ -3,7 +3,7 @@
 
     
-    
+    
     outrank API documentation
     
@@ -26,7 +26,10 @@

Contents

  • Welcome to OutRank's documentation!
  • Setup
  • Example use cases
  • -
  • OutRank as a Python library
  • +
  • OutRank as a Python library +
  • @@ -38,6 +41,7 @@

    Submodules

  • core_utils
  • feature_transformations
  • task_generators
  • +
  • task_instance_ranking
  • task_ranking
  • task_selftest
  • task_summary
  • @@ -129,6 +133,29 @@

    OutRank as a Python library

    assert np.sum(np.array([0, 1, 2]) - sorted_score_indices) == 0 + +
    + +

    Creating a simple dataset

    + +
    +
    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
    +
    +cc = CategoricalClassification()
    +
+# Creates a simple dataset of 9 features, 10k samples, with feature cardinality of all features being 35
    +X = cc.generate_data(9, 
    +                     10000, 
    +                     cardinality=35, 
    +                     ensure_rep=True, 
    +                     random_values=True, 
    +                     low=0, 
    +                     high=40)
    +
    +# Creates target labels via clustering
    +y = cc.generate_labels(X, n=2, class_relation='cluster')
    +
    +
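The demo above covers only feature and label generation. A possible continuation, sketched here under the assumption that `generate_correlated`, `generate_duplicates`, and `generate_noise` keep the signatures introduced in this patch and return the augmented dataset:

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(9, 10000, cardinality=35, ensure_rep=True,
                     random_values=True, low=0, high=40)

# Append a feature correlated (r ~= 0.8) with feature 0,
# then append an exact duplicate of feature 1
X = cc.generate_correlated(X, 0, r=0.8)
X = cc.generate_duplicates(X, 1)

y = cc.generate_labels(X, n=2, class_relation='cluster')

# Simulate categorical noise on roughly 20% of the dataset
# (assumed to return the noised feature matrix)
X = cc.generate_noise(X, y, p=0.2, type='categorical')
```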
    @@ -137,8 +164,9 @@

    OutRank as a Python library

    1"""
     2.. include:: ../docs/DOCSMAIN.md
    -3"""
    -4from __future__ import annotations
    +3.. include:: ../docs/generator_docs.md
    +4"""
    +5from __future__ import annotations
     
    @@ -326,4 +354,4 @@

    OutRank as a Python library

    } }); - + \ No newline at end of file diff --git a/docs/outrank/algorithms.html b/docs/outrank/algorithms.html index 94d44f0..3d656cb 100644 --- a/docs/outrank/algorithms.html +++ b/docs/outrank/algorithms.html @@ -3,7 +3,7 @@ - + outrank.algorithms API documentation @@ -49,10 +49,10 @@

    Submodules

    outrank.algorithms

    - - - - + + + + - + \ No newline at end of file diff --git a/docs/outrank/algorithms/feature_ranking.html b/docs/outrank/algorithms/feature_ranking.html index 9f0d614..9e05eec 100644 --- a/docs/outrank/algorithms/feature_ranking.html +++ b/docs/outrank/algorithms/feature_ranking.html @@ -3,7 +3,7 @@ - + outrank.algorithms.feature_ranking API documentation @@ -46,10 +46,10 @@

    Submodules

    outrank.algorithms.feature_ranking

    - - - - + + + + - + \ No newline at end of file diff --git a/docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html b/docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html index 78d3cc0..6d4bb3d 100644 --- a/docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html +++ b/docs/outrank/algorithms/feature_ranking/ranking_mi_numba.html @@ -3,7 +3,7 @@ - + outrank.algorithms.feature_ranking.ranking_mi_numba API documentation @@ -61,7 +61,7 @@

    API Documentation

    outrank.algorithms.feature_ranking.ranking_mi_numba

    - + @@ -367,7 +367,7 @@

    - +
    @@ -501,7 +501,7 @@

    - +

    @@ -734,4 +734,4 @@

    } }); - + \ No newline at end of file diff --git a/docs/outrank/algorithms/importance_estimator.html b/docs/outrank/algorithms/importance_estimator.html index 42a69fe..76a2c80 100644 --- a/docs/outrank/algorithms/importance_estimator.html +++ b/docs/outrank/algorithms/importance_estimator.html @@ -3,7 +3,7 @@ - + outrank.algorithms.importance_estimator API documentation @@ -33,6 +33,9 @@

    API Documentation

  • logger
  • +
  • + num_folds +
  • sklearn_MI
  • @@ -54,6 +57,9 @@

    API Documentation

  • get_importances_estimate_nonmyopic
  • +
  • + initialize_classifier +
  • @@ -70,7 +76,7 @@

    API Documentation

    outrank.algorithms.importance_estimator

    - + @@ -89,62 +95,62 @@

    12from scipy.stats import pearsonr 13from sklearn.feature_selection import mutual_info_classif 14from sklearn.linear_model import LogisticRegression - 15from sklearn.metrics import adjusted_mutual_info_score - 16from sklearn.model_selection import cross_val_score - 17from sklearn.preprocessing import OneHotEncoder - 18from sklearn.svm import SVC - 19 - 20logger = logging.getLogger('syn-logger') - 21logger.setLevel(logging.DEBUG) + 15from sklearn.linear_model import SGDClassifier + 16from sklearn.metrics import adjusted_mutual_info_score + 17from sklearn.model_selection import cross_val_score + 18from sklearn.preprocessing import OneHotEncoder + 19from sklearn.svm import SVC + 20 + 21from outrank.core_utils import is_prior_heuristic 22 - 23try: - 24 from outrank.algorithms.feature_ranking import ranking_mi_numba + 23logger = logging.getLogger('syn-logger') + 24logger.setLevel(logging.DEBUG) 25 - 26 numba_available = True + 26num_folds = 4 27 - 28except Exception as es: - 29 traceback.print_exc(0) - 30 numba_available = False - 31 + 28try: + 29 from outrank.algorithms.feature_ranking import ranking_mi_numba + 30 + 31 numba_available = True 32 - 33def sklearn_MI(vector_first: Any, vector_second: Any) -> float: - 34 estimate_feature_importance = mutual_info_classif( - 35 vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True, - 36 )[0] - 37 return estimate_feature_importance - 38 - 39 - 40def sklearn_surrogate( - 41 vector_first: Any, vector_second: Any, surrogate_model: str, - 42) -> float: - 43 if surrogate_model == 'surrogate-LR': - 44 clf = LogisticRegression(max_iter=100000) - 45 elif surrogate_model == 'surrogate-SVM': - 46 clf = SVC(gamma='auto', probability=True) - 47 - 48 transf = OneHotEncoder() - 49 - 50 # They do not commute, swap if needed - 51 if len(np.unique(vector_second) > 2): - 52 vector_third = vector_second - 53 vector_second = vector_first - 54 vector_first = vector_third - 55 del vector_third - 56 - 57 unique_values, counts = np.unique(vector_second, return_counts=True) - 58 - 59 # Establish min support for this type of ranking. 
- 60 if counts[0] < len(unique_values) * (2**5): - 61 estimate_feature_importance = 0 - 62 - 63 else: - 64 vector_first = transf.fit_transform(vector_first.reshape(-1, 1)) - 65 estimate_feature_importance_list = cross_val_score( - 66 clf, vector_first, vector_second, scoring='neg_log_loss', cv=4, - 67 ) - 68 - 69 estimate_feature_importance = 1 + \ - 70 np.median(estimate_feature_importance_list) + 33except Exception as es: + 34 traceback.print_exc(0) + 35 numba_available = False + 36 + 37 + 38def sklearn_MI(vector_first: Any, vector_second: Any) -> float: + 39 estimate_feature_importance = mutual_info_classif( + 40 vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True, + 41 )[0] + 42 return estimate_feature_importance + 43 + 44 + 45def sklearn_surrogate( + 46 vector_first: Any, vector_second: Any, X: Any, surrogate_model: str, + 47) -> float: + 48 + 49 clf = initialize_classifier(surrogate_model) + 50 + 51 transf = OneHotEncoder() + 52 + 53 # They do not commute, swap if needed + 54 if len(np.unique(vector_second) > 2): + 55 vector_third = vector_second + 56 vector_second = vector_first + 57 vector_first = vector_third + 58 del vector_third + 59 + 60 if X.size <= 1: + 61 X = vector_first.reshape(-1, 1) + 62 else: + 63 X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1) + 64 + 65 X = transf.fit_transform(X) + 66 estimate_feature_importance_list = cross_val_score( + 67 clf, X, vector_second, scoring='neg_log_loss', cv=num_folds, + 68 ) + 69 estimate_feature_importance = 1 + \ + 70 np.median(estimate_feature_importance_list) 71 72 return estimate_feature_importance 73 @@ -174,7 +180,7 @@

    97 return estimate_feature_importance 98 99 -100def get_importances_estimate_pairwise(combination, args, tmp_df): +100def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df): 101 """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel.""" 102 103 feature_one = combination[0] @@ -199,97 +205,113 @@

    122 estimate_feature_importance = sklearn_MI(vector_first, vector_second) 123 124 elif 'surrogate-' in args.heuristic: -125 estimate_feature_importance = sklearn_surrogate( -126 vector_first, vector_second, args.heuristic, -127 ) +125 X = np.array(float) +126 if is_prior_heuristic(args) and (len(reference_model_features) > 0): +127 X = tmp_df[reference_model_features].values 128 -129 elif 'MI-numba' in args.heuristic: -130 estimate_feature_importance = numba_mi( -131 vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio, -132 ) -133 -134 elif args.heuristic == 'AMI': -135 estimate_feature_importance = sklearn_mi_adj( -136 vector_first, vector_second, -137 ) -138 -139 elif args.heuristic == 'correlation-Pearson': -140 estimate_feature_importance = pearsonr(vector_first, vector_second)[0] -141 -142 elif args.heuristic == 'Constant': -143 estimate_feature_importance = 0.0 -144 -145 else: -146 raise ValueError( -147 'Please select one of the possible heuristics (MI, chi2)', -148 ) -149 -150 return (feature_one, feature_two, estimate_feature_importance) -151 -152 -153def rank_features_3MR( -154 relevance_dict: dict[str, float], -155 redundancy_dict: dict[tuple[Any, Any], Any], -156 relational_dict: dict[tuple[Any, Any], Any], -157 strategy: str = 'median', -158 alpha: float = 1, -159 beta: float = 1, -160) -> pd.DataFrame: -161 all_features = relevance_dict.keys() -162 most_important_feature = max( -163 relevance_dict.items(), key=operator.itemgetter(1), -164 )[0] -165 ranked_features = [most_important_feature] -166 -167 def calc_higher_order(feature, is_redundancy=True): -168 values = [] -169 for feat in ranked_features: -170 interaction_tuple = (feat, feature) -171 if is_redundancy: -172 if interaction_tuple in redundancy_dict: -173 values.append(redundancy_dict[interaction_tuple]) -174 else: -175 logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.') -176 else: -177 if interaction_tuple in relational_dict: -178 values.append(relational_dict[interaction_tuple]) -179 else: -180 logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.') -181 -182 if strategy == 'sum': -183 return sum(values) -184 if strategy == 'mean': -185 return np.mean(values) -186 return np.median(values) -187 -188 while len(ranked_features) != len(all_features): -189 top_importance = 0 -190 most_important_feature = '' +129 estimate_feature_importance = sklearn_surrogate( +130 vector_first, vector_second, X, args.heuristic, +131 ) +132 +133 elif 'MI-numba' in args.heuristic: +134 estimate_feature_importance = numba_mi( +135 vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio, +136 ) +137 +138 elif args.heuristic == 'AMI': +139 estimate_feature_importance = sklearn_mi_adj( +140 vector_first, vector_second, +141 ) +142 +143 elif args.heuristic == 'correlation-Pearson': +144 estimate_feature_importance = pearsonr(vector_first, vector_second)[0] +145 +146 elif args.heuristic == 'Constant': +147 estimate_feature_importance = 0.0 +148 +149 else: +150 raise ValueError( +151 'Please select one of the possible heuristics (MI, chi2)', +152 ) +153 +154 return (feature_one, feature_two, estimate_feature_importance) +155 +156 +157def rank_features_3MR( +158 relevance_dict: dict[str, float], +159 redundancy_dict: dict[tuple[Any, Any], Any], +160 relational_dict: dict[tuple[Any, 
Any], Any], +161 strategy: str = 'median', +162 alpha: float = 1, +163 beta: float = 1, +164) -> pd.DataFrame: +165 all_features = relevance_dict.keys() +166 most_important_feature = max( +167 relevance_dict.items(), key=operator.itemgetter(1), +168 )[0] +169 ranked_features = [most_important_feature] +170 +171 def calc_higher_order(feature, is_redundancy=True): +172 values = [] +173 for feat in ranked_features: +174 interaction_tuple = (feat, feature) +175 if is_redundancy: +176 if interaction_tuple in redundancy_dict: +177 values.append(redundancy_dict[interaction_tuple]) +178 else: +179 logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.') +180 else: +181 if interaction_tuple in relational_dict: +182 values.append(relational_dict[interaction_tuple]) +183 else: +184 logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.') +185 +186 if strategy == 'sum': +187 return sum(values) +188 if strategy == 'mean': +189 return np.mean(values) +190 return np.median(values) 191 -192 for ind, feat in enumerate(set(all_features) - set(ranked_features)): -193 feature_redundancy = calc_higher_order(feat) -194 feature_relation = calc_higher_order(feat, False) -195 feature_relevance = relevance_dict[feat] -196 importance = ( -197 feature_relevance - alpha * feature_redundancy + beta * feature_relation -198 ) -199 -200 if (importance > top_importance) or (ind == 0): -201 top_importance = importance -202 most_important_feature = feat -203 ranked_features.append(most_important_feature) -204 return pd.DataFrame( -205 { -206 'Feature': ranked_features, -207 '3mr_ranking': list(range(1, len(ranked_features) + 1)), -208 }, -209 ) -210 -211 -212def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame): -213 # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label -214 # TODO - this is to be executed directly on df - no need for parallel kernel(s) -215 pass +192 while len(ranked_features) != len(all_features): +193 top_importance = 0 +194 most_important_feature = '' +195 +196 for ind, feat in enumerate(set(all_features) - set(ranked_features)): +197 feature_redundancy = calc_higher_order(feat) +198 feature_relation = calc_higher_order(feat, False) +199 feature_relevance = relevance_dict[feat] +200 importance = ( +201 feature_relevance - alpha * feature_redundancy + beta * feature_relation +202 ) +203 +204 if (importance > top_importance) or (ind == 0): +205 top_importance = importance +206 most_important_feature = feat +207 ranked_features.append(most_important_feature) +208 return pd.DataFrame( +209 { +210 'Feature': ranked_features, +211 '3mr_ranking': list(range(1, len(ranked_features) + 1)), +212 }, +213 ) +214 +215 +216def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame): +217 # TODO - nonmyopic algorithms - tmp_df \ args.label vs. 
label +218 # TODO - this is to be executed directly on df - no need for parallel kernel(s) +219 pass +220 +221 +222def initialize_classifier(surrogate_model: str): +223 if 'surrogate-LR' in surrogate_model: +224 return LogisticRegression(max_iter=100000) +225 elif 'surrogate-SVM' in surrogate_model: +226 return SVC(gamma='auto', probability=True) +227 elif 'surrogate-SGD' in surrogate_model: +228 return SGDClassifier(max_iter=100000, loss='log_loss') +229 else: +230 logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD') +231 return SGDClassifier(max_iter=100000, loss='log_loss') @@ -299,17 +321,29 @@

    logger = <Logger syn-logger (DEBUG)> - + + + +

    +
    +
    + num_folds = +4 + +
    + + +
    - + def sklearn_MI(vector_first: Any, vector_second: Any) -> float: @@ -317,71 +351,66 @@

    -
    34def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
    -35    estimate_feature_importance = mutual_info_classif(
    -36        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
    -37    )[0]
    -38    return estimate_feature_importance
    +            
    39def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
    +40    estimate_feature_importance = mutual_info_classif(
    +41        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
    +42    )[0]
    +43    return estimate_feature_importance
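`sklearn_MI` above is a thin wrapper around scikit-learn's estimator. A minimal standalone sketch of the same computation, on made-up toy vectors:

```python
import numpy as np
from sklearn.feature_selection import mutual_info_classif

feature = np.array([0, 0, 1, 1, 2, 2, 0, 1])
label = np.array([0, 0, 1, 1, 1, 1, 0, 1])

# As in sklearn_MI: the feature becomes a single column, values are treated as discrete
mi = mutual_info_classif(feature.reshape(-1, 1), label.reshape(-1), discrete_features=True)[0]
print(mi)  # one non-negative mutual information estimate
```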
     
    - +
    - + def - sklearn_surrogate(vector_first: Any, vector_second: Any, surrogate_model: str) -> float: + sklearn_surrogate( vector_first: Any, vector_second: Any, X: Any, surrogate_model: str) -> float:
    -
    41def sklearn_surrogate(
    -42    vector_first: Any, vector_second: Any, surrogate_model: str,
    -43) -> float:
    -44    if surrogate_model == 'surrogate-LR':
    -45        clf = LogisticRegression(max_iter=100000)
    -46    elif surrogate_model == 'surrogate-SVM':
    -47        clf = SVC(gamma='auto', probability=True)
    -48
    -49    transf = OneHotEncoder()
    -50
    -51    # They do not commute, swap if needed
    -52    if len(np.unique(vector_second) > 2):
    -53        vector_third = vector_second
    -54        vector_second = vector_first
    -55        vector_first = vector_third
    -56        del vector_third
    -57
    -58    unique_values, counts = np.unique(vector_second, return_counts=True)
    -59
    -60    # Establish min support for this type of ranking.
    -61    if counts[0] < len(unique_values) * (2**5):
    -62        estimate_feature_importance = 0
    -63
    -64    else:
    -65        vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
    -66        estimate_feature_importance_list = cross_val_score(
    -67            clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
    -68        )
    -69
    -70        estimate_feature_importance = 1 + \
    -71            np.median(estimate_feature_importance_list)
    +            
    46def sklearn_surrogate(
    +47    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
    +48) -> float:
    +49
    +50    clf = initialize_classifier(surrogate_model)
    +51
    +52    transf = OneHotEncoder()
    +53
    +54    # They do not commute, swap if needed
    +55    if len(np.unique(vector_second) > 2):
    +56        vector_third = vector_second
    +57        vector_second = vector_first
    +58        vector_first = vector_third
    +59        del vector_third
    +60
    +61    if X.size <= 1:
    +62        X = vector_first.reshape(-1, 1)
    +63    else:
    +64        X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
    +65
    +66    X = transf.fit_transform(X)
    +67    estimate_feature_importance_list = cross_val_score(
    +68        clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
    +69    )
    +70    estimate_feature_importance = 1 + \
    +71        np.median(estimate_feature_importance_list)
     72
     73    return estimate_feature_importance
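The surrogate heuristic above scores a feature by how well a simple classifier predicts the target from the one-hot encoded inputs; `1 + median(neg_log_loss)` shifts the cross-validation score so that larger is better. A minimal sketch of that idea on synthetic toy data (the data and its relationship to the target are illustrative assumptions):

```python
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

rng = np.random.default_rng(0)
feature = rng.integers(0, 5, size=400)   # candidate categorical feature
target = (feature % 2 == 0).astype(int)  # target fully determined by the feature here

X = OneHotEncoder().fit_transform(feature.reshape(-1, 1))
clf = SGDClassifier(max_iter=100000, loss='log_loss')
scores = cross_val_score(clf, X, target, scoring='neg_log_loss', cv=4)

importance = 1 + np.median(scores)  # same shift as in sklearn_surrogate
print(importance)                   # close to 1.0 for a highly predictive feature
```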
     
    - +
    - + def numba_mi(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio): @@ -407,13 +436,13 @@

    - +
    - + def sklearn_mi_adj(vector_first, vector_second): @@ -430,21 +459,21 @@

    - +
    - + def - get_importances_estimate_pairwise(combination, args, tmp_df): + get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
    -
    101def get_importances_estimate_pairwise(combination, args, tmp_df):
    +            
    101def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
     102    """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
     103
     104    feature_one = combination[0]
    @@ -469,32 +498,36 @@ 

    123 estimate_feature_importance = sklearn_MI(vector_first, vector_second) 124 125 elif 'surrogate-' in args.heuristic: -126 estimate_feature_importance = sklearn_surrogate( -127 vector_first, vector_second, args.heuristic, -128 ) +126 X = np.array(float) +127 if is_prior_heuristic(args) and (len(reference_model_features) > 0): +128 X = tmp_df[reference_model_features].values 129 -130 elif 'MI-numba' in args.heuristic: -131 estimate_feature_importance = numba_mi( -132 vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio, -133 ) -134 -135 elif args.heuristic == 'AMI': -136 estimate_feature_importance = sklearn_mi_adj( -137 vector_first, vector_second, -138 ) -139 -140 elif args.heuristic == 'correlation-Pearson': -141 estimate_feature_importance = pearsonr(vector_first, vector_second)[0] -142 -143 elif args.heuristic == 'Constant': -144 estimate_feature_importance = 0.0 -145 -146 else: -147 raise ValueError( -148 'Please select one of the possible heuristics (MI, chi2)', -149 ) -150 -151 return (feature_one, feature_two, estimate_feature_importance) +130 estimate_feature_importance = sklearn_surrogate( +131 vector_first, vector_second, X, args.heuristic, +132 ) +133 +134 elif 'MI-numba' in args.heuristic: +135 estimate_feature_importance = numba_mi( +136 vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio, +137 ) +138 +139 elif args.heuristic == 'AMI': +140 estimate_feature_importance = sklearn_mi_adj( +141 vector_first, vector_second, +142 ) +143 +144 elif args.heuristic == 'correlation-Pearson': +145 estimate_feature_importance = pearsonr(vector_first, vector_second)[0] +146 +147 elif args.heuristic == 'Constant': +148 estimate_feature_importance = 0.0 +149 +150 else: +151 raise ValueError( +152 'Please select one of the possible heuristics (MI, chi2)', +153 ) +154 +155 return (feature_one, feature_two, estimate_feature_importance)

    @@ -506,7 +539,7 @@

    - + def rank_features_3MR( relevance_dict: dict[str, float], redundancy_dict: dict[tuple[typing.Any, typing.Any], typing.Any], relational_dict: dict[tuple[typing.Any, typing.Any], typing.Any], strategy: str = 'median', alpha: float = 1, beta: float = 1) -> pandas.core.frame.DataFrame: @@ -514,73 +547,73 @@

    -
    154def rank_features_3MR(
    -155    relevance_dict: dict[str, float],
    -156    redundancy_dict: dict[tuple[Any, Any], Any],
    -157    relational_dict: dict[tuple[Any, Any], Any],
    -158    strategy: str = 'median',
    -159    alpha: float = 1,
    -160    beta: float = 1,
    -161) -> pd.DataFrame:
    -162    all_features = relevance_dict.keys()
    -163    most_important_feature = max(
    -164        relevance_dict.items(), key=operator.itemgetter(1),
    -165    )[0]
    -166    ranked_features = [most_important_feature]
    -167
    -168    def calc_higher_order(feature, is_redundancy=True):
    -169        values = []
    -170        for feat in ranked_features:
    -171            interaction_tuple = (feat, feature)
    -172            if is_redundancy:
    -173                if interaction_tuple in redundancy_dict:
    -174                    values.append(redundancy_dict[interaction_tuple])
    -175                else:
    -176                    logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
    -177            else:
    -178                if interaction_tuple in relational_dict:
    -179                    values.append(relational_dict[interaction_tuple])
    -180                else:
    -181                    logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
    -182
    -183        if strategy == 'sum':
    -184            return sum(values)
    -185        if strategy == 'mean':
    -186            return np.mean(values)
    -187        return np.median(values)
    -188
    -189    while len(ranked_features) != len(all_features):
    -190        top_importance = 0
    -191        most_important_feature = ''
    +            
    158def rank_features_3MR(
    +159    relevance_dict: dict[str, float],
    +160    redundancy_dict: dict[tuple[Any, Any], Any],
    +161    relational_dict: dict[tuple[Any, Any], Any],
    +162    strategy: str = 'median',
    +163    alpha: float = 1,
    +164    beta: float = 1,
    +165) -> pd.DataFrame:
    +166    all_features = relevance_dict.keys()
    +167    most_important_feature = max(
    +168        relevance_dict.items(), key=operator.itemgetter(1),
    +169    )[0]
    +170    ranked_features = [most_important_feature]
    +171
    +172    def calc_higher_order(feature, is_redundancy=True):
    +173        values = []
    +174        for feat in ranked_features:
    +175            interaction_tuple = (feat, feature)
    +176            if is_redundancy:
    +177                if interaction_tuple in redundancy_dict:
    +178                    values.append(redundancy_dict[interaction_tuple])
    +179                else:
+180                    logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for better coverage of interactions/redundancies.')
    +181            else:
    +182                if interaction_tuple in relational_dict:
    +183                    values.append(relational_dict[interaction_tuple])
    +184                else:
+185                    logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for better coverage of interactions/redundancies.')
    +186
    +187        if strategy == 'sum':
    +188            return sum(values)
    +189        if strategy == 'mean':
    +190            return np.mean(values)
    +191        return np.median(values)
     192
    -193        for ind, feat in enumerate(set(all_features) - set(ranked_features)):
    -194            feature_redundancy = calc_higher_order(feat)
    -195            feature_relation = calc_higher_order(feat, False)
    -196            feature_relevance = relevance_dict[feat]
    -197            importance = (
    -198                feature_relevance - alpha * feature_redundancy + beta * feature_relation
    -199            )
    -200
    -201            if (importance > top_importance) or (ind == 0):
    -202                top_importance = importance
    -203                most_important_feature = feat
    -204        ranked_features.append(most_important_feature)
    -205    return pd.DataFrame(
    -206        {
    -207            'Feature': ranked_features,
    -208            '3mr_ranking': list(range(1, len(ranked_features) + 1)),
    -209        },
    -210    )
    +193    while len(ranked_features) != len(all_features):
    +194        top_importance = 0
    +195        most_important_feature = ''
    +196
    +197        for ind, feat in enumerate(set(all_features) - set(ranked_features)):
    +198            feature_redundancy = calc_higher_order(feat)
    +199            feature_relation = calc_higher_order(feat, False)
    +200            feature_relevance = relevance_dict[feat]
    +201            importance = (
    +202                feature_relevance - alpha * feature_redundancy + beta * feature_relation
    +203            )
    +204
    +205            if (importance > top_importance) or (ind == 0):
    +206                top_importance = importance
    +207                most_important_feature = feat
    +208        ranked_features.append(most_important_feature)
    +209    return pd.DataFrame(
    +210        {
    +211            'Feature': ranked_features,
    +212            '3mr_ranking': list(range(1, len(ranked_features) + 1)),
    +213        },
    +214    )
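
A minimal sketch of driving rank_features_3MR, assuming the function above is in scope; the toy relevance/redundancy/relation dictionaries below are illustrative only and not part of this patch:

    import logging
    import operator

    import numpy as np
    import pandas as pd

    feats = ['f1', 'f2', 'f3']
    relevance_dict = {'f1': 0.9, 'f2': 0.4, 'f3': 0.2}
    redundancy_dict = {(a, b): 0.1 for a in feats for b in feats if a != b}
    relational_dict = {(a, b): 0.05 for a in feats for b in feats if a != b}

    # Greedily ranks features by relevance - alpha * redundancy + beta * relation
    ranking = rank_features_3MR(relevance_dict, redundancy_dict, relational_dict, strategy='median')
    print(ranking)  # DataFrame with 'Feature' and '3mr_ranking' columns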
     
    def get_importances_estimate_nonmyopic(args: Any, tmp_df: pandas.core.frame.DataFrame):
@@ -588,14 +621,41 @@

    -
    213def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
    -214    # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
    -215    # TODO - this is to be executed directly on df - no need for parallel kernel(s)
    -216    pass
    +            
    217def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
    +218    # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
    +219    # TODO - this is to be executed directly on df - no need for parallel kernel(s)
    +220    pass
     
+    def initialize_classifier(surrogate_model: str):
    223def initialize_classifier(surrogate_model: str):
    +224    if 'surrogate-LR' in surrogate_model:
    +225        return LogisticRegression(max_iter=100000)
    +226    elif 'surrogate-SVM' in surrogate_model:
    +227        return SVC(gamma='auto', probability=True)
    +228    elif 'surrogate-SGD' in surrogate_model:
    +229        return SGDClassifier(max_iter=100000, loss='log_loss')
    +230    else:
    +231        logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
    +232        return SGDClassifier(max_iter=100000, loss='log_loss')
    +
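
A quick sketch of initialize_classifier's dispatch and fallback, assuming the function above is in scope; 'surrogate-XGBoost' is just an arbitrary unsupported value used to trigger the default branch:

    clf_lr = initialize_classifier('surrogate-LR')        # LogisticRegression(max_iter=100000)
    clf_svm = initialize_classifier('surrogate-SVM')      # SVC(gamma='auto', probability=True)
    clf_sgd = initialize_classifier('surrogate-SGD')      # SGDClassifier with log_loss
    clf_def = initialize_classifier('surrogate-XGBoost')  # logs a warning, falls back to SGDClassifier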
    @@ -781,4 +841,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/algorithms/sketches.html b/docs/outrank/algorithms/sketches.html
index a83e258..1c538f9 100644
--- a/docs/outrank/algorithms/sketches.html
+++ b/docs/outrank/algorithms/sketches.html
@@ -3,7 +3,7 @@
     outrank.algorithms.sketches API documentation
@@ -48,10 +48,10 @@

    Submodules

    outrank.algorithms.sketches

\ No newline at end of file
diff --git a/docs/outrank/algorithms/sketches/counting_cms.html b/docs/outrank/algorithms/sketches/counting_cms.html
new file mode 100644
index 0000000..de254d1
--- /dev/null
+++ b/docs/outrank/algorithms/sketches/counting_cms.html
@@ -0,0 +1,554 @@
+    outrank.algorithms.sketches.counting_cms API documentation
    +
    +

    +outrank.algorithms.sketches.counting_cms

     1from __future__ import annotations
    + 2
    + 3import sys
    + 4from collections import Counter
    + 5
    + 6import numpy as np
    + 7from numba import njit
    + 8from numba import prange
    + 9
    +10
    +11@njit
    +12def cms_hash(x, seed, width):
    +13    x_hash = np.uint32(hash(x))
    +14    return (x_hash + seed) % width
    +15
    +16class CountMinSketch:
    +17    """
    +18    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.
    +19    """
    +20
    +21    def __init__(self, depth=6, width=2**15, M=None):
    +22        self.depth = depth
    +23        self.width = width
    +24        self.hash_seeds = np.array(np.random.randint(low=0, high=2**31 - 1, size=depth), dtype=np.uint32)
    +25        self.M = np.zeros((depth, width), dtype=np.int32) if M is None else M
    +26
    +27    @staticmethod
    +28    @njit
    +29    def _add(M, x, depth, width, hash_seeds, delta=1):
    +30        for i in prange(depth):
    +31            location = cms_hash(x, hash_seeds[i], width)
    +32            M[i, location] += delta
    +33
    +34    def add(self, x, delta=1):
    +35        CountMinSketch._add(self.M, x, self.depth, self.width, self.hash_seeds, delta)
    +36
    +37    def batch_add(self, lst, delta=1):
    +38        for x in lst:
    +39            self.add(x, delta)
    +40
    +41    def query(self, x):
    +42        return min(self.M[i][cms_hash(x, self.hash_seeds[i], self.width)] for i in range(self.depth))
    +43
    +44    def get_matrix(self):
    +45        return self.M
    +46
    +47
    +48if __name__ == '__main__':
    +49    from collections import Counter
    +50
    +51    depth = 8
    +52    width = 2**22
    +53    cms = CountMinSketch(depth, width)
    +54
    +55    items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 1000
    +56    cms.batch_add(items)  # Use the batch_add function
    +57
    +58    print(cms.query(3))  # Query for frequency estimates
    +59    print(cms.query(1))
    +60    print(cms.query(2))
    +61    print(cms.query(4))
    +62    print(cms.query(5))
    +63
    +64    print(Counter(items))  # Print the exact counts for comparison
    +
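
A count-min sketch never undercounts: a query takes the minimum cell across the depth independent hash rows, so collisions can only inflate the estimate. A minimal sketch against the class above (import path as added in this patch):

    from outrank.algorithms.sketches.counting_cms import CountMinSketch

    cms = CountMinSketch(depth=6, width=2**15)
    cms.batch_add([1, 1, 2, 1, 3])
    assert cms.query(1) >= 3  # exact count is 3; hash collisions can only add to it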
+    @njit
+    def cms_hash(x, seed, width):
    12@njit
    +13def cms_hash(x, seed, width):
    +14    x_hash = np.uint32(hash(x))
    +15    return (x_hash + seed) % width
    +
+    class CountMinSketch:
    17class CountMinSketch:
    +18    """
    +19    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.
    +20    """
    +21
    +22    def __init__(self, depth=6, width=2**15, M=None):
    +23        self.depth = depth
    +24        self.width = width
    +25        self.hash_seeds = np.array(np.random.randint(low=0, high=2**31 - 1, size=depth), dtype=np.uint32)
    +26        self.M = np.zeros((depth, width), dtype=np.int32) if M is None else M
    +27
    +28    @staticmethod
    +29    @njit
    +30    def _add(M, x, depth, width, hash_seeds, delta=1):
    +31        for i in prange(depth):
    +32            location = cms_hash(x, hash_seeds[i], width)
    +33            M[i, location] += delta
    +34
    +35    def add(self, x, delta=1):
    +36        CountMinSketch._add(self.M, x, self.depth, self.width, self.hash_seeds, delta)
    +37
    +38    def batch_add(self, lst, delta=1):
    +39        for x in lst:
    +40            self.add(x, delta)
    +41
    +42    def query(self, x):
    +43        return min(self.M[i][cms_hash(x, self.hash_seeds[i], self.width)] for i in range(self.depth))
    +44
    +45    def get_matrix(self):
    +46        return self.M
    +

    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.

+    CountMinSketch(depth=6, width=32768, M=None)
    22    def __init__(self, depth=6, width=2**15, M=None):
    +23        self.depth = depth
    +24        self.width = width
    +25        self.hash_seeds = np.array(np.random.randint(low=0, high=2**31 - 1, size=depth), dtype=np.uint32)
    +26        self.M = np.zeros((depth, width), dtype=np.int32) if M is None else M
    +
+    depth
+    width
+    hash_seeds
+    M
+    def add(self, x, delta=1):
    35    def add(self, x, delta=1):
    +36        CountMinSketch._add(self.M, x, self.depth, self.width, self.hash_seeds, delta)
+
+    def batch_add(self, lst, delta=1):
    38    def batch_add(self, lst, delta=1):
    +39        for x in lst:
    +40            self.add(x, delta)
+
+    def query(self, x):
    42    def query(self, x):
    +43        return min(self.M[i][cms_hash(x, self.hash_seeds[i], self.width)] for i in range(self.depth))
+
+    def get_matrix(self):
    45    def get_matrix(self):
    +46        return self.M
    +
\ No newline at end of file
diff --git a/docs/outrank/algorithms/sketches/counting_counters_ordinary.html b/docs/outrank/algorithms/sketches/counting_counters_ordinary.html
new file mode 100644
index 0000000..59d9477
--- /dev/null
+++ b/docs/outrank/algorithms/sketches/counting_counters_ordinary.html
@@ -0,0 +1,413 @@
+    outrank.algorithms.sketches.counting_counters_ordinary API documentation
    +
    +

    +outrank.algorithms.sketches.counting_counters_ordinary

     1from __future__ import annotations
    + 2
    + 3from collections import Counter
    + 4
    + 5
    + 6class PrimitiveConstrainedCounter:
    + 7    """
+ 8    A primitive memory-constrained counter built on collections.Counter; counting stops once the number of distinct keys reaches the given bound.
    + 9    """
    +10
    +11    def __init__(self, bound: int=(10**4) * 3):
    +12        self.max_bound_thr = bound
    +13        self.default_counter: Counter = Counter()
    +14
    +15    def batch_add(self, lst):
    +16        if len(self.default_counter) < self.max_bound_thr:
    +17            self.default_counter = self.default_counter + Counter(lst)
    +18
    +19    def add(self, val):
    +20        if len(self.default_counter) < self.max_bound_thr:
    +21            self.default_counter[val] += 1
    +22
    +23
    +24if __name__ == '__main__':
    +25    from collections import Counter
    +26
    +27    depth = 8
    +28    width = 2**22
    +29    import numpy as np
    +30    cms = PrimitiveConstrainedCounter()
    +31
    +32    items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 10000
    +33    cms.batch_add(items)  # Use the batch_add function
    +34
    +35    print(Counter(items))  # Print the exact counts for comparison
    +
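
The bounded counter above simply stops counting once the number of distinct keys reaches the bound; a minimal sketch (import path as added in this patch):

    from outrank.algorithms.sketches.counting_counters_ordinary import PrimitiveConstrainedCounter

    counter = PrimitiveConstrainedCounter(bound=2)
    counter.batch_add(['a', 'b'])
    counter.add('c')  # two distinct keys already stored, so 'c' is dropped
    print(counter.default_counter)  # Counter({'a': 1, 'b': 1})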
+    class PrimitiveConstrainedCounter:
     7class PrimitiveConstrainedCounter:
    + 8    """
+ 9    A primitive memory-constrained counter built on collections.Counter; counting stops once the number of distinct keys reaches the given bound.
    +10    """
    +11
    +12    def __init__(self, bound: int=(10**4) * 3):
    +13        self.max_bound_thr = bound
    +14        self.default_counter: Counter = Counter()
    +15
    +16    def batch_add(self, lst):
    +17        if len(self.default_counter) < self.max_bound_thr:
    +18            self.default_counter = self.default_counter + Counter(lst)
    +19
    +20    def add(self, val):
    +21        if len(self.default_counter) < self.max_bound_thr:
    +22            self.default_counter[val] += 1
    +

A primitive memory-constrained counter built on collections.Counter; counting stops once the number of distinct keys reaches the given bound.

+    PrimitiveConstrainedCounter(bound: int = 30000)
    12    def __init__(self, bound: int=(10**4) * 3):
    +13        self.max_bound_thr = bound
    +14        self.default_counter: Counter = Counter()
    +
+    max_bound_thr
+    default_counter: collections.Counter
+    def batch_add(self, lst):
    16    def batch_add(self, lst):
    +17        if len(self.default_counter) < self.max_bound_thr:
    +18            self.default_counter = self.default_counter + Counter(lst)
+
+    def add(self, val):
    20    def add(self, val):
    +21        if len(self.default_counter) < self.max_bound_thr:
    +22            self.default_counter[val] += 1
    +
\ No newline at end of file
diff --git a/docs/outrank/algorithms/sketches/counting_ultiloglog.html b/docs/outrank/algorithms/sketches/counting_ultiloglog.html
index d326f96..5ce74b8 100644
--- a/docs/outrank/algorithms/sketches/counting_ultiloglog.html
+++ b/docs/outrank/algorithms/sketches/counting_ultiloglog.html
@@ -3,7 +3,7 @@
     outrank.algorithms.sketches.counting_ultiloglog API documentation
@@ -267,7 +267,7 @@

     class HyperLogLogWCache:
@@ -328,12 +328,12 @@
     HyperLogLogWCache(error_rate=0.005)
@@ -351,79 +351,79 @@
     p
     m
     warmup_set
     warmup_size
     width
     hll_flag
     def add(self, value):
@@ -446,7 +446,7 @@
    @@ -633,4 +633,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/algorithms/synthetic_data_generators.html b/docs/outrank/algorithms/synthetic_data_generators.html
index 0a2063c..b72db12 100644
--- a/docs/outrank/algorithms/synthetic_data_generators.html
+++ b/docs/outrank/algorithms/synthetic_data_generators.html
@@ -3,7 +3,7 @@
     outrank.algorithms.synthetic_data_generators API documentation
@@ -29,6 +29,7 @@

    Submodules

    @@ -46,10 +47,10 @@

    Submodules

    outrank.algorithms.synthetic_data_generators

\ No newline at end of file
diff --git a/docs/outrank/algorithms/synthetic_data_generators/cc_generator.html b/docs/outrank/algorithms/synthetic_data_generators/cc_generator.html
new file mode 100644
index 0000000..7d97838
--- /dev/null
+++ b/docs/outrank/algorithms/synthetic_data_generators/cc_generator.html
@@ -0,0 +1,2832 @@
+    outrank.algorithms.synthetic_data_generators.cc_generator API documentation
    +
    +

    +outrank.algorithms.synthetic_data_generators.cc_generator

      1from __future__ import annotations
    +  2
    +  3from typing import List
    +  4from typing import Literal
    +  5from typing import Optional
    +  6from typing import Tuple
    +  7from typing import Union
    +  8
    +  9import numpy as np
    + 10from numpy.typing import ArrayLike
    + 11from scipy.linalg import qr
    + 12from scipy.stats import norm
    + 13from sklearn.cluster import KMeans
    + 14from sklearn.utils import resample
    + 15
    + 16
    + 17class CategoricalClassification:
    + 18
    + 19    def __init__(self):
    + 20        self.dataset_info = {
    + 21            'general': {},
    + 22            'combinations': [],
    + 23            'correlations': [],
    + 24            'duplicates': [],
    + 25            'labels': [],
    + 26            'noise': [],
    + 27        }
    + 28
    + 29    def __repr__(self):
    + 30        return f"CategoricalClassification(dataset_info={self.dataset_info})"
    + 31
    + 32    def generate_data(
    + 33        self,
    + 34        n_features: int,
    + 35        n_samples: int,
    + 36        cardinality: int = 5,
    + 37        structure: list | ArrayLike | None = None,
    + 38        ensure_rep: bool = False,
    + 39        random_values: bool | None = False,
    + 40        low: int | None = 0,
    + 41        high: int | None = 1000,
    + 42        seed: int = 42,
    + 43    ) -> np.ndarray:
    + 44
    + 45        """
    + 46        Generates dataset based on given parameters
    + 47        :param n_features: number of generated features
    + 48        :param n_samples: number of generated samples
    + 49        :param cardinality: default cardinality of the dataset
    + 50        :param structure: structure of the dataset
    + 51        :param ensure_rep: flag, ensures all given values represented
+ 52        :param random_values: flag, enables random (integer) feature values from the interval [low, high]
+ 53        :param low: sets lower bound of random feature values
+ 54        :param high: sets upper bound of random feature values
    + 55        :param seed: sets seed of numpy random
    + 56        :return: X, 2D dataset
    + 57        """
    + 58
    + 59        self.dataset_info.update({
    + 60            'general': {
    + 61                'n_features': n_features,
    + 62                'n_samples': n_samples,
    + 63                'cardinality': cardinality,
    + 64                'structure': structure,
    + 65                'ensure_rep': ensure_rep,
    + 66                'seed': seed,
    + 67            },
    + 68        })
    + 69
    + 70        np.random.seed(seed)
    + 71        X = np.empty([n_features, n_samples])
    + 72
    + 73        if structure is None:
    + 74            # No specific structure parameter passed
    + 75            for i in range(n_features):
    + 76                x = self._generate_feature(
    + 77                    n_samples,
    + 78                    cardinality=cardinality,
    + 79                    ensure_rep=ensure_rep,
    + 80                    random_values=random_values,
    + 81                    low=low,
    + 82                    high=high,
    + 83                )
    + 84                X[i] = x
    + 85        else:
    + 86            # Structure parameter passed, building based on structure
    + 87            ix = 0
    + 88            for data in structure:
    + 89                if not isinstance(data[0], (list, np.ndarray)):
    + 90                    # Data in structure is a tuple of (feature index (integer), feature attributes)
    + 91                    feature_ix, feature_attributes = data
    + 92
    + 93                    if ix < feature_ix:
    + 94                        # Filling out the dataset up to column index feature_ix
    + 95                        for i in range(ix, feature_ix):
    + 96                            x = self._generate_feature(
    + 97                                n_samples,
    + 98                                cardinality=cardinality,
    + 99                                ensure_rep=ensure_rep,
    +100                                random_values=random_values,
    +101                                low=low,
    +102                                high=high,
    +103                            )
    +104                            X[ix] = x
    +105                            ix += 1
    +106
    +107                    x = self._configure_generate_feature(
    +108                        feature_attributes,
    +109                        n_samples,
    +110                        ensure_rep=ensure_rep,
    +111                        random_values=random_values,
    +112                        low=low,
    +113                        high=high,
    +114                    )
    +115                    X[ix] = x
    +116                    ix += 1
    +117
    +118                else:
    +119                    # Data in structure is a tuple of (list of feature indexes, feature attributes)
    +120                    feature_ixs, feature_attributes = data
    +121
    +122                    for feature_ix in feature_ixs:
    +123                        # Filling out the dataset up to feature_ix
    +124                        if ix < feature_ix:
    +125                            for i in range(ix, feature_ix):
    +126                                x = self._generate_feature(
    +127                                    n_samples,
    +128                                    cardinality=cardinality,
    +129                                    ensure_rep=ensure_rep,
    +130                                    random_values=random_values,
    +131                                    low=low,
    +132                                    high=high,
    +133                                )
    +134                                X[ix] = x
    +135                                ix += 1
    +136
    +137                        x = self._configure_generate_feature(
    +138                            feature_attributes,
    +139                            n_samples,
    +140                            ensure_rep=ensure_rep,
    +141                            random_values=random_values,
    +142                            low=low,
    +143                            high=high,
    +144                        )
    +145
    +146                        X[ix] = x
    +147                        ix += 1
    +148
    +149            if ix < n_features:
    +150                # Fill out the rest of the dataset
    +151                for i in range(ix, n_features):
    +152                    x = self._generate_feature(
    +153                        n_samples,
    +154                        cardinality=cardinality,
    +155                        ensure_rep=ensure_rep,
    +156                        random_values=random_values,
    +157                        low=low,
    +158                        high=high,
    +159                    )
    +160                    X[i] = x
    +161
    +162        return X.T
    +163
    +164    def _configure_generate_feature(
    +165        self,
    +166        feature_attributes: int | list | ArrayLike,
    +167        n_samples: int,
    +168        ensure_rep: bool = False,
    +169        random_values: bool | None = False,
    +170        low: int | None = 0,
    +171        high: int | None = 1000,
    +172    ) -> np.ndarray:
    +173
    +174        """
    +175        Helper function, calls _generate_feature with appropriate parameters based on feature_attributes
    +176        :param feature_attributes: either integer (cardinality) or list of feature attributes
    +177        :param n_samples: number of samples in dataset
    +178        :param ensure_rep: ensures all values are represented at least once in the feature vector
+179        :param random_values: randomly picked values for vec if true, otherwise values range from low to low + cardinality - 1 in steps of 1
    +180        :param low: lower bound of random feature vector values
    +181        :param high: upper bound of random feature vector values
    +182        :return: feature vector
    +183        """
    +184
    +185        if not isinstance(feature_attributes, (list, np.ndarray)):
    +186            # feature_cardinality is just an integer, generate feature either with random values or
    +187            # [low, low+cardinality]
    +188            x = self._generate_feature(
    +189                n_samples,
    +190                cardinality=feature_attributes,
    +191                ensure_rep=ensure_rep,
    +192                random_values=random_values,
    +193                low=low,
    +194                high=high,
    +195            )
    +196        else:
    +197            # feature_cardinality is a list of [value_domain, value_frequencies]
    +198            if isinstance(feature_attributes[0], (list, np.ndarray)):
    +199                value_domain, value_frequencies = feature_attributes
    +200                x = self._generate_feature(
    +201                    n_samples,
    +202                    vec=value_domain,
    +203                    ensure_rep=ensure_rep,
    +204                    p=value_frequencies,
    +205                )
    +206            else:
    +207                # feature_cardinality is value_domain (list of values for feature)
    +208                value_domain = feature_attributes
    +209                x = self._generate_feature(
    +210                    n_samples,
    +211                    vec=value_domain,
    +212                    ensure_rep=ensure_rep,
    +213                )
    +214
    +215        return x
    +216
    +217    def _generate_feature(
    +218        self,
    +219        size: int,
    +220        vec: list[int] | ArrayLike | None = None,
    +221        cardinality: int = 5,
    +222        ensure_rep: bool = False,
    +223        random_values: bool | None = False,
    +224        low: int | None = 0,
    +225        high: int | None = 1000,
    +226        p: list[float] | np.ndarray | None = None,
    +227    ) -> np.ndarray:
    +228        """
    +229        Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value.
    +230        :param vec: list of feature values
    +231        :param cardinality: single value cardinality
    +232        :param size: length of feature vector
    +233        :param ensure_rep: ensures all values are represented at least once in the feature vector
+234        :param random_values: randomly picked values for vec if true, otherwise values range from low to low + cardinality - 1 in steps of 1
    +235        :param low: lower bound of random feature vector values
    +236        :param high: upper bound of random feature vector values
    +237        :param p: list of probabilities of each value
    +238        :return: feature vector x
    +239        """
    +240
    +241        if vec is None:
    +242            if random_values:
    +243                vec = np.random.choice(range(low, high + 1), cardinality, replace=False)
    +244            else:
    +245                vec = np.arange(low, low + cardinality, 1)
    +246        else:
    +247            vec = np.array(vec)
    +248
    +249        if p is None:
    +250            v_shift = vec - vec[np.random.randint(len(vec))]
    +251            p = norm.pdf(v_shift, scale=3)
    +252        else:
    +253            p = np.array(p)
    +254
    +255        p = p / p.sum()
    +256
    +257        if ensure_rep and len(vec) < size:
    +258            sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p)
    +259            sampled_values = np.append(sampled_values, vec)
    +260        else:
    +261            sampled_values = np.random.choice(vec, size=size, p=p)
    +262
    +263        np.random.shuffle(sampled_values)
    +264        return sampled_values
    +265
    +266    def generate_combinations(
    +267        self,
    +268        X: ArrayLike,
    +269        feature_indices: list[int] | ArrayLike,
    +270        combination_function: Optional = None,
    +271        combination_type: Literal = 'linear',
    +272    ) -> np.ndarray:
    +273        """
    +274        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
    +275        :param X: dataset
    +276        :param feature_indices: indexes of features to be in combination
    +277        :param combination_function: optional custom function for combining feature vectors
+278        :param combination_type: string flag, either linear or nonlinear, defining combination type
    +279        :return: X with added resultant feature
    +280        """
    +281
    +282        selected_features = X[:, feature_indices]
    +283
    +284        if combination_function is None:
    +285            if combination_type == 'linear':
    +286                combination_function = lambda x: np.sum(x, axis=1)
    +287            elif combination_type == 'nonlinear':
    +288                combination_function = lambda x: np.sin(np.sum(x, axis=1))
    +289        else:
    +290            combination_type = str(combination_function.__name__)
    +291
    +292        combination_result = combination_function(selected_features)
    +293
    +294        combination_ix = len(X[0])
    +295
    +296        self.dataset_info['combinations'].append({
    +297            'feature_indices': feature_indices,
    +298            'combination_type': combination_type,
    +299            'combination_ix': combination_ix,
    +300        })
    +301
    +302        return np.column_stack((X, combination_result))
    +303
    +304    def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +305        """
    +306        Performs bitwise XOR operation on two integer arrays
    +307        :param arr: features to perform XOR operation on
    +308        :return: bitwise XOR result
    +309        """
    +310        arrT = arr.T
    +311        arrT = arrT.astype(int)
    +312        out = np.bitwise_xor(arrT[0], arrT[1])
    +313        if len(arrT) > 2:
    +314            for i in range(2, len(arrT)):
    +315                out = np.bitwise_xor(out, arrT[i])
    +316
    +317        return out.T
    +318
    +319    def _and(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +320        """
    +321        Performs bitwise AND operation on two integer arrays
    +322        :param arr: features to perform AND operation on
    +323        :return: bitwise AND result
    +324        """
    +325        arrT = arr.T
    +326        arrT = arrT.astype(int)
+327        out = np.bitwise_and(arrT[0], arrT[1])
    +328        if len(arrT) > 2:
    +329            for i in range(2, len(arrT)):
    +330                out = np.bitwise_and(out, arrT[i])
    +331
    +332        return out.T
    +333
    +334    def _or(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +335        """
    +336        Performs bitwise OR operation on two integer arrays
    +337        :param arr: features to perform OR operation on
    +338        :return: bitwise OR result
    +339        """
    +340        arrT = arr.T
    +341        arrT = arrT.astype(int)
+342        out = np.bitwise_or(arrT[0], arrT[1])
    +343        if len(arrT) > 2:
    +344            for i in range(2, len(arrT)):
    +345                out = np.bitwise_or(out, arrT[i])
    +346
    +347        return out.T
    +348
    +349    def generate_correlated(
    +350        self,
    +351        X: ArrayLike,
    +352        feature_indices: list[int] | ArrayLike,
    +353        r: float = 0.8,
    +354    ) -> np.ndarray:
    +355
    +356        """
    +357        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
    +358        :param X: dataset
    +359        :param feature_indices: indices of features to generate correlated feature to
    +360        :param r: (Pearson) correlation factor
+361        :return: X with generated correlated features
    +362        """
    +363
    +364        if not isinstance(feature_indices, (list, np.ndarray)):
    +365            feature_indices = np.array([feature_indices])
    +366
    +367        if len(feature_indices) > 1:
    +368            correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
    +369        else:
    +370            correlated_ixs = len(X[0])
    +371
    +372        selected_features = X[:, feature_indices]
    +373        transposed = np.transpose(selected_features)
    +374        correlated_features = []
    +375
    +376        for t in transposed:
    +377            theta = np.arccos(r)
    +378            t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
    +379
    +380            rand = np.random.normal(0, 1, len(t_standard))
    +381            rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
    +382
    +383            M = np.column_stack((t_standard, rand))
    +384            M_centred = (M - np.mean(M, axis=0))
    +385
    +386            Id = np.eye(len(t))
    +387            Q = qr(M_centred[:, [0]], mode='economic')[0]
    +388            P = np.dot(Q, Q.T)
    +389            orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
    +390            M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
    +391
    +392            Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
    +393            corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
    +394
    +395            correlated_features.append(corr)
    +396
    +397        correlated_features = np.transpose(correlated_features)
    +398
    +399        self.dataset_info['correlations'].append({
    +400            'feature_indices': feature_indices,
    +401            'correlated_indices': correlated_ixs,
    +402            'correlation_factor': r,
    +403        })
    +404
    +405        return np.column_stack((X, correlated_features))
    +406
    +407    def generate_duplicates(
    +408        self,
    +409        X: ArrayLike,
    +410        feature_indices: list[int] | ArrayLike,
    +411    ) -> np.ndarray:
    +412        """
    +413        Generates duplicate features
    +414        :param X: dataset
    +415        :param feature_indices: indices of features to duplicate
    +416        :return: dataset with duplicated features
    +417        """
    +418        if not isinstance(feature_indices, (list, np.ndarray)):
    +419            feature_indices = np.array([feature_indices])
    +420
+421        duplicated_ixs = np.arange(len(X[0]), len(X[0]) + len(feature_indices), 1)  # one new index per duplicated feature
    +422
    +423        selected_features = X[:, feature_indices]
    +424
    +425        self.dataset_info['duplicates'].append({
    +426            'feature_indices': feature_indices,
    +427            'duplicate_indices': duplicated_ixs,
    +428        })
    +429
    +430        return np.column_stack((X, selected_features))
    +431
    +432    def generate_labels(
    +433        self,
    +434        X: ArrayLike,
    +435        n: int = 2,
    +436        p: float | list[float] | ArrayLike = 0.5,
    +437        k: int | float = 2,
    +438        decision_function: Optional = None,
    +439        class_relation: str = 'linear',
    +440        balance: bool = False,
    +441    ):
    +442        """
    +443        Generates labels for dataset X
    +444        :param X: dataset
    +445        :param n: number of class labels
    +446        :param p: class distribution
    +447        :param k: constant
    +448        :param decision_function: optional user-defined decision function
    +449        :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    +450        :param balance: boolean, whether to balance clustering class labels
    +451        :return: array of labels, corresponding to dataset X
    +452        """
    +453
+454        if isinstance(p, (list, np.ndarray)):
+455            if sum(p) > 1: raise ValueError('sum of values in p must not exceed 1.0')
+456            if len(p) != n: raise ValueError('length of p must equal n')
+457
+458        if not isinstance(p, (list, np.ndarray)) and p > 1: raise ValueError('p must be less than 1.0')
    +459
    +460        n_samples, n_features = X.shape
    +461
    +462        if decision_function is None:
    +463            if class_relation == 'linear':
    +464                decision_function = lambda x: np.sum(2 * x + 3, axis=1)
    +465            elif class_relation == 'nonlinear':
    +466                decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
    +467            elif class_relation == 'cluster':
    +468                decision_function = None
    +469        else:
    +470            class_relation = str(decision_function.__name__)
    +471
    +472        y = []
    +473        if decision_function is not None:
    +474            if n > 2:
    +475                if type(p) != list:
    +476                    p = 1 / n
    +477                    percentiles = [p * 100]
    +478                    for i in range(1, n - 1):
    +479                        percentiles.append(percentiles[i - 1] + (p * 100))
    +480
    +481                    decision_boundary = decision_function(X)
    +482                    p_points = np.percentile(decision_boundary, percentiles)
    +483
    +484                    y = np.zeros_like(decision_boundary, dtype=int)
    +485                    for p_point in p_points:
    +486                        y += (decision_boundary > p_point)
    +487                else:
    +488                    decision_boundary = decision_function(X)
    +489                    percentiles = [x * 100 for x in p]
    +490
    +491                    for i in range(1, len(percentiles) - 1):
    +492                        percentiles[i] += percentiles[i - 1]
    +493
    +494                    percentiles.insert(0, 0)
    +495                    percentiles.pop()
    +496                    print(percentiles)
    +497
    +498                    p_points = np.percentile(decision_boundary, percentiles)
    +499                    print(p_points)
    +500
    +501                    y = np.zeros_like(decision_boundary, dtype=int)
    +502                    for i in range(1, n):
    +503                        p_point = p_points[i]
    +504                        for j in range(len(decision_boundary)):
    +505                            if decision_boundary[j] > p_point:
    +506                                y[j] += 1
    +507            else:
    +508                decision_boundary = decision_function(X)
    +509                p_point = np.percentile(decision_boundary, p * 100)
    +510                y = np.where(decision_boundary > p_point, 1, 0)
    +511        else:
    +512            if p == 0.5:
    +513                p = 1.0
    +514            else:
    +515                p = [p, 1 - p]
    +516            y = self._cluster_data(X, n, p=p, balance=balance)
    +517
    +518        self.dataset_info.update({
    +519            'labels': {
    +520                'class_relation': class_relation,
    +521                'n_class': n,
    +522            },
    +523        })
    +524
    +525        return y
    +526
    +527    def _cluster_data(
    +528        self,
    +529        X: ArrayLike,
    +530        n: int,
    +531        p: float | list[float] | ArrayLike | None = 1.0,
    +532        balance: bool = False,
    +533    ) -> np.ndarray:
    +534        """
    +535        Cluster data using kmeans
    +536        :param X: dataset
    +537        :param n: number of clusters
    +538        :param p: class distribution
    +539        :param balance: balance the clusters according to p
    +540        :return: array of labels, corresponding to dataset X
    +541        """
    +542
    +543        kmeans = KMeans(n_clusters=n)
    +544
    +545        kmeans.fit(X)
    +546
    +547        cluster_labels = kmeans.labels_
    +548
    +549        if not isinstance(p, (list, np.ndarray)):  # Fully balanced clusters
    +550            samples_per_cluster = [len(X) // n] * n
    +551        else:
    +552            samples = len(X)
    +553            samples_per_cluster = []
    +554            if not isinstance(p, (list, np.ndarray)):
    +555                samples_per_cluster.append(int(samples * p) // n)
    +556                samples_per_cluster.append(int(samples * (1 - p)) // n)
    +557            else:
    +558                if len(p) == n:
    +559                    for val in p:
    +560                        samples_per_cluster.append(int(samples * val))
    +561                else:
    +562                    raise Exception('Length of balance parameter must equal number of clusters.')
    +563
    +564        # Adjust cluster sizes
    +565        if balance:
    +566            adjustments = []
    +567            overflow_samples = []
    +568            overflow_indices = []
    +569            for i in range(n):
    +570                cluster_size = np.sum(cluster_labels == i)
    +571
    +572                adjustment = samples_per_cluster[i] - cluster_size
    +573                adjustments.append(adjustment)
    +574
+575                if adjustment < 0:  # Cluster is too large
    +576
    +577                    centroid = kmeans.cluster_centers_[i]
    +578                    dataset_indices = np.where(cluster_labels == i)[0]  # Indices of samples in dataset
    +579                    cluster_samples = np.copy(X[dataset_indices])
    +580
    +581                    distances = np.linalg.norm(
    +582                        cluster_samples - centroid,
    +583                        axis=1,
    +584                    )  # Distances of cluster samples to cluster centroid
    +585                    cluster_sample_indices = np.argsort(distances)
    +586                    dataset_indices_sorted = dataset_indices[
    +587                        cluster_sample_indices
    +588                    ]  # Indices of samples sorted by sample distance to cluster centroid
    +589
    +590                    overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:]  # Overflow samples
    +591                    dataset_indices_sorted = dataset_indices_sorted[
    +592                                             samples_per_cluster[i]:
    +593                    ]  # Dataset indices of overflow samples
    +594
    +595                    for i in range(len(overflow_sample_indices)):
    +596                        overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
    +597                        overflow_indices.append(dataset_indices_sorted[i])
    +598
    +599            overflow_samples = np.array(overflow_samples)
    +600            overflow_indices = np.array(overflow_indices)
    +601
    +602            # Making adjustments
    +603            for i in range(n):
    +604
    +605                if adjustments[i] > 0:
    +606                    centroid = kmeans.cluster_centers_[i]
    +607                    distances = np.linalg.norm(overflow_samples - centroid, axis=1)
    +608
    +609                    closest_sample_indices = np.argsort(distances)
    +610
    +611                    overflow_indices_sorted = overflow_indices[closest_sample_indices]
    +612
    +613                    sample_indices_slice = closest_sample_indices[:adjustments[i]]
    +614                    overflow_indices_slice = overflow_indices_sorted[:adjustments[i]]
    +615
    +616                    cluster_labels[overflow_indices_slice] = i
    +617
    +618                    overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0)
    +619                    overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0)
    +620
    +621        return np.array(cluster_labels)
    +622
    +623    def generate_noise(
    +624        self,
    +625        X: ArrayLike,
    +626        y: list[int] | ArrayLike,
    +627        p: float = 0.2,
    +628        type: Literal = 'categorical',
    +629        missing_val: str | int | float = float('-inf'),
    +630    ) -> np.ndarray:
    +631
    +632        """
    +633        Simulates noise on given dataset X
    +634        :param X: dataset to apply noise to
    +635        :param y: required target labels for categorical noise generation
    +636        :param p: amount of noise to apply. Defaults to 0.2
    +637        :param type: type of noise to apply, either categorical or missing
    +638        :param missing_val: value to simulate missing values. Defaults to float('-inf')
    +639        :return: X with noise applied
    +640        """
    +641
    +642        self.dataset_info['noise'].append({
    +643            'type': type,
    +644            'amount': p,
    +645        })
    +646
    +647        if type == 'categorical':
    +648            label_values, label_count = np.unique(y, return_counts=True)
    +649            n_labels = len(label_values)
    +650
    +651            inds = y.argsort()
    +652            y_sort = y[inds]
    +653            X_sort = X[inds]
    +654
    +655            Xs_T = X_sort.T
    +656            n = Xs_T.shape[1]
    +657            n_flip = int(n * p)
    +658
    +659            for feature in Xs_T:
    +660                unique_per_label = {}
    +661
    +662                for i in range(n_labels):
    +663                    if i == 0:
    +664                        unique = np.unique(feature[:label_count[i]])
    +665                        unique_per_label[label_values[i]] = set(unique)
    +666                    else:
+667                        unique = np.unique(feature[np.sum(label_count[:i]):np.sum(label_count[:i]) + label_count[i]])  # cumulative offsets keep slices correct when n_labels > 2
    +668                        unique_per_label[label_values[i]] = set(unique)
    +669
    +670                ixs = np.random.choice(n, n_flip, replace=False)
    +671
    +672                for ix in ixs:
    +673                    current_label = y_sort[ix]
+674                    possible_labels = label_values[label_values != current_label]  # label values (not indices) key unique_per_label
    +675
    +676                    # find all unique values from labels != current label
    +677                    values = set()
    +678                    for key in possible_labels:
    +679                        values = values.union(unique_per_label[key])
    +680
    +681                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
    +682                    # current label
    +683                    for val in unique_per_label[current_label] & values:
    +684                        values.remove(val)
    +685
    +686                    if len(values) > 0:
    +687                        val = np.random.choice(list(values))
    +688
    +689                    else:
    +690                        key = possible_labels[np.random.randint(len(possible_labels))]
    +691                        values = unique_per_label[key]
    +692                        val = np.random.choice(list(values))
    +693
    +694                    feature[ix] = val
    +695
    +696            rev_ind = inds.argsort()
    +697            X_noise = Xs_T.T
    +698            X_noise = X_noise[rev_ind]
    +699
    +700            return X_noise
    +701
    +702        elif type == 'missing':
    +703            X_noise = np.copy(X)
    +704            Xn_T = X_noise.T
    +705            n = Xn_T.shape[1]
    +706            n_missing = int(n * p)
    +707            #print("n to delete:", n_missing)
    +708
    +709            for feature in Xn_T:
    +710                ixs = np.random.choice(n, n_missing, replace=False)
    +711
    +712                for ix in ixs:
    +713                    feature[ix] = missing_val
    +714
    +715            return Xn_T.T
    +716
    +717    def downsample_dataset(
    +718        self,
    +719        X: ArrayLike,
    +720        y: list[int] | ArrayLike,
    +721        N: int | None = None,
    +722        seed: int = 42,
    +723        reshuffle: bool = False,
    +724    ) -> tuple[np.ndarray, np.ndarray]:
    +725
    +726        """
    +727        Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
    +728        :param X: Dataset to downsample
    +729        :param y: Labels corresponding to X
    +730        :param N: Optional number of samples per class to downsample to
    +731        :param seed: Seed for random state of resample function
    +732        :param reshuffle: Reshuffle the dataset after downsampling
    +733        :return: Balanced X and y after downsampling
    +734        """
    +735
    +736        original_shape = X.shape
    +737
    +738        values, counts = np.unique(y, return_counts=True)
    +739        if N is None:
    +740            N = min(counts)
    +741
    +742        if N > min(counts):
    +743            raise ValueError('N must be equal to or less than the number of samples in minority class')
    +744
    +745        X_arrays_list = []
    +746        y_downsampled = []
    +747        for label in values:
    +748            X_label = [X[i] for i in range(len(y)) if y[i] == label]
    +749            X_label_downsample = resample(
    +750                X_label,
    +751                replace=True,
    +752                n_samples=N,
    +753                random_state=seed,
    +754            )
    +755            X_arrays_list.append(X_label_downsample)
    +756            ys = [label] * N
    +757            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
    +758
    +759        X_downsampled = np.concatenate(X_arrays_list, axis=0)
    +760
    +761        if reshuffle:
    +762            indices = np.arange(len(X_downsampled))
    +763            np.random.shuffle(indices)
    +764            X_downsampled = X_downsampled[indices]
    +765            y_downsampled = y_downsampled[indices]
    +766
    +767        downsampled_shape = X_downsampled.shape
    +768
    +769        self.dataset_info.update({
    +770            'downsampling': {
    +771                'original_shape': original_shape,
    +772                'downsampled_shape': downsampled_shape,
    +773            },
    +774        })
    +775
    +776        return X_downsampled, y_downsampled
    +777
    +778    def print_dataset(
    +779        self,
    +780        X: ArrayLike,
    +781        y: ArrayLike,
    +782    ):
    +783        """
    +784        Prints given dataset
    +785        :param X: dataset
    +786        :param y: labels
    +787        :return:
    +788        """
    +789
    +790        n_samples, n_features = X.shape
    +791        n = 0
    +792        for arr in X:
    +793            print('[', end='')
    +794            for i in range(n_features):
    +795                if i == n_features - 1:
    +796                    print(arr[i], end='')
    +797                else:
    +798                    print(arr[i], end=', ')
    +799            print(f'], Label: {y[n]}')
    +800            n += 1
    +801
    +802    def summarize(self):
    +803
    +804        print(f"Number of features: {self.dataset_info['general']['n_features']}")
    +805        print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
+806        if 'downsampling' in self.dataset_info:
    +807            print(
    +808                f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
    +809            )
    +810        print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
    +811        print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
    +812
    +813        print('-------------------------------------')
    +814
    +815        if len(self.dataset_info['combinations']) > 0:
    +816            print('Combinations:')
    +817            for comb in self.dataset_info['combinations']:
    +818                print(
+819                    f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, resulting in new feature at index {comb['combination_ix']}",
    +820                )
    +821            print('-------------------------------------')
    +822
    +823        if len(self.dataset_info['correlations']) > 0:
    +824            print('Correlations:')
    +825            for corr in self.dataset_info['correlations']:
    +826                print(
    +827                    f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
    +828                )
    +829            print('-------------------------------------')
    +830
    +831        if len(self.dataset_info['duplicates']) > 0:
    +832            print('Duplicates:')
    +833            for dup in self.dataset_info['duplicates']:
    +834                print(
    +835                    f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
    +836                )
    +837            print('-------------------------------------')
    +838
    +839        if len(self.dataset_info['noise']) > 0:
    +840            print('Simulated noise:')
    +841            for noise in self.dataset_info['noise']:
+842                print(f"Simulated {noise['type']} noise, amount of {noise['amount']}")
    +843            print('-------------------------------------')
    +844
    +845        print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
    +
    + + +
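
To tie the generator together, a minimal end-to-end sketch of the API documented above; the parameter values are illustrative, and the structure entries follow the (feature index, attributes) convention from generate_data:

    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()

    # Feature 2 gets cardinality 10; features 4 and 5 draw from {1, 2, 3} with the given frequencies
    structure = [(2, 10), ([4, 5], [[1, 2, 3], [0.6, 0.3, 0.1]])]
    X = cc.generate_data(n_features=8, n_samples=1000, cardinality=5, structure=structure, seed=42)

    X = cc.generate_combinations(X, [0, 1], combination_type='linear')
    X = cc.generate_correlated(X, [0], r=0.8)
    y = cc.generate_labels(X, n=2, class_relation='linear')
    X_noisy = cc.generate_noise(X, y, p=0.2, type='categorical')
    cc.summarize()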
+    class CategoricalClassification:
     18class CategoricalClassification:
    + 19
    + 20    def __init__(self):
    + 21        self.dataset_info = {
    + 22            'general': {},
    + 23            'combinations': [],
    + 24            'correlations': [],
    + 25            'duplicates': [],
    + 26            'labels': [],
    + 27            'noise': [],
    + 28        }
    + 29
    + 30    def __repr__(self):
    + 31        return f"CategoricalClassification(dataset_info={self.dataset_info})"
    + 32
    + 33    def generate_data(
    + 34        self,
    + 35        n_features: int,
    + 36        n_samples: int,
    + 37        cardinality: int = 5,
    + 38        structure: list | ArrayLike | None = None,
    + 39        ensure_rep: bool = False,
    + 40        random_values: bool | None = False,
    + 41        low: int | None = 0,
    + 42        high: int | None = 1000,
    + 43        seed: int = 42,
    + 44    ) -> np.ndarray:
    + 45
    + 46        """
    + 47        Generates dataset based on given parameters
    + 48        :param n_features: number of generated features
    + 49        :param n_samples: number of generated samples
    + 50        :param cardinality: default cardinality of the dataset
    + 51        :param structure: structure of the dataset
    + 52        :param ensure_rep: flag, ensures all given values represented
+ 53        :param random_values: flag, enables random (integer) feature values from the interval [low, high]
+ 54        :param low: sets lower bound of random feature values
+ 55        :param high: sets upper bound of random feature values
    + 56        :param seed: sets seed of numpy random
    + 57        :return: X, 2D dataset
    + 58        """
    + 59
    + 60        self.dataset_info.update({
    + 61            'general': {
    + 62                'n_features': n_features,
    + 63                'n_samples': n_samples,
    + 64                'cardinality': cardinality,
    + 65                'structure': structure,
    + 66                'ensure_rep': ensure_rep,
    + 67                'seed': seed,
    + 68            },
    + 69        })
    + 70
    + 71        np.random.seed(seed)
    + 72        X = np.empty([n_features, n_samples])
    + 73
    + 74        if structure is None:
    + 75            # No specific structure parameter passed
    + 76            for i in range(n_features):
    + 77                x = self._generate_feature(
    + 78                    n_samples,
    + 79                    cardinality=cardinality,
    + 80                    ensure_rep=ensure_rep,
    + 81                    random_values=random_values,
    + 82                    low=low,
    + 83                    high=high,
    + 84                )
    + 85                X[i] = x
    + 86        else:
    + 87            # Structure parameter passed, building based on structure
    + 88            ix = 0
    + 89            for data in structure:
    + 90                if not isinstance(data[0], (list, np.ndarray)):
    + 91                    # Data in structure is a tuple of (feature index (integer), feature attributes)
    + 92                    feature_ix, feature_attributes = data
    + 93
    + 94                    if ix < feature_ix:
    + 95                        # Filling out the dataset up to column index feature_ix
    + 96                        for i in range(ix, feature_ix):
    + 97                            x = self._generate_feature(
    + 98                                n_samples,
    + 99                                cardinality=cardinality,
    +100                                ensure_rep=ensure_rep,
    +101                                random_values=random_values,
    +102                                low=low,
    +103                                high=high,
    +104                            )
    +105                            X[ix] = x
    +106                            ix += 1
    +107
    +108                    x = self._configure_generate_feature(
    +109                        feature_attributes,
    +110                        n_samples,
    +111                        ensure_rep=ensure_rep,
    +112                        random_values=random_values,
    +113                        low=low,
    +114                        high=high,
    +115                    )
    +116                    X[ix] = x
    +117                    ix += 1
    +118
    +119                else:
    +120                    # Data in structure is a tuple of (list of feature indexes, feature attributes)
    +121                    feature_ixs, feature_attributes = data
    +122
    +123                    for feature_ix in feature_ixs:
    +124                        # Filling out the dataset up to feature_ix
    +125                        if ix < feature_ix:
    +126                            for i in range(ix, feature_ix):
    +127                                x = self._generate_feature(
    +128                                    n_samples,
    +129                                    cardinality=cardinality,
    +130                                    ensure_rep=ensure_rep,
    +131                                    random_values=random_values,
    +132                                    low=low,
    +133                                    high=high,
    +134                                )
    +135                                X[ix] = x
    +136                                ix += 1
    +137
    +138                        x = self._configure_generate_feature(
    +139                            feature_attributes,
    +140                            n_samples,
    +141                            ensure_rep=ensure_rep,
    +142                            random_values=random_values,
    +143                            low=low,
    +144                            high=high,
    +145                        )
    +146
    +147                        X[ix] = x
    +148                        ix += 1
    +149
    +150            if ix < n_features:
    +151                # Fill out the rest of the dataset
    +152                for i in range(ix, n_features):
    +153                    x = self._generate_feature(
    +154                        n_samples,
    +155                        cardinality=cardinality,
    +156                        ensure_rep=ensure_rep,
    +157                        random_values=random_values,
    +158                        low=low,
    +159                        high=high,
    +160                    )
    +161                    X[i] = x
    +162
    +163        return X.T
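For reference, a minimal usage sketch of the structure parameter (values are illustrative, not part of the patch): each entry pairs a feature index, or a list of indexes, with either a cardinality, a value domain, or a [value_domain, value_frequencies] pair; features not listed fall back to the default cardinality.

    cc = CategoricalClassification()
    structure = [
        (0, 3),                                      # feature 0: cardinality 3
        ([2, 3], [[10, 20, 30], [0.6, 0.3, 0.1]]),   # features 2 and 3: weighted value domain
    ]
    X = cc.generate_data(n_features=5, n_samples=100, structure=structure, seed=42)  # shape (100, 5)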
    +164
    +165    def _configure_generate_feature(
    +166        self,
    +167        feature_attributes: int | list | ArrayLike,
    +168        n_samples: int,
    +169        ensure_rep: bool = False,
    +170        random_values: bool | None = False,
    +171        low: int | None = 0,
    +172        high: int | None = 1000,
    +173    ) -> np.ndarray:
    +174
    +175        """
    +176        Helper function, calls _generate_feature with appropriate parameters based on feature_attributes
    +177        :param feature_attributes: either integer (cardinality) or list of feature attributes
    +178        :param n_samples: number of samples in dataset
    +179        :param ensure_rep: ensures all values are represented at least once in the feature vector
    +180        :param random_values: randomly picked values for vec if true; otherwise values range over [low, low + cardinality) with step 1
    +181        :param low: lower bound of random feature vector values
    +182        :param high: upper bound of random feature vector values
    +183        :return: feature vector
    +184        """
    +185
    +186        if not isinstance(feature_attributes, (list, np.ndarray)):
    +187            # feature_cardinality is just an integer, generate feature either with random values or
    +188            # [low, low+cardinality]
    +189            x = self._generate_feature(
    +190                n_samples,
    +191                cardinality=feature_attributes,
    +192                ensure_rep=ensure_rep,
    +193                random_values=random_values,
    +194                low=low,
    +195                high=high,
    +196            )
    +197        else:
    +198            # feature_cardinality is a list of [value_domain, value_frequencies]
    +199            if isinstance(feature_attributes[0], (list, np.ndarray)):
    +200                value_domain, value_frequencies = feature_attributes
    +201                x = self._generate_feature(
    +202                    n_samples,
    +203                    vec=value_domain,
    +204                    ensure_rep=ensure_rep,
    +205                    p=value_frequencies,
    +206                )
    +207            else:
    +208                # feature_cardinality is value_domain (list of values for feature)
    +209                value_domain = feature_attributes
    +210                x = self._generate_feature(
    +211                    n_samples,
    +212                    vec=value_domain,
    +213                    ensure_rep=ensure_rep,
    +214                )
    +215
    +216        return x
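The helper accepts three shapes of feature_attributes; a sketch of equivalent direct calls (illustrative only, using the private helper for clarity):

    x1 = cc._configure_generate_feature(4, 100)                              # plain cardinality
    x2 = cc._configure_generate_feature([1, 3, 5], 100)                      # explicit value domain
    x3 = cc._configure_generate_feature([[1, 3, 5], [0.5, 0.3, 0.2]], 100)   # domain plus frequencies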
    +217
    +218    def _generate_feature(
    +219        self,
    +220        size: int,
    +221        vec: list[int] | ArrayLike | None = None,
    +222        cardinality: int = 5,
    +223        ensure_rep: bool = False,
    +224        random_values: bool | None = False,
    +225        low: int | None = 0,
    +226        high: int | None = 1000,
    +227        p: list[float] | np.ndarray | None = None,
    +228    ) -> np.ndarray:
    +229        """
    +230        Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value.
    +231        :param vec: list of feature values
    +232        :param cardinality: single value cardinality
    +233        :param size: length of feature vector
    +234        :param ensure_rep: ensures all values are represented at least once in the feature vector
    +235        :param random_values: randomly picked values for vec if true; otherwise values range over [low, low + cardinality) with step 1
    +236        :param low: lower bound of random feature vector values
    +237        :param high: upper bound of random feature vector values
    +238        :param p: list of probabilities of each value
    +239        :return: feature vector x
    +240        """
    +241
    +242        if vec is None:
    +243            if random_values:
    +244                vec = np.random.choice(range(low, high + 1), cardinality, replace=False)
    +245            else:
    +246                vec = np.arange(low, low + cardinality, 1)
    +247        else:
    +248            vec = np.array(vec)
    +249
    +250        if p is None:
    +251            v_shift = vec - vec[np.random.randint(len(vec))]
    +252            p = norm.pdf(v_shift, scale=3)
    +253        else:
    +254            p = np.array(p)
    +255
    +256        p = p / p.sum()
    +257
    +258        if ensure_rep and len(vec) < size:
    +259            sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p)
    +260            sampled_values = np.append(sampled_values, vec)
    +261        else:
    +262            sampled_values = np.random.choice(vec, size=size, p=p)
    +263
    +264        np.random.shuffle(sampled_values)
    +265        return sampled_values
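A short sketch of the sampling behaviour (assumed call sites, for illustration): with no p given, values are drawn with an approximately normal density centred on a random member of vec; with ensure_rep, every value of vec appears at least once.

    x_default = cc._generate_feature(1000, cardinality=5)                                      # values 0..4
    x_skewed = cc._generate_feature(1000, vec=[2, 4, 6], p=[0.8, 0.1, 0.1], ensure_rep=True)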
    +266
    +267    def generate_combinations(
    +268        self,
    +269        X: ArrayLike,
    +270        feature_indices: list[int] | ArrayLike,
    +271        combination_function: Optional = None,
    +272        combination_type: Literal['linear', 'nonlinear'] = 'linear',
    +273    ) -> np.ndarray:
    +274        """
    +275        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
    +276        :param X: dataset
    +277        :param feature_indices: indexes of features to be in combination
    +278        :param combination_function: optional custom function for combining feature vectors
    +279        :param combination_type: string flag, either linear or nonlinear, defining the combination type
    +280        :return: X with added resultant feature
    +281        """
    +282
    +283        selected_features = X[:, feature_indices]
    +284
    +285        if combination_function is None:
    +286            if combination_type == 'linear':
    +287                combination_function = lambda x: np.sum(x, axis=1)
    +288            elif combination_type == 'nonlinear':
    +289                combination_function = lambda x: np.sin(np.sum(x, axis=1))
    +290        else:
    +291            combination_type = str(combination_function.__name__)
    +292
    +293        combination_result = combination_function(selected_features)
    +294
    +295        combination_ix = len(X[0])
    +296
    +297        self.dataset_info['combinations'].append({
    +298            'feature_indices': feature_indices,
    +299            'combination_type': combination_type,
    +300            'combination_ix': combination_ix,
    +301        })
    +302
    +303        return np.column_stack((X, combination_result))
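Besides the built-in linear and nonlinear modes, any callable that maps the selected (n_samples, k) block to a vector can be passed; a hypothetical example (weighted_sum is illustrative, not part of the patch):

    X = cc.generate_combinations(X, [0, 1], combination_type='nonlinear')

    def weighted_sum(x):                               # illustrative custom combination
        return np.dot(x, np.arange(1, x.shape[1] + 1))

    X = cc.generate_combinations(X, [0, 1, 2], combination_function=weighted_sum)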
    +304
    +305    def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +306        """
    +307        Performs a bitwise XOR operation across two or more integer arrays
    +308        :param arr: features to perform XOR operation on
    +309        :return: bitwise XOR result
    +310        """
    +311        arrT = arr.T
    +312        arrT = arrT.astype(int)
    +313        out = np.bitwise_xor(arrT[0], arrT[1])
    +314        if len(arrT) > 2:
    +315            for i in range(2, len(arrT)):
    +316                out = np.bitwise_xor(out, arrT[i])
    +317
    +318        return out.T
    +319
    +320    def _and(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +321        """
    +322        Performs a bitwise AND operation across two or more integer arrays
    +323        :param arr: features to perform AND operation on
    +324        :return: bitwise AND result
    +325        """
    +326        arrT = arr.T
    +327        arrT = arrT.astype(int)
    +328        out = np.bitwise_and(arrT[0], arrT[1])
    +329        if len(arrT) > 2:
    +330            for i in range(2, len(arrT)):
    +331                out = np.bitwise_and(out, arrT[i])
    +332
    +333        return out.T
    +334
    +335    def _or(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +336        """
    +337        Performs a bitwise OR operation across two or more integer arrays
    +338        :param arr: features to perform OR operation on
    +339        :return: bitwise OR result
    +340        """
    +341        arrT = arr.T
    +342        arrT = arrT.astype(int)
    +343        out = np.bitwise_or(arrT[0], arrT[1])
    +344        if len(arrT) > 2:
    +345            for i in range(2, len(arrT)):
    +346                out = np.bitwise_or(out, arrT[i])
    +347
    +348        return out.T
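Because these helpers take the selected feature block and transpose it internally, they slot straight into generate_combinations as custom combination functions; a sketch on binary features (illustrative parameters):

    X_bin = cc.generate_data(n_features=4, n_samples=100, cardinality=2)
    X_bin = cc.generate_combinations(X_bin, [0, 1, 2], combination_function=cc._xor)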
    +349
    +350    def generate_correlated(
    +351        self,
    +352        X: ArrayLike,
    +353        feature_indices: list[int] | ArrayLike,
    +354        r: float = 0.8,
    +355    ) -> np.ndarray:
    +356
    +357        """
    +358        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
    +359        :param X: dataset
    +360        :param feature_indices: indices of features to generate correlated feature to
    +361        :param r: (Pearson) correlation factor
    +362        :return: X with generated correlated features
    +363        """
    +364
    +365        if not isinstance(feature_indices, (list, np.ndarray)):
    +366            feature_indices = np.array([feature_indices])
    +367
    +368        if len(feature_indices) > 1:
    +369            correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
    +370        else:
    +371            correlated_ixs = len(X[0])
    +372
    +373        selected_features = X[:, feature_indices]
    +374        transposed = np.transpose(selected_features)
    +375        correlated_features = []
    +376
    +377        for t in transposed:
    +378            theta = np.arccos(r)
    +379            t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
    +380
    +381            rand = np.random.normal(0, 1, len(t_standard))
    +382            rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
    +383
    +384            M = np.column_stack((t_standard, rand))
    +385            M_centred = (M - np.mean(M, axis=0))
    +386
    +387            Id = np.eye(len(t))
    +388            Q = qr(M_centred[:, [0]], mode='economic')[0]
    +389            P = np.dot(Q, Q.T)
    +390            orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
    +391            M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
    +392
    +393            Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
    +394            corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
    +395
    +396            correlated_features.append(corr)
    +397
    +398        correlated_features = np.transpose(correlated_features)
    +399
    +400        self.dataset_info['correlations'].append({
    +401            'feature_indices': feature_indices,
    +402            'correlated_indices': correlated_ixs,
    +403            'correlation_factor': r,
    +404        })
    +405
    +406        return np.column_stack((X, correlated_features))
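A hedged usage sketch: the new columns are appended at the end of X, so the feature correlated with column 0 lands at index -2 here (illustrative check, not part of the patch):

    X = cc.generate_correlated(X, [0, 1], r=0.9)
    rho = np.corrcoef(X[:, 0], X[:, -2])[0, 1]   # expected to be close to 0.9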
    +407
    +408    def generate_duplicates(
    +409        self,
    +410        X: ArrayLike,
    +411        feature_indices: list[int] | ArrayLike,
    +412    ) -> np.ndarray:
    +413        """
    +414        Generates duplicate features
    +415        :param X: dataset
    +416        :param feature_indices: indices of features to duplicate
    +417        :return: dataset with duplicated features
    +418        """
    +419        if not isinstance(feature_indices, (list, np.ndarray)):
    +420            feature_indices = np.array([feature_indices])
    +421
    +422        duplicated_ixs = np.arange(len(X[0]), len(X[0]) + len(feature_indices), 1)
    +423
    +424        selected_features = X[:, feature_indices]
    +425
    +426        self.dataset_info['duplicates'].append({
    +427            'feature_indices': feature_indices,
    +428            'duplicate_indices': duplicated_ixs,
    +429        })
    +430
    +431        return np.column_stack((X, selected_features))
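Usage is a one-liner; the copies are appended as the last columns (illustrative):

    X = cc.generate_duplicates(X, [0, 3])   # appends copies of columns 0 and 3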
    +432
    +433    def generate_labels(
    +434        self,
    +435        X: ArrayLike,
    +436        n: int = 2,
    +437        p: float | list[float] | ArrayLike = 0.5,
    +438        k: int | float = 2,
    +439        decision_function: Optional = None,
    +440        class_relation: str = 'linear',
    +441        balance: bool = False,
    +442    ):
    +443        """
    +444        Generates labels for dataset X
    +445        :param X: dataset
    +446        :param n: number of class labels
    +447        :param p: class distribution
    +448        :param k: scaling constant used in the nonlinear decision function
    +449        :param decision_function: optional user-defined decision function
    +450        :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    +451        :param balance: boolean, whether to balance clustering class labels
    +452        :return: array of labels, corresponding to dataset X
    +453        """
    +454
    +455        if isinstance(p, (list, np.ndarray)):
    +456            if sum(p) > 1: raise ValueError('sum of values in p must not exceed 1.0')
    +457            if len(p) != n: raise ValueError('length of p must equal n')
    +458        elif p > 1:
    +459            raise ValueError('p must not exceed 1.0')
    +460
    +461        n_samples, n_features = X.shape
    +462
    +463        if decision_function is None:
    +464            if class_relation == 'linear':
    +465                decision_function = lambda x: np.sum(2 * x + 3, axis=1)
    +466            elif class_relation == 'nonlinear':
    +467                decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
    +468            elif class_relation == 'cluster':
    +469                decision_function = None
    +470        else:
    +471            class_relation = str(decision_function.__name__)
    +472
    +473        y = []
    +474        if decision_function is not None:
    +475            if n > 2:
    +476                if not isinstance(p, (list, np.ndarray)):
    +477                    p = 1 / n
    +478                    percentiles = [p * 100]
    +479                    for i in range(1, n - 1):
    +480                        percentiles.append(percentiles[i - 1] + (p * 100))
    +481
    +482                    decision_boundary = decision_function(X)
    +483                    p_points = np.percentile(decision_boundary, percentiles)
    +484
    +485                    y = np.zeros_like(decision_boundary, dtype=int)
    +486                    for p_point in p_points:
    +487                        y += (decision_boundary > p_point)
    +488                else:
    +489                    decision_boundary = decision_function(X)
    +490                    percentiles = [x * 100 for x in p]
    +491
    +492                    for i in range(1, len(percentiles) - 1):
    +493                        percentiles[i] += percentiles[i - 1]
    +494
    +495                    percentiles.insert(0, 0)
    +496                    percentiles.pop()
    +498
    +499                    p_points = np.percentile(decision_boundary, percentiles)
    +501
    +502                    y = np.zeros_like(decision_boundary, dtype=int)
    +503                    for i in range(1, n):
    +504                        p_point = p_points[i]
    +505                        for j in range(len(decision_boundary)):
    +506                            if decision_boundary[j] > p_point:
    +507                                y[j] += 1
    +508            else:
    +509                decision_boundary = decision_function(X)
    +510                p_point = np.percentile(decision_boundary, p * 100)
    +511                y = np.where(decision_boundary > p_point, 1, 0)
    +512        else:
    +513            if not isinstance(p, (list, np.ndarray)):
    +514                # scalar p: 0.5 keeps clusters balanced, otherwise split as [p, 1 - p]
    +515                p = 1.0 if p == 0.5 else [p, 1 - p]
    +517            y = self._cluster_data(X, n, p=p, balance=balance)
    +518
    +519        self.dataset_info.update({
    +520            'labels': {
    +521                'class_relation': class_relation,
    +522                'n_class': n,
    +523            },
    +524        })
    +525
    +526        return y
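Two illustrative calls (parameter values are assumptions, not from the patch): percentile thresholds of the decision function split the samples into n classes, and a scalar p skews a binary split.

    y3 = cc.generate_labels(X, n=3, class_relation='nonlinear')      # three roughly equal classes
    y2 = cc.generate_labels(X, n=2, p=0.3, class_relation='linear')  # threshold at the 30th percentile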
    +527
    +528    def _cluster_data(
    +529        self,
    +530        X: ArrayLike,
    +531        n: int,
    +532        p: float | list[float] | ArrayLike | None = 1.0,
    +533        balance: bool = False,
    +534    ) -> np.ndarray:
    +535        """
    +536        Cluster data using kmeans
    +537        :param X: dataset
    +538        :param n: number of clusters
    +539        :param p: class distribution
    +540        :param balance: balance the clusters according to p
    +541        :return: array of labels, corresponding to dataset X
    +542        """
    +543
    +544        kmeans = KMeans(n_clusters=n)
    +545
    +546        kmeans.fit(X)
    +547
    +548        cluster_labels = kmeans.labels_
    +549
    +550        if not isinstance(p, (list, np.ndarray)):  # Fully balanced clusters
    +551            samples_per_cluster = [len(X) // n] * n
    +552        elif len(p) == n:
    +553            samples = len(X)
    +554            samples_per_cluster = [int(samples * val) for val in p]
    +555        else:
    +556            raise ValueError('Length of balance parameter p must equal the number of clusters.')
    +564
    +565        # Adjust cluster sizes
    +566        if balance:
    +567            adjustments = []
    +568            overflow_samples = []
    +569            overflow_indices = []
    +570            for i in range(n):
    +571                cluster_size = np.sum(cluster_labels == i)
    +572
    +573                adjustment = samples_per_cluster[i] - cluster_size
    +574                adjustments.append(adjustment)
    +575
    +576                if adjustment < 0:  # Cluster is too large
    +577
    +578                    centroid = kmeans.cluster_centers_[i]
    +579                    dataset_indices = np.where(cluster_labels == i)[0]  # Indices of samples in dataset
    +580                    cluster_samples = np.copy(X[dataset_indices])
    +581
    +582                    distances = np.linalg.norm(
    +583                        cluster_samples - centroid,
    +584                        axis=1,
    +585                    )  # Distances of cluster samples to cluster centroid
    +586                    cluster_sample_indices = np.argsort(distances)
    +587                    dataset_indices_sorted = dataset_indices[
    +588                        cluster_sample_indices
    +589                    ]  # Indices of samples sorted by sample distance to cluster centroid
    +590
    +591                    overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:]  # Overflow samples
    +592                    dataset_indices_sorted = dataset_indices_sorted[
    +593                                             samples_per_cluster[i]:
    +594                    ]  # Dataset indices of overflow samples
    +595
    +596                    for j in range(len(overflow_sample_indices)):
    +597                        overflow_samples.append(cluster_samples[overflow_sample_indices[j]])
    +598                        overflow_indices.append(dataset_indices_sorted[j])
    +599
    +600            overflow_samples = np.array(overflow_samples)
    +601            overflow_indices = np.array(overflow_indices)
    +602
    +603            # Making adjustments
    +604            for i in range(n):
    +605
    +606                if adjustments[i] > 0:
    +607                    centroid = kmeans.cluster_centers_[i]
    +608                    distances = np.linalg.norm(overflow_samples - centroid, axis=1)
    +609
    +610                    closest_sample_indices = np.argsort(distances)
    +611
    +612                    overflow_indices_sorted = overflow_indices[closest_sample_indices]
    +613
    +614                    sample_indices_slice = closest_sample_indices[:adjustments[i]]
    +615                    overflow_indices_slice = overflow_indices_sorted[:adjustments[i]]
    +616
    +617                    cluster_labels[overflow_indices_slice] = i
    +618
    +619                    overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0)
    +620                    overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0)
    +621
    +622        return np.array(cluster_labels)
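A hedged sketch of a balanced clustering call, with p giving the target share of samples per cluster (illustrative values, calling the private helper directly):

    y = cc._cluster_data(X, 3, p=[0.5, 0.3, 0.2], balance=True)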
    +623
    +624    def generate_noise(
    +625        self,
    +626        X: ArrayLike,
    +627        y: list[int] | ArrayLike,
    +628        p: float = 0.2,
    +629        type: Literal['categorical', 'missing'] = 'categorical',
    +630        missing_val: str | int | float = float('-inf'),
    +631    ) -> np.ndarray:
    +632
    +633        """
    +634        Simulates noise on given dataset X
    +635        :param X: dataset to apply noise to
    +636        :param y: required target labels for categorical noise generation
    +637        :param p: amount of noise to apply. Defaults to 0.2
    +638        :param type: type of noise to apply, either categorical or missing
    +639        :param missing_val: value to simulate missing values. Defaults to float('-inf')
    +640        :return: X with noise applied
    +641        """
    +642
    +643        self.dataset_info['noise'].append({
    +644            'type': type,
    +645            'amount': p,
    +646        })
    +647
    +648        if type == 'categorical':
    +649            label_values, label_count = np.unique(y, return_counts=True)
    +650            n_labels = len(label_values)
    +651
    +652            inds = y.argsort()
    +653            y_sort = y[inds]
    +654            X_sort = X[inds]
    +655
    +656            Xs_T = X_sort.T
    +657            n = Xs_T.shape[1]
    +658            n_flip = int(n * p)
    +659
    +660            for feature in Xs_T:
    +661                unique_per_label = {}
    +662                # cumulative offsets into the label-sorted feature give each class's slice
    +663                label_offsets = np.concatenate(([0], np.cumsum(label_count)))
    +664                for i in range(n_labels):
    +665                    unique = np.unique(feature[label_offsets[i]:label_offsets[i + 1]])
    +666                    unique_per_label[label_values[i]] = set(unique)
    +670
    +671                ixs = np.random.choice(n, n_flip, replace=False)
    +672
    +673                for ix in ixs:
    +674                    current_label = y_sort[ix]
    +675                    possible_labels = label_values[label_values != current_label]
    +676
    +677                    # find all unique values from labels != current label
    +678                    values = set()
    +679                    for key in possible_labels:
    +680                        values = values.union(unique_per_label[key])
    +681
    +682                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
    +683                    # current label
    +684                    for val in unique_per_label[current_label] & values:
    +685                        values.remove(val)
    +686
    +687                    if len(values) > 0:
    +688                        val = np.random.choice(list(values))
    +689
    +690                    else:
    +691                        key = possible_labels[np.random.randint(len(possible_labels))]
    +692                        values = unique_per_label[key]
    +693                        val = np.random.choice(list(values))
    +694
    +695                    feature[ix] = val
    +696
    +697            rev_ind = inds.argsort()
    +698            X_noise = Xs_T.T
    +699            X_noise = X_noise[rev_ind]
    +700
    +701            return X_noise
    +702
    +703        elif type == 'missing':
    +704            X_noise = np.copy(X)
    +705            Xn_T = X_noise.T
    +706            n = Xn_T.shape[1]
    +707            n_missing = int(n * p)
    +709
    +710            for feature in Xn_T:
    +711                ixs = np.random.choice(n, n_missing, replace=False)
    +712
    +713                for ix in ixs:
    +714                    feature[ix] = missing_val
    +715
    +716            return Xn_T.T
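Two illustrative calls (parameters are assumptions): categorical noise needs the labels to pick replacement values from other classes, while missing noise only masks entries.

    X_flip = cc.generate_noise(X, y, p=0.1, type='categorical')
    X_miss = cc.generate_noise(X, y, p=0.1, type='missing', missing_val=np.nan)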
    +717
    +718    def downsample_dataset(
    +719        self,
    +720        X: ArrayLike,
    +721        y: list[int] | ArrayLike,
    +722        N: int | None = None,
    +723        seed: int = 42,
    +724        reshuffle: bool = False,
    +725    ) -> tuple[np.ndarray, np.ndarray]:
    +726
    +727        """
    +728        Downsamples dataset X according to N or the number of samples in the minority class, resulting in a balanced dataset.
    +729        :param X: Dataset to downsample
    +730        :param y: Labels corresponding to X
    +731        :param N: Optional number of samples per class to downsample to
    +732        :param seed: Seed for random state of resample function
    +733        :param reshuffle: Reshuffle the dataset after downsampling
    +734        :return: Balanced X and y after downsampling
    +735        """
    +736
    +737        original_shape = X.shape
    +738
    +739        values, counts = np.unique(y, return_counts=True)
    +740        if N is None:
    +741            N = min(counts)
    +742
    +743        if N > min(counts):
    +744            raise ValueError('N must be less than or equal to the number of samples in the minority class')
    +745
    +746        X_arrays_list = []
    +747        y_downsampled = []
    +748        for label in values:
    +749            X_label = [X[i] for i in range(len(y)) if y[i] == label]
    +750            X_label_downsample = resample(
    +751                X_label,
    +752                replace=False,  # sample without replacement when downsampling
    +753                n_samples=N,
    +754                random_state=seed,
    +755            )
    +756            X_arrays_list.append(X_label_downsample)
    +757            ys = [label] * N
    +758            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
    +759
    +760        X_downsampled = np.concatenate(X_arrays_list, axis=0)
    +761
    +762        if reshuffle:
    +763            indices = np.arange(len(X_downsampled))
    +764            np.random.shuffle(indices)
    +765            X_downsampled = X_downsampled[indices]
    +766            y_downsampled = y_downsampled[indices]
    +767
    +768        downsampled_shape = X_downsampled.shape
    +769
    +770        self.dataset_info.update({
    +771            'downsampling': {
    +772                'original_shape': original_shape,
    +773                'downsampled_shape': downsampled_shape,
    +774            },
    +775        })
    +776
    +777        return X_downsampled, y_downsampled
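A short usage sketch (illustrative): with N omitted, every class is cut down to the minority-class size.

    X_bal, y_bal = cc.downsample_dataset(X, y, reshuffle=True)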
    +778
    +779    def print_dataset(
    +780        self,
    +781        X: ArrayLike,
    +782        y: ArrayLike,
    +783    ):
    +784        """
    +785        Prints given dataset
    +786        :param X: dataset
    +787        :param y: labels
    +788        :return:
    +789        """
    +790
    +791        n_samples, n_features = X.shape
    +792        n = 0
    +793        for arr in X:
    +794            print('[', end='')
    +795            for i in range(n_features):
    +796                if i == n_features - 1:
    +797                    print(arr[i], end='')
    +798                else:
    +799                    print(arr[i], end=', ')
    +800            print(f'], Label: {y[n]}')
    +801            n += 1
    +802
    +803    def summarize(self):
    +804
    +805        print(f"Number of features: {self.dataset_info['general']['n_features']}")
    +806        print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
    +807        if 'downsampling' in self.dataset_info:
    +808            print(
    +809                f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
    +810            )
    +811        print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
    +812        print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
    +813
    +814        print('-------------------------------------')
    +815
    +816        if len(self.dataset_info['combinations']) > 0:
    +817            print('Combinations:')
    +818            for comb in self.dataset_info['combinations']:
    +819                print(
    +820                    f"Features {comb['feature_indices']} are in a {comb['combination_type']} combination, resulting in feature index {comb['combination_ix']}",
    +821                )
    +822            print('-------------------------------------')
    +823
    +824        if len(self.dataset_info['correlations']) > 0:
    +825            print('Correlations:')
    +826            for corr in self.dataset_info['correlations']:
    +827                print(
    +828                    f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
    +829                )
    +830            print('-------------------------------------')
    +831
    +832        if len(self.dataset_info['duplicates']) > 0:
    +833            print('Duplicates:')
    +834            for dup in self.dataset_info['duplicates']:
    +835                print(
    +836                    f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
    +837                )
    +838            print('-------------------------------------')
    +839
    +840        if len(self.dataset_info['noise']) > 0:
    +841            print('Simulated noise:')
    +842            for noise in self.dataset_info['noise']:
    +843                print(f"Simulated {noise['type']} noise, amount of {noise['amount']}")
    +844            print('-------------------------------------')
    +845
    +846        print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
    +
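End to end, the generators are meant to compose; a hedged pipeline sketch with illustrative parameters (the import path follows this patch's module layout):

    import numpy as np
    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()
    X = cc.generate_data(n_features=8, n_samples=1000, cardinality=4, seed=42)
    X = cc.generate_combinations(X, [0, 1], combination_type='linear')
    X = cc.generate_correlated(X, [2], r=0.85)
    X = cc.generate_duplicates(X, [3])
    y = cc.generate_labels(X, n=2, class_relation='nonlinear')
    X = cc.generate_noise(X, y, p=0.05, type='missing')
    cc.summarize()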
    + + + + +
    +
    + dataset_info + + +
    + + + + +
    +
    + +
    + + def + generate_data( self, n_features: int, n_samples: int, cardinality: int = 5, structure: Union[list, numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]], NoneType] = None, ensure_rep: bool = False, random_values: bool | None = False, low: int | None = 0, high: int | None = 1000, seed: int = 42) -> numpy.ndarray: + + + +
    + +
     33    def generate_data(
    + 34        self,
    + 35        n_features: int,
    + 36        n_samples: int,
    + 37        cardinality: int = 5,
    + 38        structure: list | ArrayLike | None = None,
    + 39        ensure_rep: bool = False,
    + 40        random_values: bool | None = False,
    + 41        low: int | None = 0,
    + 42        high: int | None = 1000,
    + 43        seed: int = 42,
    + 44    ) -> np.ndarray:
    + 45
    + 46        """
    + 47        Generates dataset based on given parameters
    + 48        :param n_features: number of generated features
    + 49        :param n_samples: number of generated samples
    + 50        :param cardinality: default cardinality of the dataset
    + 51        :param structure: structure of the dataset
    + 52        :param ensure_rep: flag, ensures all given values represented
    + 53        :param random_values: flag, enables random (integer) feature values from set [low, high]
    + 54        :param low: sets lower bound of random feature values
    + 55        :param high: sets high bound of random feature values
    + 56        :param seed: sets seed of numpy random
    + 57        :return: X, 2D dataset
    + 58        """
    + 59
    + 60        self.dataset_info.update({
    + 61            'general': {
    + 62                'n_features': n_features,
    + 63                'n_samples': n_samples,
    + 64                'cardinality': cardinality,
    + 65                'structure': structure,
    + 66                'ensure_rep': ensure_rep,
    + 67                'seed': seed,
    + 68            },
    + 69        })
    + 70
    + 71        np.random.seed(seed)
    + 72        X = np.empty([n_features, n_samples])
    + 73
    + 74        if structure is None:
    + 75            # No specific structure parameter passed
    + 76            for i in range(n_features):
    + 77                x = self._generate_feature(
    + 78                    n_samples,
    + 79                    cardinality=cardinality,
    + 80                    ensure_rep=ensure_rep,
    + 81                    random_values=random_values,
    + 82                    low=low,
    + 83                    high=high,
    + 84                )
    + 85                X[i] = x
    + 86        else:
    + 87            # Structure parameter passed, building based on structure
    + 88            ix = 0
    + 89            for data in structure:
    + 90                if not isinstance(data[0], (list, np.ndarray)):
    + 91                    # Data in structure is a tuple of (feature index (integer), feature attributes)
    + 92                    feature_ix, feature_attributes = data
    + 93
    + 94                    if ix < feature_ix:
    + 95                        # Filling out the dataset up to column index feature_ix
    + 96                        for i in range(ix, feature_ix):
    + 97                            x = self._generate_feature(
    + 98                                n_samples,
    + 99                                cardinality=cardinality,
    +100                                ensure_rep=ensure_rep,
    +101                                random_values=random_values,
    +102                                low=low,
    +103                                high=high,
    +104                            )
    +105                            X[ix] = x
    +106                            ix += 1
    +107
    +108                    x = self._configure_generate_feature(
    +109                        feature_attributes,
    +110                        n_samples,
    +111                        ensure_rep=ensure_rep,
    +112                        random_values=random_values,
    +113                        low=low,
    +114                        high=high,
    +115                    )
    +116                    X[ix] = x
    +117                    ix += 1
    +118
    +119                else:
    +120                    # Data in structure is a tuple of (list of feature indexes, feature attributes)
    +121                    feature_ixs, feature_attributes = data
    +122
    +123                    for feature_ix in feature_ixs:
    +124                        # Filling out the dataset up to feature_ix
    +125                        if ix < feature_ix:
    +126                            for i in range(ix, feature_ix):
    +127                                x = self._generate_feature(
    +128                                    n_samples,
    +129                                    cardinality=cardinality,
    +130                                    ensure_rep=ensure_rep,
    +131                                    random_values=random_values,
    +132                                    low=low,
    +133                                    high=high,
    +134                                )
    +135                                X[ix] = x
    +136                                ix += 1
    +137
    +138                        x = self._configure_generate_feature(
    +139                            feature_attributes,
    +140                            n_samples,
    +141                            ensure_rep=ensure_rep,
    +142                            random_values=random_values,
    +143                            low=low,
    +144                            high=high,
    +145                        )
    +146
    +147                        X[ix] = x
    +148                        ix += 1
    +149
    +150            if ix < n_features:
    +151                # Fill out the rest of the dataset
    +152                for i in range(ix, n_features):
    +153                    x = self._generate_feature(
    +154                        n_samples,
    +155                        cardinality=cardinality,
    +156                        ensure_rep=ensure_rep,
    +157                        random_values=random_values,
    +158                        low=low,
    +159                        high=high,
    +160                    )
    +161                    X[i] = x
    +162
    +163        return X.T
    +
    + + +

    Generates dataset based on given parameters

    + +
    Parameters
    + +
      +
    • n_features: number of generated features
    • +
    • n_samples: number of generated samples
    • +
    • cardinality: default cardinality of the dataset
    • +
    • structure: structure of the dataset
    • +
    • ensure_rep: flag, ensures all given values represented
    • +
    • random_values: flag, enables random (integer) feature values from set [low, high]
    • +
    • low: sets lower bound of random feature values
    • +
    • high: sets high bound of random feature values
    • +
    • seed: sets seed of numpy random
    • +
    + +
    Returns
    + +
    +

    X, 2D dataset

    +
    +
    + + +
    +
    + +
    + + def + generate_combinations( self, X: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], feature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], combination_function: Optional = None, combination_type: Literal = 'linear') -> numpy.ndarray: + + + +
    + +
    267    def generate_combinations(
    +268        self,
    +269        X: ArrayLike,
    +270        feature_indices: list[int] | ArrayLike,
    +271        combination_function: Optional = None,
    +272        combination_type: Literal = 'linear',
    +273    ) -> np.ndarray:
    +274        """
    +275        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
    +276        :param X: dataset
    +277        :param feature_indices: indexes of features to be in combination
    +278        :param combination_function: optional custom function for combining feature vectors
    +279        :param combination_type: string flag, either liner or nonlinear, defining combination type
    +280        :return: X with added resultant feature
    +281        """
    +282
    +283        selected_features = X[:, feature_indices]
    +284
    +285        if combination_function is None:
    +286            if combination_type == 'linear':
    +287                combination_function = lambda x: np.sum(x, axis=1)
    +288            elif combination_type == 'nonlinear':
    +289                combination_function = lambda x: np.sin(np.sum(x, axis=1))
    +290        else:
    +291            combination_type = str(combination_function.__name__)
    +292
    +293        combination_result = combination_function(selected_features)
    +294
    +295        combination_ix = len(X[0])
    +296
    +297        self.dataset_info['combinations'].append({
    +298            'feature_indices': feature_indices,
    +299            'combination_type': combination_type,
    +300            'combination_ix': combination_ix,
    +301        })
    +302
    +303        return np.column_stack((X, combination_result))
    +
    + + +

    Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X

    + +
    Parameters
    + +
      +
    • X: dataset
    • +
    • feature_indices: indexes of features to be in combination
    • +
    • combination_function: optional custom function for combining feature vectors
    • +
    • combination_type: string flag, either liner or nonlinear, defining combination type
    • +
    + +
    Returns
    + +
    +

    X with added resultant feature

    +
    +
    + + +
    +
    + +
    + + def + generate_correlated( self, X: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], feature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], r: float = 0.8) -> numpy.ndarray: + + + +
    + +
    350    def generate_correlated(
    +351        self,
    +352        X: ArrayLike,
    +353        feature_indices: list[int] | ArrayLike,
    +354        r: float = 0.8,
    +355    ) -> np.ndarray:
    +356
    +357        """
    +358        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
    +359        :param X: dataset
    +360        :param feature_indices: indices of features to generate correlated feature to
    +361        :param r: (Pearson) correlation factor
    +362        :return: X with generated correlated  features
    +363        """
    +364
    +365        if not isinstance(feature_indices, (list, np.ndarray)):
    +366            feature_indices = np.array([feature_indices])
    +367
    +368        if len(feature_indices) > 1:
    +369            correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
    +370        else:
    +371            correlated_ixs = len(X[0])
    +372
    +373        selected_features = X[:, feature_indices]
    +374        transposed = np.transpose(selected_features)
    +375        correlated_features = []
    +376
    +377        for t in transposed:
    +378            theta = np.arccos(r)
    +379            t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
    +380
    +381            rand = np.random.normal(0, 1, len(t_standard))
    +382            rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
    +383
    +384            M = np.column_stack((t_standard, rand))
    +385            M_centred = (M - np.mean(M, axis=0))
    +386
    +387            Id = np.eye(len(t))
    +388            Q = qr(M_centred[:, [0]], mode='economic')[0]
    +389            P = np.dot(Q, Q.T)
    +390            orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
    +391            M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
    +392
    +393            Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
    +394            corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
    +395
    +396            correlated_features.append(corr)
    +397
    +398        correlated_features = np.transpose(correlated_features)
    +399
    +400        self.dataset_info['correlations'].append({
    +401            'feature_indices': feature_indices,
    +402            'correlated_indices': correlated_ixs,
    +403            'correlation_factor': r,
    +404        })
    +405
    +406        return np.column_stack((X, correlated_features))
    +
    + + +

    Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.

    + +
    Parameters
    + +
      +
    • X: dataset
    • +
    • feature_indices: indices of features to generate correlated feature to
    • +
    • r: (Pearson) correlation factor
    • +
    + +
    Returns
    + +
    +

    X with generated correlated features

    +
    +
    + + +
    +
    + +
    + + def + generate_duplicates( self, X: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], feature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]]) -> numpy.ndarray: + + + +
    + +
    408    def generate_duplicates(
    +409        self,
    +410        X: ArrayLike,
    +411        feature_indices: list[int] | ArrayLike,
    +412    ) -> np.ndarray:
    +413        """
    +414        Generates duplicate features
    +415        :param X: dataset
    +416        :param feature_indices: indices of features to duplicate
    +417        :return: dataset with duplicated features
    +418        """
    +419        if not isinstance(feature_indices, (list, np.ndarray)):
    +420            feature_indices = np.array([feature_indices])
    +421
    +422        duplicated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices) - 1), 1)
    +423
    +424        selected_features = X[:, feature_indices]
    +425
    +426        self.dataset_info['duplicates'].append({
    +427            'feature_indices': feature_indices,
    +428            'duplicate_indices': duplicated_ixs,
    +429        })
    +430
    +431        return np.column_stack((X, selected_features))
    +
    + + +

    Generates duplicate features

    + +
    Parameters
    + +
      +
    • X: dataset
    • +
    • feature_indices: indices of features to duplicate
    • +
    + +
    Returns
    + +
    +

    dataset with duplicated features

    +
    +
    + + +
    +
    + +
    + + def + generate_labels( self, X: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], n: int = 2, p: Union[float, list[float], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]] = 0.5, k: int | float = 2, decision_function: Optional = None, class_relation: str = 'linear', balance: bool = False): + + + +
    + +
    433    def generate_labels(
    +434        self,
    +435        X: ArrayLike,
    +436        n: int = 2,
    +437        p: float | list[float] | ArrayLike = 0.5,
    +438        k: int | float = 2,
    +439        decision_function: Optional = None,
    +440        class_relation: str = 'linear',
    +441        balance: bool = False,
    +442    ):
    +443        """
    +444        Generates labels for dataset X
    +445        :param X: dataset
    +446        :param n: number of class labels
    +447        :param p: class distribution
    +448        :param k: constant
    +449        :param decision_function: optional user-defined decision function
    +450        :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    +451        :param balance: boolean, whether to balance clustering class labels
    +452        :return: array of labels, corresponding to dataset X
    +453        """
    +454
    +455        if isinstance(p, (list, np.ndarray)):
    +456            if sum(p) > 1: raise ValueError('sum of values in must be less than 1.0')
    +457            if len(p) > n: raise ValueError('length of p must equal n')
    +458
    +459        if p > 1: raise ValueError('p must be less than 1.0')
    +460
    +461        n_samples, n_features = X.shape
    +462
    +463        if decision_function is None:
    +464            if class_relation == 'linear':
    +465                decision_function = lambda x: np.sum(2 * x + 3, axis=1)
    +466            elif class_relation == 'nonlinear':
    +467                decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
    +468            elif class_relation == 'cluster':
    +469                decision_function = None
    +470        else:
    +471            class_relation = str(decision_function.__name__)
    +472
    +473        y = []
    +474        if decision_function is not None:
    +475            if n > 2:
    +476                if type(p) != list:
    +477                    p = 1 / n
    +478                    percentiles = [p * 100]
    +479                    for i in range(1, n - 1):
    +480                        percentiles.append(percentiles[i - 1] + (p * 100))
    +481
    +482                    decision_boundary = decision_function(X)
    +483                    p_points = np.percentile(decision_boundary, percentiles)
    +484
    +485                    y = np.zeros_like(decision_boundary, dtype=int)
    +486                    for p_point in p_points:
    +487                        y += (decision_boundary > p_point)
    +488                else:
    +489                    decision_boundary = decision_function(X)
    +490                    percentiles = [x * 100 for x in p]
    +491
    +492                    for i in range(1, len(percentiles) - 1):
    +493                        percentiles[i] += percentiles[i - 1]
    +494
    +495                    percentiles.insert(0, 0)
    +496                    percentiles.pop()
    +497                    print(percentiles)
    +498
    +499                    p_points = np.percentile(decision_boundary, percentiles)
    +500                    print(p_points)
    +501
    +502                    y = np.zeros_like(decision_boundary, dtype=int)
    +503                    for i in range(1, n):
    +504                        p_point = p_points[i]
    +505                        for j in range(len(decision_boundary)):
    +506                            if decision_boundary[j] > p_point:
    +507                                y[j] += 1
    +508            else:
    +509                decision_boundary = decision_function(X)
    +510                p_point = np.percentile(decision_boundary, p * 100)
    +511                y = np.where(decision_boundary > p_point, 1, 0)
    +512        else:
    +513            if p == 0.5:
    +514                p = 1.0
    +515            else:
    +516                p = [p, 1 - p]
    +517            y = self._cluster_data(X, n, p=p, balance=balance)
    +518
    +519        self.dataset_info.update({
    +520            'labels': {
    +521                'class_relation': class_relation,
    +522                'n_class': n,
    +523            },
    +524        })
    +525
    +526        return y
    +
    + + +

    Generates labels for dataset X

    + +
    Parameters
    + +
      +
    • X: dataset
    • +
    • n: number of class labels
    • +
    • p: class distribution
    • +
    • k: constant
    • +
    • decision_function: optional user-defined decision function
    • +
    • class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    • +
    • balance: boolean, whether to balance clustering class labels
    • +
    + +
    Returns
    + +
    +

    array of labels, corresponding to dataset X

    +
    +
    + + +
    +
    + +
    + + def + generate_noise( self, X: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], y: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]], p: float = 0.2, type: Literal = 'categorical', missing_val: str | int | float = -inf) -> numpy.ndarray: + + + +
    + +
    624    def generate_noise(
    +625        self,
    +626        X: ArrayLike,
    +627        y: list[int] | ArrayLike,
    +628        p: float = 0.2,
    +629        type: Literal = 'categorical',
    +630        missing_val: str | int | float = float('-inf'),
    +631    ) -> np.ndarray:
    +632
    +633        """
    +634        Simulates noise on given dataset X
    +635        :param X: dataset to apply noise to
    +636        :param y: required target labels for categorical noise generation
    +637        :param p: amount of noise to apply. Defaults to 0.2
    +638        :param type: type of noise to apply, either categorical or missing
    +639        :param missing_val: value to simulate missing values. Defaults to float('-inf')
    +640        :return: X with noise applied
    +641        """
    +642
    +643        self.dataset_info['noise'].append({
    +644            'type': type,
    +645            'amount': p,
    +646        })
    +647
    +648        if type == 'categorical':
    +649            label_values, label_count = np.unique(y, return_counts=True)
    +650            n_labels = len(label_values)
    +651            y, X = np.asarray(y), np.asarray(X)  # accept plain lists as input
    +652            inds = y.argsort()
    +653            y_sort = y[inds]
    +654            X_sort = X[inds]
    +655
    +656            Xs_T = X_sort.T
    +657            n = Xs_T.shape[1]
    +658            n_flip = int(n * p)
    +659
    +660            for feature in Xs_T:
    +661                unique_per_label = {}
    +662
    +663                # cumulative boundaries of each label's block in the sorted arrays
    +664                label_offsets = np.concatenate(([0], np.cumsum(label_count)))
    +665                for i in range(n_labels):
    +666                    unique = np.unique(
    +667                        feature[label_offsets[i]:label_offsets[i + 1]],
    +668                    )
    +669                    unique_per_label[label_values[i]] = set(unique)
    +670
    +671                ixs = np.random.choice(n, n_flip, replace=False)
    +672
    +673                for ix in ixs:
    +674                    current_label = y_sort[ix]
    +675                    possible_labels = label_values[label_values != current_label]  # label values (not indices) so they can key unique_per_label
    +676
    +677                    # find all unique values from labels != current label
    +678                    values = set()
    +679                    for key in possible_labels:
    +680                        values = values.union(unique_per_label[key])
    +681
    +682                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
    +683                    # current label
    +684                    for val in unique_per_label[current_label] & values:
    +685                        values.remove(val)
    +686
    +687                    if len(values) > 0:
    +688                        val = np.random.choice(list(values))
    +689
    +690                    else:
    +691                        key = possible_labels[np.random.randint(len(possible_labels))]
    +692                        values = unique_per_label[key]
    +693                        val = np.random.choice(list(values))
    +694
    +695                    feature[ix] = val
    +696
    +697            rev_ind = inds.argsort()
    +698            X_noise = Xs_T.T
    +699            X_noise = X_noise[rev_ind]
    +700
    +701            return X_noise
    +702
    +703        elif type == 'missing':
    +704            X_noise = np.copy(X)
    +705            Xn_T = X_noise.T
    +706            n = Xn_T.shape[1]
    +707            n_missing = int(n * p)
    +708            #print("n to delete:", n_missing)
    +709
    +710            for feature in Xn_T:
    +711                ixs = np.random.choice(n, n_missing, replace=False)
    +712
    +713                for ix in ixs:
    +714                    feature[ix] = missing_val
    +715
    +716            return Xn_T.T
Simulates noise on given dataset X

Parameters
• X: dataset to apply noise to
• y: required target labels for categorical noise generation
• p: amount of noise to apply. Defaults to 0.2
• type: type of noise to apply, either categorical or missing
• missing_val: value to simulate missing values. Defaults to float('-inf')

Returns
X with noise applied
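Continuing the hypothetical cc, X and y objects from the sketch above, both noise types could be exercised as follows:

    # flip ~20% of each feature's values to values observed under other labels
    X_flipped = cc.generate_noise(X, y, p=0.2, type='categorical')

    # mark ~10% of each feature's values as missing, encoded here as -1
    X_missing = cc.generate_noise(X, y, p=0.1, type='missing', missing_val=-1)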
def downsample_dataset(self, X: ArrayLike, y: list[int] | ArrayLike, N: int | None = None, seed: int = 42, reshuffle: bool = False) -> tuple[np.ndarray, np.ndarray]:
    718    def downsample_dataset(
    +719        self,
    +720        X: ArrayLike,
    +721        y: list[int] | ArrayLike,
    +722        N: int | None = None,
    +723        seed: int = 42,
    +724        reshuffle: bool = False,
    +725    ) -> tuple[np.ndarray, np.ndarray]:
    +726
    +727        """
    +728        Downsamples dataset X according to N or the number of samples in the minority class, resulting in a balanced dataset.
    +729        :param X: Dataset to downsample
    +730        :param y: Labels corresponding to X
    +731        :param N: Optional number of samples per class to downsample to
    +732        :param seed: Seed for random state of resample function
    +733        :param reshuffle: Reshuffle the dataset after downsampling
    +734        :return: Balanced X and y after downsampling
    +735        """
    +736
    +737        original_shape = X.shape
    +738
    +739        values, counts = np.unique(y, return_counts=True)
    +740        if N is None:
    +741            N = min(counts)
    +742
    +743        if N > min(counts):
    +744            raise ValueError('N must be equal to or less than the number of samples in the minority class')
    +745
    +746        X_arrays_list = []
    +747        y_downsampled = []
    +748        for label in values:
    +749            X_label = [X[i] for i in range(len(y)) if y[i] == label]
    +750            X_label_downsample = resample(
    +751                X_label,
    +752                replace=False,  # sample without replacement; N never exceeds a class count here
    +753                n_samples=N,
    +754                random_state=seed,
    +755            )
    +756            X_arrays_list.append(X_label_downsample)
    +757            ys = [label] * N
    +758            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
    +759
    +760        X_downsampled = np.concatenate(X_arrays_list, axis=0)
    +761
    +762        if reshuffle:
    +763            indices = np.arange(len(X_downsampled))
    +764            np.random.shuffle(indices)
    +765            X_downsampled = X_downsampled[indices]
    +766            y_downsampled = y_downsampled[indices]
    +767
    +768        downsampled_shape = X_downsampled.shape
    +769
    +770        self.dataset_info.update({
    +771            'downsampling': {
    +772                'original_shape': original_shape,
    +773                'downsampled_shape': downsampled_shape,
    +774            },
    +775        })
    +776
    +777        return X_downsampled, y_downsampled
Downsamples dataset X according to N or the number of samples in the minority class, resulting in a balanced dataset.

Parameters
• X: Dataset to downsample
• y: Labels corresponding to X
• N: Optional number of samples per class to downsample to
• seed: Seed for random state of resample function
• reshuffle: Reshuffle the dataset after downsampling

Returns
Balanced X and y after downsampling
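A short follow-on sketch (same hypothetical objects as above); dataset_info records the shapes involved:

    X_bal, y_bal = cc.downsample_dataset(X, y, reshuffle=True)
    print(cc.dataset_info['downsampling'])   # {'original_shape': ..., 'downsampled_shape': ...}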
def print_dataset(self, X: ArrayLike, y: ArrayLike):
    779    def print_dataset(
    +780        self,
    +781        X: ArrayLike,
    +782        y: ArrayLike,
    +783    ):
    +784        """
    +785        Prints given dataset
    +786        :param X: dataset
    +787        :param y: labels
    +788        :return:
    +789        """
    +790
    +791        n_samples, n_features = X.shape
    +792        n = 0
    +793        for arr in X:
    +794            print('[', end='')
    +795            for i in range(n_features):
    +796                if i == n_features - 1:
    +797                    print(arr[i], end='')
    +798                else:
    +799                    print(arr[i], end=', ')
    +800            print(f'], Label: {y[n]}')
    +801            n += 1
Prints given dataset

Parameters
• X: dataset
• y: labels
def summarize(self):
    803    def summarize(self):
    +804
    +805        print(f"Number of features: {self.dataset_info['general']['n_features']}")
    +806        print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
    +807        if 'downsampling' in self.dataset_info:
    +808            print(
    +809                f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
    +810            )
    +811        print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
    +812        print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
    +813
    +814        print('-------------------------------------')
    +815
    +816        if len(self.dataset_info['combinations']) > 0:
    +817            print('Combinations:')
    +818            for comb in self.dataset_info['combinations']:
    +819                print(
    +820                    f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, resulting in {comb['combination_ix']}",
    +821                )
    +822            print('-------------------------------------')
    +823
    +824        if len(self.dataset_info['correlations']) > 0:
    +825            print('Correlations:')
    +826            for corr in self.dataset_info['correlations']:
    +827                print(
    +828                    f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
    +829                )
    +830            print('-------------------------------------')
    +831
    +832        if len(self.dataset_info['duplicates']) > 0:
    +833            print('Duplicates:')
    +834            for dup in self.dataset_info['duplicates']:
    +835                print(
    +836                    f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
    +837                )
    +838            print('-------------------------------------')
    +839
    +840        if len(self.dataset_info['noise']) > 0:
    +841            print('Simulated noise:')
    +842            for noise in self.dataset_info['noise']:
    +843                print(f"Simulated {noise['type']} noise, amount of {noise['amount']}")
    +844            print('-------------------------------------')
    +845
    +846        print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
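Putting the class together, a hypothetical end-to-end flow over the methods documented in this file (generate_labels assumed as before, other names as rendered above):

    cc = CategoricalClassification()
    X = cc.generate_data(n_features=4, n_samples=200, cardinality=3)
    y = cc.generate_labels(X, n=2, class_relation='cluster')
    X = cc.generate_noise(X, y, p=0.1, type='missing')
    X, y = cc.downsample_dataset(X, y, reshuffle=True)
    cc.print_dataset(X, y)    # row-by-row dump: [v1, v2, ...], Label: l
    cc.summarize()            # aggregate report built from cc.dataset_info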
\ No newline at end of file
diff --git a/docs/outrank/algorithms/synthetic_data_generators/generator_naive.html b/docs/outrank/algorithms/synthetic_data_generators/generator_naive.html
index cf1042c..f01beca 100644
--- a/docs/outrank/algorithms/synthetic_data_generators/generator_naive.html
+++ b/docs/outrank/algorithms/synthetic_data_generators/generator_naive.html
@@ -3,7 +3,7 @@
outrank.algorithms.synthetic_data_generators.generator_naive API documentation
@@ -49,7 +49,7 @@

    API Documentation

    outrank.algorithms.synthetic_data_generators.generator_naive

@@ -69,63 +69,64 @@

    13 target = sample[:, 30] 14 # Some noise 15 -16 target[target < 20] = 0 -17 return sample, target -18 +16 target[target < 40] = 0 +17 target[target > 39] = 1 +18 return sample, target 19 -20if __name__ == '__main__': -21 import argparse -22 import logging -23 import os -24 import shutil -25 -26 import pandas as pd -27 -28 logging.basicConfig( -29 format='%(asctime)s - %(message)s', -30 datefmt='%d-%b-%y %H:%M:%S', -31 ) -32 logger = logging.getLogger('syn-logger') -33 logger.setLevel(logging.DEBUG) -34 -35 parser = argparse.ArgumentParser( -36 description='Fast feature screening for sparse data sets.', -37 formatter_class=argparse.RawTextHelpFormatter, -38 ) -39 -40 parser.add_argument('--output_df_name', type=str, default=None) -41 -42 parser.add_argument('--verify_outputs', type=str, default=None) -43 -44 parser.add_argument('--num_features', type=int, default=300) -45 -46 parser.add_argument('--size', type=int, default=1000) -47 -48 args = parser.parse_args() -49 -50 if args.output_df_name is not None: -51 sample, target = generate_random_matrix(args.num_features, args.size) -52 dfx = pd.DataFrame(sample) -53 dfx.columns = [f'f{x}' for x in range(dfx.shape[1])] -54 dfx['label'] = target -55 if os.path.exists(args.output_df_name) and os.path.isdir(args.output_df_name): -56 shutil.rmtree(args.output_df_name) -57 os.mkdir(args.output_df_name) -58 dfx.to_csv(f'./{args.output_df_name}/data.csv', index=False) -59 -60 logging.info(f'Generated dataset {dfx.shape} in {args.output_df_name}') -61 elif args.verify_outputs is not None: -62 rankings = pd.read_csv( -63 os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t', -64 ) -65 if rankings.iloc[1]['Feature'] != 'f30-(81; 100)': -66 raise Exception( -67 f'Could not retrieve the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting', -68 ) -69 else: -70 logger.info( -71 f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})', -72 ) +20 +21if __name__ == '__main__': +22 import argparse +23 import logging +24 import os +25 import shutil +26 +27 import pandas as pd +28 +29 logging.basicConfig( +30 format='%(asctime)s - %(message)s', +31 datefmt='%d-%b-%y %H:%M:%S', +32 ) +33 logger = logging.getLogger('syn-logger') +34 logger.setLevel(logging.DEBUG) +35 +36 parser = argparse.ArgumentParser( +37 description='Fast feature screening for sparse data sets.', +38 formatter_class=argparse.RawTextHelpFormatter, +39 ) +40 +41 parser.add_argument('--output_df_name', type=str, default=None) +42 +43 parser.add_argument('--verify_outputs', type=str, default=None) +44 +45 parser.add_argument('--num_features', type=int, default=300) +46 +47 parser.add_argument('--size', type=int, default=1000) +48 +49 args = parser.parse_args() +50 +51 if args.output_df_name is not None: +52 sample, target = generate_random_matrix(args.num_features, args.size) +53 dfx = pd.DataFrame(sample) +54 dfx.columns = [f'f{x}' for x in range(dfx.shape[1])] +55 dfx['label'] = target +56 if os.path.exists(args.output_df_name) and os.path.isdir(args.output_df_name): +57 shutil.rmtree(args.output_df_name) +58 os.mkdir(args.output_df_name) +59 dfx.to_csv(f'./{args.output_df_name}/data.csv', index=False) +60 +61 logging.info(f'Generated dataset {dfx.shape} in {args.output_df_name}') +62 elif args.verify_outputs is not None: +63 rankings = pd.read_csv( +64 os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t', +65 ) +66 if rankings.iloc[1]['Feature'] != 'f30-(81; 100)': +67 raise Exception( +68 f'Could not retrieve 
the appropriate feature needle in the haystack {rankings.iloc[1].Feature}, exiting', +69 ) +70 else: +71 logger.info( +72 f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})', +73 ) @@ -133,7 +134,7 @@

    - + def generate_random_matrix(num_features=100, size=20000): @@ -148,12 +149,13 @@

14    target = sample[:, 30]
15    # Some noise
16
-17    target[target < 20] = 0
-18    return sample, target
+17    target[target < 40] = 0
+18    target[target > 39] = 1
+19    return sample, target

    @@ -339,4 +341,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/core_ranking.html b/docs/outrank/core_ranking.html
index 2c41396..9f3ea65 100644
--- a/docs/outrank/core_ranking.html
+++ b/docs/outrank/core_ranking.html
@@ -3,7 +3,7 @@
outrank.core_ranking API documentation
@@ -96,9 +96,6 @@

    API Documentation

  • compute_batch_ranking
  • - get_num_of_instances
  • get_grouped_df
  • @@ -124,7 +121,7 @@

    API Documentation

    outrank.core_ranking

    - + @@ -152,738 +149,740 @@

    21import pandas as pd 22import tqdm 23 - 24from outrank.algorithms.importance_estimator import get_importances_estimate_pairwise - 25from outrank.algorithms.sketches.counting_counters_ordinary import PrimitiveConstrainedCounter - 26from outrank.algorithms.sketches.counting_ultiloglog import ( - 27 HyperLogLogWCache as HyperLogLog, - 28) - 29from outrank.core_utils import BatchRankingSummary - 30from outrank.core_utils import extract_features_from_reference_JSON - 31from outrank.core_utils import generic_line_parser - 32from outrank.core_utils import internal_hash - 33from outrank.core_utils import NominalFeatureSummary - 34from outrank.core_utils import NumericFeatureSummary - 35from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric - 36from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise - 37 - 38logger = logging.getLogger('syn-logger') - 39logger.setLevel(logging.DEBUG) - 40random.seed(a=123, version=2) - 41GLOBAL_CARDINALITY_STORAGE: dict[Any, Any] = dict() - 42GLOBAL_COUNTS_STORAGE: dict[Any, Any] = dict() - 43GLOBAL_RARE_VALUE_STORAGE: dict[str, Any] = Counter() - 44GLOBAL_PRIOR_COMB_COUNTS: dict[Any, int] = Counter() - 45IGNORED_VALUES = set() - 46HYPERLL_ERROR_BOUND = 0.02 - 47MAX_FEATURES_3MR = 10 ** 4 - 48 - 49 - 50def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]: - 51 """Make sure only relevant subspace of combinations is selected based on prior counts""" + 24from outrank.algorithms.importance_estimator import \ + 25 get_importances_estimate_pairwise + 26from outrank.algorithms.sketches.counting_counters_ordinary import \ + 27 PrimitiveConstrainedCounter + 28from outrank.algorithms.sketches.counting_ultiloglog import \ + 29 HyperLogLogWCache as HyperLogLog + 30from outrank.core_utils import BatchRankingSummary + 31from outrank.core_utils import extract_features_from_reference_JSON + 32from outrank.core_utils import generic_line_parser + 33from outrank.core_utils import get_num_of_instances + 34from outrank.core_utils import internal_hash + 35from outrank.core_utils import is_prior_heuristic + 36from outrank.core_utils import NominalFeatureSummary + 37from outrank.core_utils import NumericFeatureSummary + 38from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric + 39from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise + 40 + 41logger = logging.getLogger('syn-logger') + 42logger.setLevel(logging.DEBUG) + 43random.seed(a=123, version=2) + 44GLOBAL_CARDINALITY_STORAGE: dict[Any, Any] = dict() + 45GLOBAL_COUNTS_STORAGE: dict[Any, Any] = dict() + 46GLOBAL_RARE_VALUE_STORAGE: dict[str, Any] = Counter() + 47GLOBAL_PRIOR_COMB_COUNTS: dict[Any, int] = Counter() + 48IGNORED_VALUES = set() + 49HYPERLL_ERROR_BOUND = 0.02 + 50MAX_FEATURES_3MR = 10 ** 4 + 51 52 - 53 if len(GLOBAL_PRIOR_COMB_COUNTS) == 0: - 54 for combination in combinations: - 55 GLOBAL_PRIOR_COMB_COUNTS[combination] += 1 - 56 tmp = combinations[:args.combination_number_upper_bound] - 57 else: - 58 tmp = list(x[0] for x in sorted(GLOBAL_PRIOR_COMB_COUNTS.items(), key=lambda x:x[1], reverse=False))[:args.combination_number_upper_bound] - 59 - 60 for combination in tmp: - 61 GLOBAL_PRIOR_COMB_COUNTS[combination] += 1 - 62 - 63 return tmp - 64 + 53def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]: + 54 """Make sure only relevant subspace of combinations is selected based on prior 
counts""" + 55 + 56 if len(combinations) == 0: + 57 return [] + 58 + 59 missing_combinations = set(set(combinations)).difference(GLOBAL_PRIOR_COMB_COUNTS.keys()) + 60 if len(missing_combinations) > 0: + 61 for combination in missing_combinations: + 62 GLOBAL_PRIOR_COMB_COUNTS[combination] = 0 + 63 + 64 tmp = sorted(combinations, key=GLOBAL_PRIOR_COMB_COUNTS.get, reverse=False)[:args.combination_number_upper_bound] 65 - 66def get_combinations_from_columns(all_columns: pd.Index, args: Any) -> list[tuple[Any, ...]]: - 67 """Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope""" + 66 for combination in tmp: + 67 GLOBAL_PRIOR_COMB_COUNTS[combination] += 1 68 - 69 if '3mr' in args.heuristic: - 70 if args.combination_number_upper_bound > MAX_FEATURES_3MR: - 71 args.combination_number_upper_bound = MAX_FEATURES_3MR - 72 rel_columns = [column for column in all_columns if ' AND_REL ' in column] - 73 non_rel_columns = sorted(set(all_columns) - set(rel_columns)) + 69 return tmp + 70 + 71 + 72def get_combinations_from_columns(all_columns: pd.Index, args: Any) -> list[tuple[Any, ...]]: + 73 """Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope""" 74 - 75 combinations = list( - 76 itertools.combinations_with_replacement(non_rel_columns, 2), - 77 ) - 78 combinations += [(column, args.label_column) for column in rel_columns] - 79 else: - 80 _combinations = itertools.combinations_with_replacement(all_columns, 2) - 81 - 82 # Some applications do not require the full feature-feature triangular matrix - 83 if args.target_ranking_only == 'True': - 84 combinations = [x for x in _combinations if args.label_column in x] - 85 else: - 86 combinations = list(_combinations) + 75 if '3mr' in args.heuristic: + 76 if args.combination_number_upper_bound > MAX_FEATURES_3MR: + 77 args.combination_number_upper_bound = MAX_FEATURES_3MR + 78 rel_columns = [column for column in all_columns if ' AND_REL ' in column] + 79 non_rel_columns = sorted(set(all_columns) - set(rel_columns)) + 80 + 81 combinations = list( + 82 itertools.combinations_with_replacement(non_rel_columns, 2), + 83 ) + 84 combinations += [(column, args.label_column) for column in rel_columns] + 85 else: + 86 _combinations = itertools.combinations_with_replacement(all_columns, 2) 87 - 88 if args.target_ranking_only != 'True': - 89 # Diagonal elements (non-label) - 90 combinations += [ - 91 (individual_column, individual_column) - 92 for individual_column in all_columns - 93 if individual_column != args.label_column - 94 ] - 95 return combinations - 96 - 97 - 98def mixed_rank_graph( - 99 input_dataframe: pd.DataFrame, args: Any, cpu_pool: Any, pbar: Any, -100) -> BatchRankingSummary: -101 """Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic""" + 88 # Some applications do not require the full feature-feature triangular matrix + 89 if args.target_ranking_only == 'True': + 90 combinations = [x for x in _combinations if args.label_column in x] + 91 else: + 92 combinations = list(_combinations) + 93 + 94 if args.target_ranking_only != 'True': + 95 # Diagonal elements (non-label) + 96 combinations += [ + 97 (individual_column, individual_column) + 98 for individual_column in all_columns + 99 if individual_column != args.label_column +100 ] +101 return combinations 102 -103 all_columns = input_dataframe.columns -104 -105 triplets = [] -106 tmp_df = input_dataframe.copy().astype('category') -107 out_time_struct = 
{} +103 +104def mixed_rank_graph( +105 input_dataframe: pd.DataFrame, args: Any, cpu_pool: Any, pbar: Any, +106) -> BatchRankingSummary: +107 """Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic""" 108 -109 # Handle cont. types prior to interaction evaluation -110 pbar.set_description('Encoding columns') -111 start_enc_timer = timer() -112 tmp_df = pd.DataFrame({k : tmp_df[k].cat.codes for k in all_columns}) -113 -114 end_enc_timer = timer() -115 out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer -116 -117 combinations = get_combinations_from_columns(all_columns, args) -118 combinations = prior_combinations_sample(combinations, args) -119 random.shuffle(combinations) -120 -121 if args.heuristic == 'Constant': -122 final_constant_imp = [] -123 for c1, c2 in combinations: -124 final_constant_imp.append((c1, c2, 0.0)) -125 -126 out_time_struct['feature_score_computation'] = end_enc_timer - \ -127 start_enc_timer -128 return BatchRankingSummary(final_constant_imp, out_time_struct) +109 all_columns = input_dataframe.columns +110 +111 triplets = [] +112 tmp_df = input_dataframe.copy().astype('category') +113 out_time_struct = {} +114 +115 # Handle cont. types prior to interaction evaluation +116 pbar.set_description('Encoding columns') +117 start_enc_timer = timer() +118 tmp_df = pd.DataFrame({k : tmp_df[k].cat.codes for k in all_columns}) +119 +120 end_enc_timer = timer() +121 out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer +122 +123 combinations = get_combinations_from_columns(all_columns, args) +124 +125 reference_model_features = {} +126 if is_prior_heuristic(args): +127 reference_model_features = [(' AND ').join(tuple(sorted(item.split(',')))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)] +128 combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features] 129 -130 # Map the scoring calls to the worker pool -131 pbar.set_description('Allocating thread pool') +130 combinations = prior_combinations_sample(combinations, args) +131 random.shuffle(combinations) 132 -133 # starmap is an alternative that is slower unfortunately (but nicer) -134 def get_grounded_importances_estimate(combination: tuple[str]) -> Any: -135 return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df) -136 -137 start_enc_timer = timer() -138 with cpu_pool as p: -139 pbar.set_description(f'Computing (#ftr={len(combinations)})') -140 results = p.amap(get_grounded_importances_estimate, combinations) -141 while not results.ready(): -142 time.sleep(4) -143 triplets = results.get() -144 end_enc_timer = timer() -145 out_time_struct['feature_score_computation'] = end_enc_timer - \ -146 start_enc_timer -147 -148 # Gather the final triplets -149 pbar.set_description('Aggregation of ranking results') -150 final_triplets = [] -151 for triplet in triplets: -152 inv = (triplet[1], triplet[0], triplet[2]) -153 final_triplets.append(inv) -154 final_triplets.append(triplet) -155 triplets = final_triplets -156 -157 pbar.set_description('Proceeding to the next batch of data') -158 return BatchRankingSummary(triplets, out_time_struct) +133 if args.heuristic == 'Constant': +134 final_constant_imp = [] +135 for c1, c2 in combinations: +136 final_constant_imp.append((c1, c2, 0.0)) +137 +138 out_time_struct['feature_score_computation'] = end_enc_timer - \ +139 start_enc_timer +140 return 
BatchRankingSummary(final_constant_imp, out_time_struct) +141 +142 # Map the scoring calls to the worker pool +143 pbar.set_description('Allocating thread pool') +144 +145 # starmap is an alternative that is slower unfortunately (but nicer) +146 def get_grounded_importances_estimate(combination: tuple[str]) -> Any: +147 return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df) +148 +149 start_enc_timer = timer() +150 with cpu_pool as p: +151 pbar.set_description(f'Computing (#ftr={len(combinations)})') +152 results = p.amap(get_grounded_importances_estimate, combinations) +153 while not results.ready(): +154 time.sleep(4) +155 triplets = results.get() +156 end_enc_timer = timer() +157 out_time_struct['feature_score_computation'] = end_enc_timer - \ +158 start_enc_timer 159 -160 -161def enrich_with_transformations( -162 input_dataframe: pd.DataFrame, num_col_types: set[str], logger: Any, args: Any, -163) -> pd.DataFrame: -164 """Construct a collection of new features based on pre-defined transformations/rules""" -165 -166 transformer = FeatureTransformerGeneric( -167 num_col_types, preset=args.transformers, -168 ) -169 transformed_df = transformer.construct_new_features(input_dataframe) -170 logger.info( -171 f'Constructed {len(transformer.constructed_feature_names)} new features ..', -172 ) -173 -174 return transformed_df -175 -176 -177def compute_combined_features( -178 input_dataframe: pd.DataFrame, -179 logger: Any, -180 args: Any, -181 pbar: Any, -182 is_3mr: bool = False, -183) -> pd.DataFrame: -184 """Compute higher order features via xxhash-based trick.""" +160 # Gather the final triplets +161 pbar.set_description('Aggregation of ranking results') +162 final_triplets = [] +163 for triplet in triplets: +164 inv = (triplet[1], triplet[0], triplet[2]) +165 final_triplets.append(inv) +166 final_triplets.append(triplet) +167 triplets = final_triplets +168 +169 pbar.set_description('Proceeding to the next batch of data') +170 return BatchRankingSummary(triplets, out_time_struct) +171 +172 +173def enrich_with_transformations( +174 input_dataframe: pd.DataFrame, num_col_types: set[str], logger: Any, args: Any, +175) -> pd.DataFrame: +176 """Construct a collection of new features based on pre-defined transformations/rules""" +177 +178 transformer = FeatureTransformerGeneric( +179 num_col_types, preset=args.transformers, +180 ) +181 transformed_df = transformer.construct_new_features(input_dataframe) +182 logger.info( +183 f'Constructed {len(transformer.constructed_feature_names)} new features ..', +184 ) 185 -186 all_columns = [ -187 x for x in input_dataframe.columns if x != args.label_column -188 ] -189 join_string = ' AND_REL ' if is_3mr else ' AND ' -190 interaction_order = 2 if is_3mr else args.interaction_order -191 -192 if args.reference_model_JSON != '': -193 combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) -194 full_combination_space = [combination.split(',') for combination in combined_features] -195 else: -196 full_combination_space = list( -197 itertools.combinations(all_columns, interaction_order), -198 ) -199 -200 if args.combination_number_upper_bound and args.reference_model_JSON != '': -201 random.shuffle(full_combination_space) -202 full_combination_space = full_combination_space[ -203 : args.combination_number_upper_bound -204 ] +186 return transformed_df +187 +188 +189def compute_combined_features( +190 input_dataframe: pd.DataFrame, +191 args: Any, +192 pbar: Any, 
+193 is_3mr: bool = False, +194) -> pd.DataFrame: +195 """Compute higher order features via xxhash-based trick.""" +196 +197 all_columns = [ +198 x for x in input_dataframe.columns if x != args.label_column +199 ] +200 join_string = ' AND_REL ' if is_3mr else ' AND ' +201 interaction_order = 2 if is_3mr else args.interaction_order +202 +203 model_combinations = [] +204 full_combination_space = [] 205 -206 com_counter = 0 -207 new_feature_hash = {} -208 for new_combination in full_combination_space: -209 pbar.set_description( -210 f'Created {com_counter}/{len(full_combination_space)}', -211 ) -212 combined_feature: list[str] = [str(0)] * input_dataframe.shape[0] -213 for feature in new_combination: -214 tmp_feature = input_dataframe[feature].tolist() -215 for enx, el in enumerate(tmp_feature): -216 combined_feature[enx] = str( -217 internal_hash( -218 str(combined_feature[enx]) + str(el), -219 ), -220 ) -221 ftr_name = join_string.join(str(x) for x in new_combination) -222 new_feature_hash[ftr_name] = combined_feature -223 com_counter += 1 -224 tmp_df = pd.DataFrame(new_feature_hash) -225 pbar.set_description('Concatenating into final frame ..') -226 input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) -227 del tmp_df -228 -229 return input_dataframe -230 -231 -232def compute_expanded_multivalue_features( -233 input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any, -234) -> pd.DataFrame: -235 """Compute one-hot encoded feature space based on each designated multivalue feature. E.g., feature with value "a,b,c" becomes three features, values of which are presence of a given value in a mutlivalue feature of choice.""" -236 -237 considered_multivalue_features = args.explode_multivalue_features.split( -238 ';', -239 ) -240 new_feature_hash = {} -241 missing_symbols = set(args.missing_value_symbols.split(',')) -242 -243 for multivalue_feature in considered_multivalue_features: -244 multivalue_feature_vector = input_dataframe[multivalue_feature].values.tolist( -245 ) -246 multivalue_feature_vector = [ -247 x.replace(',', '-') for x in multivalue_feature_vector -248 ] -249 multivalue_sets = [ -250 set(x.split('-')) -251 for x in multivalue_feature_vector -252 ] -253 unique_values = set.union(*multivalue_sets) -254 -255 for missing_symbol in missing_symbols: -256 if missing_symbol in unique_values: -257 unique_values.remove(missing_symbol) -258 -259 for unique_value in unique_values: -260 tmp_vec = [] -261 for enx, multivalue in enumerate(multivalue_sets): -262 if unique_value in multivalue: -263 tmp_vec.append('1') -264 else: -265 tmp_vec.append('') -266 -267 new_feature_hash[f'MULTIEX-{multivalue_feature}-{unique_value}'] = tmp_vec -268 -269 tmp_df = pd.DataFrame(new_feature_hash) -270 input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) -271 del tmp_df -272 -273 return input_dataframe -274 +206 +207 if args.interaction_order > 1: +208 full_combination_space = list( +209 itertools.combinations(all_columns, interaction_order), +210 ) +211 full_combination_space = prior_combinations_sample(full_combination_space, args) +212 +213 if args.reference_model_JSON != '': +214 model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) +215 model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] +216 if not is_prior_heuristic(args): +217 full_combination_space = model_combinations +218 +219 if is_prior_heuristic(args): +220 full_combination_space = full_combination_space + [tuple 
for tuple in model_combinations if tuple not in full_combination_space] +221 +222 +223 com_counter = 0 +224 new_feature_hash = {} +225 for new_combination in full_combination_space: +226 pbar.set_description( +227 f'Created {com_counter}/{len(full_combination_space)}', +228 ) +229 combined_feature: list[str] = [str(0)] * input_dataframe.shape[0] +230 for feature in new_combination: +231 tmp_feature = input_dataframe[feature].tolist() +232 for enx, el in enumerate(tmp_feature): +233 combined_feature[enx] = str( +234 internal_hash( +235 str(combined_feature[enx]) + str(el), +236 ), +237 ) +238 ftr_name = join_string.join(str(x) for x in new_combination) +239 new_feature_hash[ftr_name] = combined_feature +240 com_counter += 1 +241 tmp_df = pd.DataFrame(new_feature_hash) +242 pbar.set_description('Concatenating into final frame ..') +243 input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) +244 del tmp_df +245 +246 return input_dataframe +247 +248 +249def compute_expanded_multivalue_features( +250 input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any, +251) -> pd.DataFrame: +252 """Compute one-hot encoded feature space based on each designated multivalue feature. E.g., feature with value "a,b,c" becomes three features, values of which are presence of a given value in a mutlivalue feature of choice.""" +253 +254 considered_multivalue_features = args.explode_multivalue_features.split( +255 ';', +256 ) +257 new_feature_hash = {} +258 missing_symbols = set(args.missing_value_symbols.split(',')) +259 +260 for multivalue_feature in considered_multivalue_features: +261 multivalue_feature_vector = input_dataframe[multivalue_feature].values.tolist( +262 ) +263 multivalue_feature_vector = [ +264 x.replace(',', '-') for x in multivalue_feature_vector +265 ] +266 multivalue_sets = [ +267 set(x.split('-')) +268 for x in multivalue_feature_vector +269 ] +270 unique_values = set.union(*multivalue_sets) +271 +272 for missing_symbol in missing_symbols: +273 if missing_symbol in unique_values: +274 unique_values.remove(missing_symbol) 275 -276def compute_subfeatures( -277 input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any, -278) -> pd.DataFrame: -279 """Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction. -280 ->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered. -281 <->: Two sided construction - two-sided values present. 
This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded) -282 """ +276 for unique_value in unique_values: +277 tmp_vec = [] +278 for enx, multivalue in enumerate(multivalue_sets): +279 if unique_value in multivalue: +280 tmp_vec.append('1') +281 else: +282 tmp_vec.append('') 283 -284 all_subfeature_pair_seeds = args.subfeature_mapping.split(';') -285 new_feature_hash = dict() -286 -287 for seed_pair in all_subfeature_pair_seeds: -288 if '<->' in seed_pair: -289 feature_first, feature_second = seed_pair.split('<->') -290 -291 elif '->' in seed_pair: -292 feature_first, feature_second = seed_pair.split('->') -293 -294 else: -295 raise NotImplementedError( -296 'Please specify valid subfeature operator (<-> or ->)', -297 ) -298 -299 subframe = input_dataframe[[feature_first, feature_second]] -300 unique_feature_second = subframe[feature_second].unique() -301 feature_first_vec = subframe[feature_first].tolist() -302 feature_second_vec = subframe[feature_second].tolist() -303 out_template_feature = [ -304 (a, b) for a, b in zip(feature_first_vec, feature_second_vec) -305 ] -306 -307 if '<->' in seed_pair: -308 unique_feature_first = subframe[feature_first].unique() -309 -310 mask_types = [] -311 for unique_target_feature_value in unique_feature_second: -312 for unique_seed_feature_value in unique_feature_first: -313 mask_types.append( -314 (unique_seed_feature_value, unique_target_feature_value), -315 ) -316 -317 for mask_type in mask_types: -318 new_feature = [] -319 for value_tuple in out_template_feature: -320 if ( -321 value_tuple[0] == mask_type[0] -322 and value_tuple[1] == mask_type[1] -323 ): -324 new_feature.append(str(1)) -325 else: -326 new_feature.append(str(0)) -327 feature_name = ( -328 f'SUBFEATURE|{feature_first}|{feature_second}-' -329 + mask_type[0] -330 + '&' -331 + mask_type[1] -332 ) -333 new_feature_hash[feature_name] = new_feature -334 -335 del new_feature -336 -337 elif '->' in seed_pair: -338 for unique_target_feature_value in unique_feature_second: -339 tmp_new_feature = [ -340 'AND'.join( -341 x, -342 ) if x[1] == unique_target_feature_value else '' -343 for x in out_template_feature -344 ] -345 feature_name_final = ( -346 'SUBFEATURE-' + feature_first + '&' + unique_target_feature_value -347 ) -348 new_feature_hash[feature_name_final] = tmp_new_feature -349 -350 tmp_df = pd.DataFrame(new_feature_hash) -351 input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) -352 -353 del tmp_df -354 return input_dataframe -355 -356 -357def include_noisy_features( -358 input_dataframe: pd.DataFrame, logger: Any, args: Any, -359) -> pd.DataFrame: -360 """Add randomized features that serve as a sanity check""" -361 -362 transformer = FeatureTransformerNoise() -363 transformed_df = transformer.construct_new_features( -364 input_dataframe, args.label_column, -365 ) +284 new_feature_hash[f'MULTIEX-{multivalue_feature}-{unique_value}'] = tmp_vec +285 +286 tmp_df = pd.DataFrame(new_feature_hash) +287 input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) +288 del tmp_df +289 +290 return input_dataframe +291 +292 +293def compute_subfeatures( +294 input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any, +295) -> pd.DataFrame: +296 """Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction. +297 ->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered. 
+298 <->: Two sided construction - two-sided values present. This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded) +299 """ +300 +301 all_subfeature_pair_seeds = args.subfeature_mapping.split(';') +302 new_feature_hash = dict() +303 +304 for seed_pair in all_subfeature_pair_seeds: +305 if '<->' in seed_pair: +306 feature_first, feature_second = seed_pair.split('<->') +307 +308 elif '->' in seed_pair: +309 feature_first, feature_second = seed_pair.split('->') +310 +311 else: +312 raise NotImplementedError( +313 'Please specify valid subfeature operator (<-> or ->)', +314 ) +315 +316 subframe = input_dataframe[[feature_first, feature_second]] +317 unique_feature_second = subframe[feature_second].unique() +318 feature_first_vec = subframe[feature_first].tolist() +319 feature_second_vec = subframe[feature_second].tolist() +320 out_template_feature = [ +321 (a, b) for a, b in zip(feature_first_vec, feature_second_vec) +322 ] +323 +324 if '<->' in seed_pair: +325 unique_feature_first = subframe[feature_first].unique() +326 +327 mask_types = [] +328 for unique_target_feature_value in unique_feature_second: +329 for unique_seed_feature_value in unique_feature_first: +330 mask_types.append( +331 (unique_seed_feature_value, unique_target_feature_value), +332 ) +333 +334 for mask_type in mask_types: +335 new_feature = [] +336 for value_tuple in out_template_feature: +337 if ( +338 value_tuple[0] == mask_type[0] +339 and value_tuple[1] == mask_type[1] +340 ): +341 new_feature.append(str(1)) +342 else: +343 new_feature.append(str(0)) +344 feature_name = ( +345 f'SUBFEATURE|{feature_first}|{feature_second}-' +346 + mask_type[0] +347 + '&' +348 + mask_type[1] +349 ) +350 new_feature_hash[feature_name] = new_feature +351 +352 del new_feature +353 +354 elif '->' in seed_pair: +355 for unique_target_feature_value in unique_feature_second: +356 tmp_new_feature = [ +357 'AND'.join( +358 x, +359 ) if x[1] == unique_target_feature_value else '' +360 for x in out_template_feature +361 ] +362 feature_name_final = ( +363 'SUBFEATURE-' + feature_first + '&' + unique_target_feature_value +364 ) +365 new_feature_hash[feature_name_final] = tmp_new_feature 366 -367 return transformed_df -368 +367 tmp_df = pd.DataFrame(new_feature_hash) +368 input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) 369 -370def compute_coverage(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]: -371 """Compute coverage of features, incrementally""" -372 output_storage_cov = defaultdict(set) -373 all_missing_symbols = set(args.missing_value_symbols.split(',')) -374 for column in input_dataframe: -375 all_missing = sum( -376 [ -377 input_dataframe[column].values.tolist().count(x) -378 for x in all_missing_symbols -379 ], -380 ) -381 -382 output_storage_cov[column] = ( -383 1 - (all_missing / input_dataframe.shape[0]) -384 ) * 100 +370 del tmp_df +371 return input_dataframe +372 +373 +374def include_noisy_features( +375 input_dataframe: pd.DataFrame, logger: Any, args: Any, +376) -> pd.DataFrame: +377 """Add randomized features that serve as a sanity check""" +378 +379 transformer = FeatureTransformerNoise() +380 transformed_df = transformer.construct_new_features( +381 input_dataframe, args.label_column, +382 ) +383 +384 return transformed_df 385 -386 return output_storage_cov -387 -388 -389def compute_feature_memory_consumption(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]: -390 """An approximation of how much feature take up""" -391 
output_storage_features = defaultdict(set) -392 for col in input_dataframe.columns: -393 specific_column = [ -394 str(x).strip() for x in input_dataframe[col].astype(str).values.tolist() -395 ] -396 col_size = sum( -397 len(x.encode()) -398 for x in specific_column -399 ) / input_dataframe.shape[0] -400 output_storage_features[col] = col_size -401 return output_storage_features +386 +387def compute_coverage(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]: +388 """Compute coverage of features, incrementally""" +389 output_storage_cov = defaultdict(set) +390 all_missing_symbols = set(args.missing_value_symbols.split(',')) +391 for column in input_dataframe: +392 all_missing = sum( +393 [ +394 input_dataframe[column].values.tolist().count(x) +395 for x in all_missing_symbols +396 ], +397 ) +398 +399 output_storage_cov[column] = ( +400 1 - (all_missing / input_dataframe.shape[0]) +401 ) * 100 402 -403 -404def compute_value_counts(input_dataframe: pd.DataFrame, args: Any): -405 """Update the count structure""" -406 -407 global GLOBAL_RARE_VALUE_STORAGE -408 global IGNORED_VALUES -409 -410 for column in input_dataframe.columns: -411 main_values = input_dataframe[column].values -412 for value in main_values: -413 if value not in IGNORED_VALUES: -414 GLOBAL_RARE_VALUE_STORAGE.update({(column, value): 1}) -415 -416 for key, val in GLOBAL_RARE_VALUE_STORAGE.items(): -417 if val > args.rare_value_count_upper_bound: -418 IGNORED_VALUES.add(key) +403 return output_storage_cov +404 +405 +406def compute_feature_memory_consumption(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]: +407 """An approximation of how much feature take up""" +408 output_storage_features = defaultdict(set) +409 for col in input_dataframe.columns: +410 specific_column = [ +411 str(x).strip() for x in input_dataframe[col].astype(str).values.tolist() +412 ] +413 col_size = sum( +414 len(x.encode()) +415 for x in specific_column +416 ) / input_dataframe.shape[0] +417 output_storage_features[col] = col_size +418 return output_storage_features 419 -420 for to_remove_val in IGNORED_VALUES: -421 del GLOBAL_RARE_VALUE_STORAGE[to_remove_val] -422 +420 +421def compute_value_counts(input_dataframe: pd.DataFrame, args: Any): +422 """Update the count structure""" 423 -424def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any, max_unique_hist_constraint: int) -> None: -425 """Compute cardinalities of features, incrementally""" +424 global GLOBAL_RARE_VALUE_STORAGE +425 global IGNORED_VALUES 426 -427 global GLOBAL_CARDINALITY_STORAGE -428 output_storage_card = defaultdict(set) -429 for enx, column in enumerate(input_dataframe): -430 output_storage_card[column] = set(input_dataframe[column].unique()) -431 if column not in GLOBAL_CARDINALITY_STORAGE: -432 GLOBAL_CARDINALITY_STORAGE[column] = HyperLogLog( -433 HYPERLL_ERROR_BOUND, -434 ) -435 -436 if column not in GLOBAL_COUNTS_STORAGE: -437 GLOBAL_COUNTS_STORAGE[column] = PrimitiveConstrainedCounter(max_unique_hist_constraint) -438 -439 [GLOBAL_COUNTS_STORAGE[column].add(value) for value in input_dataframe[column].values] +427 for column in input_dataframe.columns: +428 main_values = input_dataframe[column].values +429 for value in main_values: +430 if value not in IGNORED_VALUES: +431 GLOBAL_RARE_VALUE_STORAGE.update({(column, value): 1}) +432 +433 for key, val in GLOBAL_RARE_VALUE_STORAGE.items(): +434 if val > args.rare_value_count_upper_bound: +435 IGNORED_VALUES.add(key) +436 +437 for to_remove_val in IGNORED_VALUES: +438 del 
GLOBAL_RARE_VALUE_STORAGE[to_remove_val] +439 440 -441 for unique_value in set(input_dataframe[column].unique()): -442 if unique_value: -443 GLOBAL_CARDINALITY_STORAGE[column].add( -444 internal_hash(unique_value), -445 ) -446 -447 pbar.set_description( -448 f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}', -449 ) -450 -451 -452def compute_bounds_increment( -453 input_dataframe: pd.DataFrame, numeric_column_types: set[str], -454) -> dict[str, Any]: -455 all_features = input_dataframe.columns -456 numeric_column_types = set(numeric_column_types) -457 summary_object = {} -458 summary_storage: Any = {} -459 for feature in all_features: -460 if feature in numeric_column_types: -461 feature_vector = pd.to_numeric( -462 input_dataframe[feature], errors='coerce', -463 ) -464 minimum = np.min(feature_vector) -465 maximum = np.max(feature_vector) -466 mean = np.mean(feature_vector) -467 summary_storage = NumericFeatureSummary( -468 feature, minimum, maximum, mean, len( -469 np.unique(feature_vector), -470 ), -471 ) -472 summary_object[feature] = summary_storage -473 -474 else: -475 feature_vector = input_dataframe[feature].values -476 summary_storage = NominalFeatureSummary( -477 feature, len(np.unique(feature_vector)), -478 ) -479 summary_object[feature] = summary_storage -480 -481 return summary_object -482 -483 -484def compute_batch_ranking( -485 line_tmp_storage: list[list[Any]], -486 numeric_column_types: set[str], -487 args: Any, -488 cpu_pool: Any, -489 column_descriptions: list[str], -490 logger: Any, -491 pbar: Any, -492) -> tuple[BatchRankingSummary, dict[str, Any], dict[str, set[str]], dict[str, set[str]]]: -493 """Enrich the feature space and compute the batch importances""" -494 -495 input_dataframe = pd.DataFrame(line_tmp_storage) -496 input_dataframe.columns = column_descriptions -497 pbar.set_description('Control features') -498 -499 if args.feature_set_focus: -500 if args.feature_set_focus == '_all_from_reference_JSON': -501 focus_set = extract_features_from_reference_JSON( -502 args.reference_model_JSON, -503 ) -504 -505 else: -506 focus_set = set(args.feature_set_focus.split(',')) -507 -508 focus_set.add(args.label_column) -509 focus_set = {x for x in focus_set if x in input_dataframe.columns} -510 input_dataframe = input_dataframe[list(focus_set)] +441def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any, max_unique_hist_constraint: int) -> None: +442 """Compute cardinalities of features, incrementally""" +443 +444 global GLOBAL_CARDINALITY_STORAGE +445 output_storage_card = defaultdict(set) +446 for enx, column in enumerate(input_dataframe): +447 output_storage_card[column] = set(input_dataframe[column].unique()) +448 if column not in GLOBAL_CARDINALITY_STORAGE: +449 GLOBAL_CARDINALITY_STORAGE[column] = HyperLogLog( +450 HYPERLL_ERROR_BOUND, +451 ) +452 +453 if column not in GLOBAL_COUNTS_STORAGE: +454 GLOBAL_COUNTS_STORAGE[column] = PrimitiveConstrainedCounter(max_unique_hist_constraint) +455 +456 [GLOBAL_COUNTS_STORAGE[column].add(value) for value in input_dataframe[column].values] +457 +458 for unique_value in set(input_dataframe[column].unique()): +459 if unique_value: +460 GLOBAL_CARDINALITY_STORAGE[column].add( +461 internal_hash(unique_value), +462 ) +463 +464 pbar.set_description( +465 f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}', +466 ) +467 +468 +469def compute_bounds_increment( +470 input_dataframe: pd.DataFrame, numeric_column_types: set[str], +471) -> dict[str, Any]: +472 all_features = 
input_dataframe.columns
+473    numeric_column_types = set(numeric_column_types)
+474    summary_object = {}
+475    summary_storage: Any = {}
+476    for feature in all_features:
+477        if feature in numeric_column_types:
+478            feature_vector = pd.to_numeric(
+479                input_dataframe[feature], errors='coerce',
+480            )
+481            minimum = np.min(feature_vector)
+482            maximum = np.max(feature_vector)
+483            mean = np.mean(feature_vector)
+484            summary_storage = NumericFeatureSummary(
+485                feature, minimum, maximum, mean, len(
+486                    np.unique(feature_vector),
+487                ),
+488            )
+489            summary_object[feature] = summary_storage
+490
+491        else:
+492            feature_vector = input_dataframe[feature].values
+493            summary_storage = NominalFeatureSummary(
+494                feature, len(np.unique(feature_vector)),
+495            )
+496            summary_object[feature] = summary_storage
+497
+498    return summary_object
+499
+500
+501def compute_batch_ranking(
+502    line_tmp_storage: list[list[Any]],
+503    numeric_column_types: set[str],
+504    args: Any,
+505    cpu_pool: Any,
+506    column_descriptions: list[str],
+507    logger: Any,
+508    pbar: Any,
+509) -> tuple[BatchRankingSummary, dict[str, Any], dict[str, set[str]], dict[str, set[str]]]:
+510    """Enrich the feature space and compute the batch importances"""
 511
-512    if args.transformers != 'none':
-513
-514        pbar.set_description('Adding transformations')
-515        input_dataframe = enrich_with_transformations(
-516            input_dataframe, numeric_column_types, logger, args,
-517        )
-518
-519    if args.explode_multivalue_features != 'False':
-520        pbar.set_description('Constructing new features from multivalue ones')
-521        input_dataframe = compute_expanded_multivalue_features(
-522            input_dataframe, logger, args, pbar,
-523        )
+512    input_dataframe = pd.DataFrame(line_tmp_storage)
+513    input_dataframe.columns = column_descriptions
+514    pbar.set_description('Control features')
+515
+516    if args.feature_set_focus:
+517        if args.feature_set_focus == '_all_from_reference_JSON':
+518            focus_set = extract_features_from_reference_JSON(
+519                args.reference_model_JSON,
+520            )
+521
+522        else:
+523            focus_set = set(args.feature_set_focus.split(','))
 524
-525    if args.subfeature_mapping != 'False':
-526        pbar.set_description('Constructing new (sub)features')
-527        input_dataframe = compute_subfeatures(
-528            input_dataframe, logger, args, pbar,
-529        )
+525        focus_set.add(args.label_column)
+526        focus_set = {x for x in focus_set if x in input_dataframe.columns}
+527        input_dataframe = input_dataframe[list(focus_set)]
+528
+529    if args.transformers != 'none':
 530
-531    if args.interaction_order > 1 or args.reference_model_JSON:
-532        pbar.set_description('Constructing new features')
-533        input_dataframe = compute_combined_features(
-534            input_dataframe, logger, args, pbar,
-535        )
-536
-537    # in case of 3mr we compute the score of combinations against the target
-538    if '3mr' in args.heuristic:
-539        pbar.set_description(
-540            'Constructing features for computing relations in 3mr',
-541        )
-542        input_dataframe = compute_combined_features(
-543            input_dataframe, logger, args, pbar, True,
-544        )
-545
-546    if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
-547        pbar.set_description('Computing baseline features')
-548        input_dataframe = include_noisy_features(input_dataframe, logger, args)
-549
-550    # Compute incremental statistic useful for data inspection/transformer generation
-551    pbar.set_description('Computing coverage')
-552    coverage_storage = compute_coverage(input_dataframe, args)
-553    feature_memory_consumption = compute_feature_memory_consumption(
-554        input_dataframe, args,
-555    )
-556    compute_cardinalities(input_dataframe, pbar, args.max_unique_hist_constraint)
-557
-558    if args.task == 'identify_rare_values':
-559        compute_value_counts(input_dataframe, args)
-560
-561    bounds_storage = compute_bounds_increment(
-562        input_dataframe, numeric_column_types,
-563    )
-564
-565    pbar.set_description(
-566        f'Computing ranks for {input_dataframe.shape[1]} features',
-567    )
-568
-569    return (
-570        mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
-571        bounds_storage,
-572        coverage_storage,
-573        feature_memory_consumption,
-574    )
-575
-576
-577def get_num_of_instances(fname: str) -> int:
-578    """Count the number of lines in a file, fast - useful for progress logging"""
-579
-580    def _make_gen(reader):
-581        while True:
-582            b = reader(2**16)
-583            if not b:
-584                break
-585            yield b
-586
-587    with open(fname, 'rb') as f:
-588        count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
-589    return count
-590
-591
-592def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pd.DataFrame:
-593    """A helper method that enables median-based aggregation after processing"""
-594
-595    importances_df = pd.DataFrame(importances_df_list)
-596    if len(importances_df) == 0:
-597        return None
-598    importances_df.columns = ['FeatureA', 'FeatureB', 'Score']
-599    grouped = importances_df.groupby(
-600        ['FeatureA', 'FeatureB'],
-601    ).median().reset_index()
-602    return grouped
-603
-604
-605def checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None:
-606    """A helper which stores intermediary state - useful for longer runs"""
-607
-608    gdf = get_grouped_df(importances_batch)
-609    if gdf is not None:
-610        gdf.to_csv('ranking_checkpoint_tmp.tsv', sep='\t')
-611
-612
-613def estimate_importances_minibatches(
-614    input_file: str,
-615    column_descriptions: list,
-616    fw_col_mapping: dict[str, str],
-617    numeric_column_types: set,
-618    batch_size: int = 100000,
-619    args: Any = None,
-620    data_encoding: str = 'utf-8',
-621    cpu_pool: Any = None,
-622    delimiter: str = '\t',
-623    feature_construction_mode: bool = False,
-624    logger: Any = None,
-625) -> tuple[list[dict[str, Any]], Any, dict[Any, Any], list[dict[str, Any]], list[dict[str, set[str]]], defaultdict[str, list[set[str]]], dict[str, Any], dict[str, Any], dict[str, Any]]:
-626    """Interaction score estimator - suitable for example for csv-like input data types.
-627    This type of data is normally a single large csv, meaning that minibatch processing needs to
-628    happen during incremental handling of the file (that"s not the case for pre-separated ob data)
-629    """
-630
-631    invalid_line_queue: Any = deque([], maxlen=2**5)
+531        pbar.set_description('Adding transformations')
+532        input_dataframe = enrich_with_transformations(
+533            input_dataframe, numeric_column_types, logger, args,
+534        )
+535
+536    if args.explode_multivalue_features != 'False':
+537        pbar.set_description('Constructing new features from multivalue ones')
+538        input_dataframe = compute_expanded_multivalue_features(
+539            input_dataframe, logger, args, pbar,
+540        )
+541
+542    if args.subfeature_mapping != 'False':
+543        pbar.set_description('Constructing new (sub)features')
+544        input_dataframe = compute_subfeatures(
+545            input_dataframe, logger, args, pbar,
+546        )
+547
+548    if args.interaction_order > 1 or args.reference_model_JSON:
+549        pbar.set_description('Constructing new features')
+550        input_dataframe = compute_combined_features(
+551            input_dataframe, args, pbar,
+552        )
+553
+554    # in case of 3mr we compute the score of combinations against the target
+555    if '3mr' in args.heuristic:
+556        pbar.set_description(
+557            'Constructing features for computing relations in 3mr',
+558        )
+559        input_dataframe = compute_combined_features(
+560            input_dataframe, args, pbar, True,
+561        )
+562
+563    if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
+564        pbar.set_description('Computing baseline features')
+565        input_dataframe = include_noisy_features(input_dataframe, logger, args)
+566
+567    # Compute incremental statistic useful for data inspection/transformer generation
+568    pbar.set_description('Computing coverage')
+569    coverage_storage = compute_coverage(input_dataframe, args)
+570    feature_memory_consumption = compute_feature_memory_consumption(
+571        input_dataframe, args,
+572    )
+573    compute_cardinalities(input_dataframe, pbar, args.max_unique_hist_constraint)
+574
+575    if args.task == 'identify_rare_values':
+576        compute_value_counts(input_dataframe, args)
+577
+578    bounds_storage = compute_bounds_increment(
+579        input_dataframe, numeric_column_types,
+580    )
+581
+582    pbar.set_description(
+583        f'Computing ranks for {input_dataframe.shape[1]} features',
+584    )
+585
+586    return (
+587        mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
+588        bounds_storage,
+589        coverage_storage,
+590        feature_memory_consumption,
+591    )
+592
+593
+594def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pd.DataFrame:
+595    """A helper method that enables median-based aggregation after processing"""
+596
+597    importances_df = pd.DataFrame(importances_df_list)
+598    if len(importances_df) == 0:
+599        return None
+600    importances_df.columns = ['FeatureA', 'FeatureB', 'Score']
+601    grouped = importances_df.groupby(
+602        ['FeatureA', 'FeatureB'],
+603    ).median().reset_index()
+604    return grouped
+605
+606
+607def checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None:
+608    """A helper which stores intermediary state - useful for longer runs"""
+609
+610    gdf = get_grouped_df(importances_batch)
+611    if gdf is not None:
+612        gdf.to_csv('ranking_checkpoint_tmp.tsv', sep='\t')
+613
+614
+615def estimate_importances_minibatches(
+616    input_file: str,
+617    column_descriptions: list,
+618    fw_col_mapping: dict[str, str],
+619    numeric_column_types: set,
+620    batch_size: int = 100000,
+621    args: Any = None,
+622    data_encoding: str = 'utf-8',
+623    cpu_pool: Any = None,
+624    delimiter: str = '\t',
+625    feature_construction_mode: bool = False,
+626    logger: Any = None,
+627) -> tuple[list[dict[str, Any]], Any, dict[Any, Any], list[dict[str, Any]], list[dict[str, set[str]]], defaultdict[str, list[set[str]]], dict[str, Any], dict[str, Any], dict[str, Any]]:
+628    """Interaction score estimator - suitable for example for csv-like input data types.
+629    This type of data is normally a single large csv, meaning that minibatch processing needs to
+630    happen during incremental handling of the file (that's not the case for pre-separated ob data)
+631    """
 632
-633    invalid_lines = 0
-634    line_counter = 0
-635
-636    importances_df: list[Any] = []
-637    line_tmp_storage = []
-638    bounds_storage_batch = []
-639    memory_storage_batch = []
-640    step_timing_checkpoints = []
-641
-642    local_coverage_object = defaultdict(list)
-643    local_pbar = tqdm.tqdm(
-644        total=get_num_of_instances(input_file) - 1, position=0, disable=args.disable_tqdm == 'True',
-645    )
-646
-647    file_name, file_extension = os.path.splitext(input_file)
+633    invalid_line_queue: Any = deque([], maxlen=2**5)
+634
+635    invalid_lines = 0
+636    line_counter = 0
+637
+638    importances_df: list[Any] = []
+639    line_tmp_storage = []
+640    bounds_storage_batch = []
+641    memory_storage_batch = []
+642    step_timing_checkpoints = []
+643
+644    local_coverage_object = defaultdict(list)
+645    local_pbar = tqdm.tqdm(
+646        total=get_num_of_instances(input_file) - 1, position=0, disable=args.disable_tqdm == 'True',
+647    )
 648
-649    if file_extension == '.gz':
-650        file_stream = gzip.open(input_file, 'rt', encoding=data_encoding)
-651
-652    else:
-653        file_stream = open(input_file, encoding=data_encoding)
-654
-655    file_stream.readline()
+649    file_name, file_extension = os.path.splitext(input_file)
+650
+651    if file_extension == '.gz':
+652        file_stream = gzip.open(input_file, 'rt', encoding=data_encoding)
+653
+654    else:
+655        file_stream = open(input_file, encoding=data_encoding)
 656
-657    local_pbar.set_description('Starting ranking computation')
-658    for line in file_stream:
-659        line_counter += 1
-660        local_pbar.update(1)
-661
-662        if line_counter % args.subsampling != 0:
-663            continue
-664
-665        parsed_line = generic_line_parser(
-666            line, delimiter, args, fw_col_mapping, column_descriptions,
-667        )
-668
-669        if len(parsed_line) == len(column_descriptions):
-670            line_tmp_storage.append(parsed_line)
-671
-672        else:
-673            invalid_line_queue.appendleft(str(parsed_line))
-674            invalid_lines += 1
-675
-676        # Batches need to be processed on-the-fly
-677        if len(line_tmp_storage) >= args.minibatch_size:
-678
-679            importances_batch, bounds_storage, coverage_storage, memory_storage = compute_batch_ranking(
-680                line_tmp_storage,
-681                numeric_column_types,
-682                args,
-683                cpu_pool,
-684                column_descriptions,
-685                logger,
-686                local_pbar,
-687            )
-688
-689            bounds_storage_batch.append(bounds_storage)
-690            memory_storage_batch.append(memory_storage)
-691            for k, v in coverage_storage.items():
-692                local_coverage_object[k].append(v)
-693
-694            del coverage_storage
+657    file_stream.readline()
+658
+659    local_pbar.set_description('Starting ranking computation')
+660    for line in file_stream:
+661        line_counter += 1
+662        local_pbar.update(1)
+663
+664        if line_counter % args.subsampling != 0:
+665            continue
+666
+667        parsed_line = generic_line_parser(
+668            line, delimiter, args, fw_col_mapping, column_descriptions,
+669        )
+670
+671        if len(parsed_line) == len(column_descriptions):
+672            line_tmp_storage.append(parsed_line)
+673
+674        else:
+675            invalid_line_queue.appendleft(str(parsed_line))
+676            invalid_lines += 1
+677
+678        # Batches need to be processed on-the-fly
+679        if len(line_tmp_storage) >= args.minibatch_size:
+680
+681            importances_batch, bounds_storage, coverage_storage, memory_storage = compute_batch_ranking(
+682                line_tmp_storage,
+683                numeric_column_types,
+684                args,
+685                cpu_pool,
+686                column_descriptions,
+687                logger,
+688                local_pbar,
+689            )
+690
+691            bounds_storage_batch.append(bounds_storage)
+692            memory_storage_batch.append(memory_storage)
+693            for k, v in coverage_storage.items():
+694                local_coverage_object[k].append(v)
 695
-696            line_tmp_storage = []
-697            step_timing_checkpoints.append(importances_batch.step_times)
-698            importances_df += importances_batch.triplet_scores
-699
-700            if args.heuristic != 'Constant':
-701                local_pbar.set_description('Creating checkpoint')
-702                checkpoint_importances_df(importances_df)
-703
-704    file_stream.close()
+696            del coverage_storage
+697
+698            line_tmp_storage = []
+699            step_timing_checkpoints.append(importances_batch.step_times)
+700            importances_df += importances_batch.triplet_scores
+701
+702            if args.heuristic != 'Constant':
+703                local_pbar.set_description('Creating checkpoint')
+704                checkpoint_importances_df(importances_df)
 705
-706    local_pbar.set_description('Parsing the remainder')
-707    if invalid_lines > 0:
-708        logger.info(
-709            f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
-710        )
-711
-712        invalid_lines_log = '\n INVALID_LINE ====> '.join(
-713            list(invalid_line_queue)[0:5],
-714        )
-715        logger.info(
-716            f'5 samples of invalid lines are printed below\n {invalid_lines_log}',
-717        )
-718
-719    remaining_batch_size = len(line_tmp_storage)
+706    file_stream.close()
+707
+708    local_pbar.set_description('Parsing the remainder')
+709    if invalid_lines > 0:
+710        logger.info(
+711            f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
+712        )
+713
+714        invalid_lines_log = '\n INVALID_LINE ====> '.join(
+715            list(invalid_line_queue)[0:5],
+716        )
+717        logger.info(
+718            f'5 samples of invalid lines are printed below\n {invalid_lines_log}',
+719        )
 720
-721    if remaining_batch_size > 2**10:
-722        line_tmp_storage = line_tmp_storage[: args.minibatch_size]
-723        importances_batch, bounds_storage, coverage_storage, _ = compute_batch_ranking(
-724            line_tmp_storage,
-725            numeric_column_types,
-726            args,
-727            cpu_pool,
-728            column_descriptions,
-729            logger,
-730            local_pbar,
-731        )
-732
-733        for k, v in coverage_storage.items():
-734            local_coverage_object[k].append(v)
-735
-736        step_timing_checkpoints.append(importances_batch.step_times)
-737        importances_df += importances_batch.triplet_scores
-738        bounds_storage = dict()
-739        bounds_storage_batch.append(bounds_storage)
-740        checkpoint_importances_df(importances_df)
-741
-742    local_pbar.set_description('Wrapping up')
-743    local_pbar.close()
-744
-745    return (
-746        step_timing_checkpoints,
-747        get_grouped_df(importances_df),
-748        GLOBAL_CARDINALITY_STORAGE.copy(),
-749        bounds_storage_batch,
-750        memory_storage_batch,
-751        local_coverage_object,
-752        GLOBAL_RARE_VALUE_STORAGE.copy(),
-753        GLOBAL_PRIOR_COMB_COUNTS.copy(),
-754        GLOBAL_COUNTS_STORAGE.copy(),
-755    )
+721    remaining_batch_size = len(line_tmp_storage)
+722
+723    if remaining_batch_size > 2**10:
+724        line_tmp_storage = line_tmp_storage[: args.minibatch_size]
+725        importances_batch, bounds_storage, coverage_storage, _ = compute_batch_ranking(
+726            line_tmp_storage,
+727            numeric_column_types,
+728            args,
+729            cpu_pool,
+730            column_descriptions,
+731            logger,
+732            local_pbar,
+733        )
+734
+735        for k, v in coverage_storage.items():
+736            local_coverage_object[k].append(v)
+737
+738        step_timing_checkpoints.append(importances_batch.step_times)
+739        importances_df += importances_batch.triplet_scores
+740        bounds_storage = dict()
+741        bounds_storage_batch.append(bounds_storage)
+742        checkpoint_importances_df(importances_df)
+743
+744    local_pbar.set_description('Wrapping up')
+745    local_pbar.close()
+746
+747    return (
+748        step_timing_checkpoints,
+749        get_grouped_df(importances_df),
+750        GLOBAL_CARDINALITY_STORAGE.copy(),
+751        bounds_storage_batch,
+752        memory_storage_batch,
+753        local_coverage_object,
+754        GLOBAL_RARE_VALUE_STORAGE.copy(),
+755        GLOBAL_PRIOR_COMB_COUNTS.copy(),
+756        GLOBAL_COUNTS_STORAGE.copy(),
+757    )
@@ -893,11 +892,11 @@

    logger = <Logger syn-logger (DEBUG)>
    @@ -905,11 +904,11 @@

    GLOBAL_CARDINALITY_STORAGE: dict[typing.Any, typing.Any] = {}

    @@ -917,11 +916,11 @@

    GLOBAL_COUNTS_STORAGE: dict[typing.Any, typing.Any] = {}

    @@ -929,11 +928,11 @@

    GLOBAL_RARE_VALUE_STORAGE: dict[str, typing.Any] = Counter()

    @@ -941,11 +940,11 @@

    GLOBAL_PRIOR_COMB_COUNTS: dict[typing.Any, int] = Counter()

    @@ -953,11 +952,11 @@

    IGNORED_VALUES = set()

    @@ -965,11 +964,11 @@

    HYPERLL_ERROR_BOUND = 0.02

    @@ -977,17 +976,17 @@

    MAX_FEATURES_3MR = 10000

    def prior_combinations_sample( combinations: list[tuple[typing.Any, ...]], args: Any) -> list[tuple[typing.Any, ...]]:
    @@ -995,20 +994,23 @@

    -
    51def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]:
    -52    """Make sure only relevant subspace of combinations is selected based on prior counts"""
    -53
    -54    if len(GLOBAL_PRIOR_COMB_COUNTS) == 0:
    -55        for combination in combinations:
    -56            GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
    -57        tmp = combinations[:args.combination_number_upper_bound]
    -58    else:
    -59        tmp = list(x[0] for x in sorted(GLOBAL_PRIOR_COMB_COUNTS.items(), key=lambda x:x[1], reverse=False))[:args.combination_number_upper_bound]
    -60
    -61    for combination in tmp:
    -62        GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
    -63
    -64    return tmp
    +            
    54def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]:
    +55    """Make sure only relevant subspace of combinations is selected based on prior counts"""
    +56
    +57    if len(combinations) == 0:
    +58        return []
    +59
    +60    missing_combinations = set(combinations).difference(GLOBAL_PRIOR_COMB_COUNTS.keys())
    +61    if len(missing_combinations) > 0:
    +62        for combination in missing_combinations:
    +63            GLOBAL_PRIOR_COMB_COUNTS[combination] = 0
    +64
    +65    tmp = sorted(combinations, key=GLOBAL_PRIOR_COMB_COUNTS.get, reverse=False)[:args.combination_number_upper_bound]
    +66
    +67    for combination in tmp:
    +68        GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
    +69
    +70    return tmp
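
For intuition, a minimal standalone sketch of the selection policy the new listing implements: combinations scored least often so far are preferred, and unseen combinations (count 0 in a Counter) come first. The helper name and the bound of 2 are illustrative, not part of the patch.

    from collections import Counter

    GLOBAL_PRIOR_COMB_COUNTS = Counter()

    def pick_least_scored(combinations, upper_bound):
        # A Counter yields 0 for unseen keys, so fresh combinations sort first
        chosen = sorted(combinations, key=lambda c: GLOBAL_PRIOR_COMB_COUNTS[c])[:upper_bound]
        for c in chosen:
            GLOBAL_PRIOR_COMB_COUNTS[c] += 1
        return chosen

    combos = [('a', 'b'), ('a', 'c'), ('b', 'c')]
    print(pick_least_scored(combos, 2))  # [('a', 'b'), ('a', 'c')]
    print(pick_least_scored(combos, 2))  # ('b', 'c') is now least scored, so it comes first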
     
    @@ -1020,7 +1022,7 @@

    def get_combinations_from_columns( all_columns: pandas.core.indexes.base.Index, args: Any) -> list[tuple[typing.Any, ...]]:
    @@ -1028,36 +1030,36 @@

    -
    67def get_combinations_from_columns(all_columns: pd.Index, args: Any) -> list[tuple[Any, ...]]:
    -68    """Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope"""
    -69
    -70    if '3mr' in args.heuristic:
    -71        if args.combination_number_upper_bound > MAX_FEATURES_3MR:
    -72            args.combination_number_upper_bound = MAX_FEATURES_3MR
    -73        rel_columns = [column for column in all_columns if ' AND_REL ' in column]
    -74        non_rel_columns = sorted(set(all_columns) - set(rel_columns))
    -75
    -76        combinations = list(
    -77            itertools.combinations_with_replacement(non_rel_columns, 2),
    -78        )
    -79        combinations += [(column, args.label_column) for column in rel_columns]
    -80    else:
    -81        _combinations = itertools.combinations_with_replacement(all_columns, 2)
    -82
    -83        # Some applications do not require the full feature-feature triangular matrix
    -84        if args.target_ranking_only == 'True':
    -85            combinations = [x for x in _combinations if args.label_column in x]
    -86        else:
    -87            combinations = list(_combinations)
    -88
    -89    if args.target_ranking_only != 'True':
    -90        # Diagonal elements (non-label)
    -91        combinations += [
    -92            (individual_column, individual_column)
    -93            for individual_column in all_columns
    -94            if individual_column != args.label_column
    -95        ]
    -96    return combinations
    +            
     73def get_combinations_from_columns(all_columns: pd.Index, args: Any) -> list[tuple[Any, ...]]:
    + 74    """Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope"""
    + 75
    + 76    if '3mr' in args.heuristic:
    + 77        if args.combination_number_upper_bound > MAX_FEATURES_3MR:
    + 78            args.combination_number_upper_bound = MAX_FEATURES_3MR
    + 79        rel_columns = [column for column in all_columns if ' AND_REL ' in column]
    + 80        non_rel_columns = sorted(set(all_columns) - set(rel_columns))
    + 81
    + 82        combinations = list(
    + 83            itertools.combinations_with_replacement(non_rel_columns, 2),
    + 84        )
    + 85        combinations += [(column, args.label_column) for column in rel_columns]
    + 86    else:
    + 87        _combinations = itertools.combinations_with_replacement(all_columns, 2)
    + 88
    + 89        # Some applications do not require the full feature-feature triangular matrix
    + 90        if args.target_ranking_only == 'True':
    + 91            combinations = [x for x in _combinations if args.label_column in x]
    + 92        else:
    + 93            combinations = list(_combinations)
    + 94
    + 95    if args.target_ranking_only != 'True':
    + 96        # Diagonal elements (non-label)
    + 97        combinations += [
    + 98            (individual_column, individual_column)
    + 99            for individual_column in all_columns
    +100            if individual_column != args.label_column
    +101        ]
    +102    return combinations
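
As a toy illustration of the enumeration above (column names invented), the target-only filter keeps just the pairs that touch the label:

    import itertools

    all_columns = ['f1', 'f2', 'label']
    pairs = list(itertools.combinations_with_replacement(all_columns, 2))
    # What target_ranking_only == 'True' would retain
    target_only = [p for p in pairs if 'label' in p]
    print(target_only)  # [('f1', 'label'), ('f2', 'label'), ('label', 'label')]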
     
    @@ -1069,7 +1071,7 @@

    def mixed_rank_graph( input_dataframe: pandas.core.frame.DataFrame, args: Any, cpu_pool: Any, pbar: Any) -> outrank.core_utils.BatchRankingSummary:
    @@ -1077,67 +1079,73 @@

    -
     99def mixed_rank_graph(
    -100    input_dataframe: pd.DataFrame, args: Any, cpu_pool: Any, pbar: Any,
    -101) -> BatchRankingSummary:
    -102    """Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic"""
    -103
    -104    all_columns = input_dataframe.columns
    -105
    -106    triplets = []
    -107    tmp_df = input_dataframe.copy().astype('category')
    -108    out_time_struct = {}
    +            
    105def mixed_rank_graph(
    +106    input_dataframe: pd.DataFrame, args: Any, cpu_pool: Any, pbar: Any,
    +107) -> BatchRankingSummary:
    +108    """Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic"""
     109
    -110    # Handle cont. types prior to interaction evaluation
    -111    pbar.set_description('Encoding columns')
    -112    start_enc_timer = timer()
    -113    tmp_df = pd.DataFrame({k : tmp_df[k].cat.codes for k in all_columns})
    -114
    -115    end_enc_timer = timer()
    -116    out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer
    -117
    -118    combinations = get_combinations_from_columns(all_columns, args)
    -119    combinations = prior_combinations_sample(combinations, args)
    -120    random.shuffle(combinations)
    -121
    -122    if args.heuristic == 'Constant':
    -123        final_constant_imp = []
    -124        for c1, c2 in combinations:
    -125            final_constant_imp.append((c1, c2, 0.0))
    -126
    -127        out_time_struct['feature_score_computation'] = end_enc_timer - \
    -128            start_enc_timer
    -129        return BatchRankingSummary(final_constant_imp, out_time_struct)
    +110    all_columns = input_dataframe.columns
    +111
    +112    triplets = []
    +113    tmp_df = input_dataframe.copy().astype('category')
    +114    out_time_struct = {}
    +115
    +116    # Handle cont. types prior to interaction evaluation
    +117    pbar.set_description('Encoding columns')
    +118    start_enc_timer = timer()
    +119    tmp_df = pd.DataFrame({k : tmp_df[k].cat.codes for k in all_columns})
    +120
    +121    end_enc_timer = timer()
    +122    out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer
    +123
    +124    combinations = get_combinations_from_columns(all_columns, args)
    +125
    +126    reference_model_features = {}
    +127    if is_prior_heuristic(args):
    +128        reference_model_features = [(' AND ').join(tuple(sorted(item.split(',')))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)]
    +129        combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features]
     130
    -131    # Map the scoring calls to the worker pool
    -132    pbar.set_description('Allocating thread pool')
    +131    combinations = prior_combinations_sample(combinations, args)
    +132    random.shuffle(combinations)
     133
    -134    # starmap is an alternative that is slower unfortunately (but nicer)
    -135    def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
    -136        return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
    -137
    -138    start_enc_timer = timer()
    -139    with cpu_pool as p:
    -140        pbar.set_description(f'Computing (#ftr={len(combinations)})')
    -141        results = p.amap(get_grounded_importances_estimate, combinations)
    -142        while not results.ready():
    -143            time.sleep(4)
    -144        triplets = results.get()
    -145    end_enc_timer = timer()
    -146    out_time_struct['feature_score_computation'] = end_enc_timer - \
    -147        start_enc_timer
    -148
    -149    # Gather the final triplets
    -150    pbar.set_description('Aggregation of ranking results')
    -151    final_triplets = []
    -152    for triplet in triplets:
    -153        inv = (triplet[1], triplet[0], triplet[2])
    -154        final_triplets.append(inv)
    -155        final_triplets.append(triplet)
    -156        triplets = final_triplets
    -157
    -158    pbar.set_description('Proceeding to the next batch of data')
    -159    return BatchRankingSummary(triplets, out_time_struct)
    +134    if args.heuristic == 'Constant':
    +135        final_constant_imp = []
    +136        for c1, c2 in combinations:
    +137            final_constant_imp.append((c1, c2, 0.0))
    +138
    +139        out_time_struct['feature_score_computation'] = end_enc_timer - \
    +140            start_enc_timer
    +141        return BatchRankingSummary(final_constant_imp, out_time_struct)
    +142
    +143    # Map the scoring calls to the worker pool
    +144    pbar.set_description('Allocating thread pool')
    +145
    +146    # starmap is an alternative that is slower unfortunately (but nicer)
    +147    def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
    +148        return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)
    +149
    +150    start_enc_timer = timer()
    +151    with cpu_pool as p:
    +152        pbar.set_description(f'Computing (#ftr={len(combinations)})')
    +153        results = p.amap(get_grounded_importances_estimate, combinations)
    +154        while not results.ready():
    +155            time.sleep(4)
    +156        triplets = results.get()
    +157    end_enc_timer = timer()
    +158    out_time_struct['feature_score_computation'] = end_enc_timer - \
    +159        start_enc_timer
    +160
    +161    # Gather the final triplets
    +162    pbar.set_description('Aggregation of ranking results')
    +163    final_triplets = []
    +164    for triplet in triplets:
    +165        inv = (triplet[1], triplet[0], triplet[2])
    +166        final_triplets.append(inv)
    +167        final_triplets.append(triplet)
    +168        triplets = final_triplets
    +169
    +170    pbar.set_description('Proceeding to the next batch of data')
    +171    return BatchRankingSummary(triplets, out_time_struct)
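
The encoding step in the listing relies on pandas categorical codes; a self-contained sketch of just that trick, on toy columns:

    import pandas as pd

    df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'M', 'S']})
    tmp_df = df.astype('category')
    encoded = pd.DataFrame({k: tmp_df[k].cat.codes for k in df.columns})
    print(encoded['color'].tolist())  # [1, 0, 1] - integer codes replace raw values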
     
    @@ -1149,7 +1157,7 @@

    def enrich_with_transformations( input_dataframe: pandas.core.frame.DataFrame, num_col_types: set[str], logger: Any, args: Any) -> pandas.core.frame.DataFrame:
    @@ -1157,20 +1165,20 @@

    -
    162def enrich_with_transformations(
    -163    input_dataframe: pd.DataFrame, num_col_types: set[str], logger: Any, args: Any,
    -164) -> pd.DataFrame:
    -165    """Construct a collection of new features based on pre-defined transformations/rules"""
    -166
    -167    transformer = FeatureTransformerGeneric(
    -168        num_col_types, preset=args.transformers,
    -169    )
    -170    transformed_df = transformer.construct_new_features(input_dataframe)
    -171    logger.info(
    -172        f'Constructed {len(transformer.constructed_feature_names)} new features ..',
    -173    )
    -174
    -175    return transformed_df
    +            
    174def enrich_with_transformations(
    +175    input_dataframe: pd.DataFrame, num_col_types: set[str], logger: Any, args: Any,
    +176) -> pd.DataFrame:
    +177    """Construct a collection of new features based on pre-defined transformations/rules"""
    +178
    +179    transformer = FeatureTransformerGeneric(
    +180        num_col_types, preset=args.transformers,
    +181    )
    +182    transformed_df = transformer.construct_new_features(input_dataframe)
    +183    logger.info(
    +184        f'Constructed {len(transformer.constructed_feature_names)} new features ..',
    +185    )
    +186
    +187    return transformed_df
     
    @@ -1182,67 +1190,72 @@

    def
-   compute_combined_features( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any, pbar: Any, is_3mr: bool = False) -> pandas.core.frame.DataFrame:
+   compute_combined_features( input_dataframe: pandas.core.frame.DataFrame, args: Any, pbar: Any, is_3mr: bool = False) -> pandas.core.frame.DataFrame:
    -
    178def compute_combined_features(
    -179    input_dataframe: pd.DataFrame,
    -180    logger: Any,
    -181    args: Any,
    -182    pbar: Any,
    -183    is_3mr: bool = False,
    -184) -> pd.DataFrame:
    -185    """Compute higher order features via xxhash-based trick."""
    -186
    -187    all_columns = [
    -188        x for x in input_dataframe.columns if x != args.label_column
    -189    ]
    -190    join_string = ' AND_REL ' if is_3mr else ' AND '
    -191    interaction_order = 2 if is_3mr else args.interaction_order
    -192
    -193    if args.reference_model_JSON != '':
    -194        combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
    -195        full_combination_space = [combination.split(',') for combination in combined_features]
    -196    else:
    -197        full_combination_space = list(
    -198            itertools.combinations(all_columns, interaction_order),
    -199        )
    -200
    -201    if args.combination_number_upper_bound and args.reference_model_JSON != '':
    -202        random.shuffle(full_combination_space)
    -203        full_combination_space = full_combination_space[
    -204            : args.combination_number_upper_bound
    -205        ]
    +            
    190def compute_combined_features(
    +191    input_dataframe: pd.DataFrame,
    +192    args: Any,
    +193    pbar: Any,
    +194    is_3mr: bool = False,
    +195) -> pd.DataFrame:
    +196    """Compute higher order features via xxhash-based trick."""
    +197
    +198    all_columns = [
    +199        x for x in input_dataframe.columns if x != args.label_column
    +200    ]
    +201    join_string = ' AND_REL ' if is_3mr else ' AND '
    +202    interaction_order = 2 if is_3mr else args.interaction_order
    +203
    +204    model_combinations = []
    +205    full_combination_space = []
     206
    -207    com_counter = 0
    -208    new_feature_hash = {}
    -209    for new_combination in full_combination_space:
    -210        pbar.set_description(
    -211            f'Created {com_counter}/{len(full_combination_space)}',
    -212        )
    -213        combined_feature: list[str] = [str(0)] * input_dataframe.shape[0]
    -214        for feature in new_combination:
    -215            tmp_feature = input_dataframe[feature].tolist()
    -216            for enx, el in enumerate(tmp_feature):
    -217                combined_feature[enx] = str(
    -218                    internal_hash(
    -219                        str(combined_feature[enx]) + str(el),
    -220                    ),
    -221                )
    -222        ftr_name = join_string.join(str(x) for x in new_combination)
    -223        new_feature_hash[ftr_name] = combined_feature
    -224        com_counter += 1
    -225    tmp_df = pd.DataFrame(new_feature_hash)
    -226    pbar.set_description('Concatenating into final frame ..')
    -227    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
    -228    del tmp_df
    -229
    -230    return input_dataframe
    +207
    +208    if args.interaction_order > 1:
    +209            full_combination_space = list(
    +210                itertools.combinations(all_columns, interaction_order),
    +211            )
    +212    full_combination_space = prior_combinations_sample(full_combination_space, args)
    +213
    +214    if args.reference_model_JSON != '':
    +215        model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
    +216        model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]
    +217        if not is_prior_heuristic(args):
    +218            full_combination_space = model_combinations
    +219
    +220    if is_prior_heuristic(args):
    +221        full_combination_space = full_combination_space + [comb for comb in model_combinations if comb not in full_combination_space]
    +222
    +223
    +224    com_counter = 0
    +225    new_feature_hash = {}
    +226    for new_combination in full_combination_space:
    +227        pbar.set_description(
    +228            f'Created {com_counter}/{len(full_combination_space)}',
    +229        )
    +230        combined_feature: list[str] = [str(0)] * input_dataframe.shape[0]
    +231        for feature in new_combination:
    +232            tmp_feature = input_dataframe[feature].tolist()
    +233            for enx, el in enumerate(tmp_feature):
    +234                combined_feature[enx] = str(
    +235                    internal_hash(
    +236                        str(combined_feature[enx]) + str(el),
    +237                    ),
    +238                )
    +239        ftr_name = join_string.join(str(x) for x in new_combination)
    +240        new_feature_hash[ftr_name] = combined_feature
    +241        com_counter += 1
    +242    tmp_df = pd.DataFrame(new_feature_hash)
    +243    pbar.set_description('Concatenating into final frame ..')
    +244    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
    +245    del tmp_df
    +246
    +247    return input_dataframe
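
The inner hashing loop folds one column at a time into an accumulated per-row surrogate value. A standalone sketch of the idea, with hashlib standing in for outrank's xxhash-based internal_hash:

    import hashlib

    def toy_hash(value: str) -> str:
        # Illustrative stand-in for internal_hash, not the actual implementation
        return hashlib.md5(value.encode()).hexdigest()[:8]

    col_a, col_b = ['x', 'y'], ['1', '2']
    combined = ['0', '0']
    for feature in (col_a, col_b):
        for enx, el in enumerate(feature):
            combined[enx] = toy_hash(str(combined[enx]) + str(el))
    print(combined)  # one surrogate value per row for the combined feature 'a AND b'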
     
    @@ -1254,7 +1267,7 @@

    def compute_expanded_multivalue_features( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any, pbar: Any) -> pandas.core.frame.DataFrame:
    @@ -1262,48 +1275,48 @@

    -
    233def compute_expanded_multivalue_features(
    -234    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
    -235) -> pd.DataFrame:
    -236    """Compute one-hot encoded feature space based on each designated multivalue feature. E.g., feature with value "a,b,c" becomes three features, values of which are presence of a given value in a mutlivalue feature of choice."""
    -237
    -238    considered_multivalue_features = args.explode_multivalue_features.split(
    -239        ';',
    -240    )
    -241    new_feature_hash = {}
    -242    missing_symbols = set(args.missing_value_symbols.split(','))
    -243
    -244    for multivalue_feature in considered_multivalue_features:
    -245        multivalue_feature_vector = input_dataframe[multivalue_feature].values.tolist(
    -246        )
    -247        multivalue_feature_vector = [
    -248            x.replace(',', '-') for x in multivalue_feature_vector
    -249        ]
    -250        multivalue_sets = [
    -251            set(x.split('-'))
    -252            for x in multivalue_feature_vector
    -253        ]
    -254        unique_values = set.union(*multivalue_sets)
    -255
    -256        for missing_symbol in missing_symbols:
    -257            if missing_symbol in unique_values:
    -258                unique_values.remove(missing_symbol)
    -259
    -260        for unique_value in unique_values:
    -261            tmp_vec = []
    -262            for enx, multivalue in enumerate(multivalue_sets):
    -263                if unique_value in multivalue:
    -264                    tmp_vec.append('1')
    -265                else:
    -266                    tmp_vec.append('')
    -267
    -268            new_feature_hash[f'MULTIEX-{multivalue_feature}-{unique_value}'] = tmp_vec
    -269
    -270    tmp_df = pd.DataFrame(new_feature_hash)
    -271    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
    -272    del tmp_df
    -273
    -274    return input_dataframe
    +            
    250def compute_expanded_multivalue_features(
    +251    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
    +252) -> pd.DataFrame:
    +253    """Compute one-hot encoded feature space based on each designated multivalue feature. E.g., a feature with value "a,b,c" becomes three features, each indicating the presence of a given value in the multivalue feature of choice."""
    +254
    +255    considered_multivalue_features = args.explode_multivalue_features.split(
    +256        ';',
    +257    )
    +258    new_feature_hash = {}
    +259    missing_symbols = set(args.missing_value_symbols.split(','))
    +260
    +261    for multivalue_feature in considered_multivalue_features:
    +262        multivalue_feature_vector = input_dataframe[multivalue_feature].values.tolist(
    +263        )
    +264        multivalue_feature_vector = [
    +265            x.replace(',', '-') for x in multivalue_feature_vector
    +266        ]
    +267        multivalue_sets = [
    +268            set(x.split('-'))
    +269            for x in multivalue_feature_vector
    +270        ]
    +271        unique_values = set.union(*multivalue_sets)
    +272
    +273        for missing_symbol in missing_symbols:
    +274            if missing_symbol in unique_values:
    +275                unique_values.remove(missing_symbol)
    +276
    +277        for unique_value in unique_values:
    +278            tmp_vec = []
    +279            for enx, multivalue in enumerate(multivalue_sets):
    +280                if unique_value in multivalue:
    +281                    tmp_vec.append('1')
    +282                else:
    +283                    tmp_vec.append('')
    +284
    +285            new_feature_hash[f'MULTIEX-{multivalue_feature}-{unique_value}'] = tmp_vec
    +286
    +287    tmp_df = pd.DataFrame(new_feature_hash)
    +288    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
    +289    del tmp_df
    +290
    +291    return input_dataframe
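
A compact sketch of the same expansion on a toy multivalue column (here called 'tags'; the '1'/'' encoding mirrors the function above):

    import pandas as pd

    values = ['a,b', 'b,c', 'a']
    multivalue_sets = [set(v.split(',')) for v in values]
    unique_values = set.union(*multivalue_sets)
    one_hot = {
        f'MULTIEX-tags-{u}': ['1' if u in row else '' for row in multivalue_sets]
        for u in sorted(unique_values)
    }
    print(pd.DataFrame(one_hot))  # three indicator columns: a, b, c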
     
    @@ -1315,7 +1328,7 @@

    def compute_subfeatures( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any, pbar: Any) -> pandas.core.frame.DataFrame:
    @@ -1323,85 +1336,85 @@

    -
    277def compute_subfeatures(
    -278    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
    -279) -> pd.DataFrame:
    -280    """Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.
    -281    ->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered.
    -282    <->: Two sided construction - two-sided values present. This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded)
    -283    """
    -284
    -285    all_subfeature_pair_seeds = args.subfeature_mapping.split(';')
    -286    new_feature_hash = dict()
    -287
    -288    for seed_pair in all_subfeature_pair_seeds:
    -289        if '<->' in seed_pair:
    -290            feature_first, feature_second = seed_pair.split('<->')
    -291
    -292        elif '->' in seed_pair:
    -293            feature_first, feature_second = seed_pair.split('->')
    -294
    -295        else:
    -296            raise NotImplementedError(
    -297                'Please specify valid subfeature operator (<-> or ->)',
    -298            )
    -299
    -300        subframe = input_dataframe[[feature_first, feature_second]]
    -301        unique_feature_second = subframe[feature_second].unique()
    -302        feature_first_vec = subframe[feature_first].tolist()
    -303        feature_second_vec = subframe[feature_second].tolist()
    -304        out_template_feature = [
    -305            (a, b) for a, b in zip(feature_first_vec, feature_second_vec)
    -306        ]
    -307
    -308        if '<->' in seed_pair:
    -309            unique_feature_first = subframe[feature_first].unique()
    -310
    -311            mask_types = []
    -312            for unique_target_feature_value in unique_feature_second:
    -313                for unique_seed_feature_value in unique_feature_first:
    -314                    mask_types.append(
    -315                        (unique_seed_feature_value, unique_target_feature_value),
    -316                    )
    -317
    -318            for mask_type in mask_types:
    -319                new_feature = []
    -320                for value_tuple in out_template_feature:
    -321                    if (
    -322                        value_tuple[0] == mask_type[0]
    -323                        and value_tuple[1] == mask_type[1]
    -324                    ):
    -325                        new_feature.append(str(1))
    -326                    else:
    -327                        new_feature.append(str(0))
    -328                feature_name = (
    -329                    f'SUBFEATURE|{feature_first}|{feature_second}-'
    -330                    + mask_type[0]
    -331                    + '&'
    -332                    + mask_type[1]
    -333                )
    -334                new_feature_hash[feature_name] = new_feature
    -335
    -336            del new_feature
    -337
    -338        elif '->' in seed_pair:
    -339            for unique_target_feature_value in unique_feature_second:
    -340                tmp_new_feature = [
    -341                    'AND'.join(
    -342                        x,
    -343                    ) if x[1] == unique_target_feature_value else ''
    -344                    for x in out_template_feature
    -345                ]
    -346                feature_name_final = (
    -347                    'SUBFEATURE-' + feature_first + '&' + unique_target_feature_value
    -348                )
    -349                new_feature_hash[feature_name_final] = tmp_new_feature
    -350
    -351    tmp_df = pd.DataFrame(new_feature_hash)
    -352    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
    -353
    -354    del tmp_df
    -355    return input_dataframe
    +            
    294def compute_subfeatures(
    +295    input_dataframe: pd.DataFrame, logger: Any, args: Any, pbar: Any,
    +296) -> pd.DataFrame:
    +297    """Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.
    +298    ->: One sided construction - every value from left side is fine, separate ones from the right side feature will be considered.
    +299    <->: Two sided construction - two-sided values present. This means that each value from a is combined with each from b, forming |A|*|B| new features (one-hot encoded)
    +300    """
    +301
    +302    all_subfeature_pair_seeds = args.subfeature_mapping.split(';')
    +303    new_feature_hash = dict()
    +304
    +305    for seed_pair in all_subfeature_pair_seeds:
    +306        if '<->' in seed_pair:
    +307            feature_first, feature_second = seed_pair.split('<->')
    +308
    +309        elif '->' in seed_pair:
    +310            feature_first, feature_second = seed_pair.split('->')
    +311
    +312        else:
    +313            raise NotImplementedError(
    +314                'Please specify valid subfeature operator (<-> or ->)',
    +315            )
    +316
    +317        subframe = input_dataframe[[feature_first, feature_second]]
    +318        unique_feature_second = subframe[feature_second].unique()
    +319        feature_first_vec = subframe[feature_first].tolist()
    +320        feature_second_vec = subframe[feature_second].tolist()
    +321        out_template_feature = [
    +322            (a, b) for a, b in zip(feature_first_vec, feature_second_vec)
    +323        ]
    +324
    +325        if '<->' in seed_pair:
    +326            unique_feature_first = subframe[feature_first].unique()
    +327
    +328            mask_types = []
    +329            for unique_target_feature_value in unique_feature_second:
    +330                for unique_seed_feature_value in unique_feature_first:
    +331                    mask_types.append(
    +332                        (unique_seed_feature_value, unique_target_feature_value),
    +333                    )
    +334
    +335            for mask_type in mask_types:
    +336                new_feature = []
    +337                for value_tuple in out_template_feature:
    +338                    if (
    +339                        value_tuple[0] == mask_type[0]
    +340                        and value_tuple[1] == mask_type[1]
    +341                    ):
    +342                        new_feature.append(str(1))
    +343                    else:
    +344                        new_feature.append(str(0))
    +345                feature_name = (
    +346                    f'SUBFEATURE|{feature_first}|{feature_second}-'
    +347                    + mask_type[0]
    +348                    + '&'
    +349                    + mask_type[1]
    +350                )
    +351                new_feature_hash[feature_name] = new_feature
    +352
    +353            del new_feature
    +354
    +355        elif '->' in seed_pair:
    +356            for unique_target_feature_value in unique_feature_second:
    +357                tmp_new_feature = [
    +358                    'AND'.join(
    +359                        x,
    +360                    ) if x[1] == unique_target_feature_value else ''
    +361                    for x in out_template_feature
    +362                ]
    +363                feature_name_final = (
    +364                    'SUBFEATURE-' + feature_first + '&' + unique_target_feature_value
    +365                )
    +366                new_feature_hash[feature_name_final] = tmp_new_feature
    +367
    +368    tmp_df = pd.DataFrame(new_feature_hash)
    +369    input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
    +370
    +371    del tmp_df
    +372    return input_dataframe
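
For intuition, the one-sided '->' operator on toy value pairs; the two-sided '<->' variant would instead emit a 0/1 column for every (left, right) value pair:

    pairs = [('a', 'x'), ('b', 'x'), ('a', 'y')]
    for target in sorted({b for _, b in pairs}):
        new_feature = ['AND'.join(p) if p[1] == target else '' for p in pairs]
        print(target, new_feature)
    # x ['aANDx', 'bANDx', '']
    # y ['', '', 'aANDy']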
     
    @@ -1415,7 +1428,7 @@

    def include_noisy_features( input_dataframe: pandas.core.frame.DataFrame, logger: Any, args: Any) -> pandas.core.frame.DataFrame:
    @@ -1423,17 +1436,17 @@

    -
    358def include_noisy_features(
    -359    input_dataframe: pd.DataFrame, logger: Any, args: Any,
    -360) -> pd.DataFrame:
    -361    """Add randomized features that serve as a sanity check"""
    -362
    -363    transformer = FeatureTransformerNoise()
    -364    transformed_df = transformer.construct_new_features(
    -365        input_dataframe, args.label_column,
    -366    )
    -367
    -368    return transformed_df
    +            
    375def include_noisy_features(
    +376    input_dataframe: pd.DataFrame, logger: Any, args: Any,
    +377) -> pd.DataFrame:
    +378    """Add randomized features that serve as a sanity check"""
    +379
    +380    transformer = FeatureTransformerNoise()
    +381    transformed_df = transformer.construct_new_features(
    +382        input_dataframe, args.label_column,
    +383    )
    +384
    +385    return transformed_df
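
The transformer internals live in FeatureTransformerNoise; conceptually it appends features with no real signal, so any genuine feature ranked below them is suspect. A toy version of the idea only (the column name is invented, not the library's):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)
    df = pd.DataFrame({'f1': ['a', 'b', 'a', 'b'], 'label': ['1', '0', '1', '0']})
    # A pure-noise baseline column carrying no information about the label
    df['NOISE-random'] = rng.integers(0, 2, size=len(df)).astype(str)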
     
    @@ -1445,7 +1458,7 @@

    def compute_coverage( input_dataframe: pandas.core.frame.DataFrame, args: Any) -> dict[str, set[str]]:
    @@ -1453,23 +1466,23 @@

    -
    371def compute_coverage(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
    -372    """Compute coverage of features, incrementally"""
    -373    output_storage_cov = defaultdict(set)
    -374    all_missing_symbols = set(args.missing_value_symbols.split(','))
    -375    for column in input_dataframe:
    -376        all_missing = sum(
    -377            [
    -378                input_dataframe[column].values.tolist().count(x)
    -379                for x in all_missing_symbols
    -380            ],
    -381        )
    -382
    -383        output_storage_cov[column] = (
    -384            1 - (all_missing / input_dataframe.shape[0])
    -385        ) * 100
    -386
    -387    return output_storage_cov
    +            
    388def compute_coverage(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
    +389    """Compute coverage of features, incrementally"""
    +390    output_storage_cov = defaultdict(set)
    +391    all_missing_symbols = set(args.missing_value_symbols.split(','))
    +392    for column in input_dataframe:
    +393        all_missing = sum(
    +394            [
    +395                input_dataframe[column].values.tolist().count(x)
    +396                for x in all_missing_symbols
    +397            ],
    +398        )
    +399
    +400        output_storage_cov[column] = (
    +401            1 - (all_missing / input_dataframe.shape[0])
    +402        ) * 100
    +403
    +404    return output_storage_cov
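
Per column this reduces to one percentage; e.g., with two assumed missing-value symbols:

    missing_symbols = {'', 'None'}
    column = ['a', '', 'b', 'None', 'c']
    all_missing = sum(column.count(symbol) for symbol in missing_symbols)
    coverage = (1 - all_missing / len(column)) * 100
    print(coverage)  # 60.0 - three of the five values are present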
     
    @@ -1481,7 +1494,7 @@

    def compute_feature_memory_consumption( input_dataframe: pandas.core.frame.DataFrame, args: Any) -> dict[str, set[str]]:
    @@ -1489,19 +1502,19 @@

    -
    390def compute_feature_memory_consumption(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
    -391    """An approximation of how much feature take up"""
    -392    output_storage_features = defaultdict(set)
    -393    for col in input_dataframe.columns:
    -394        specific_column = [
    -395            str(x).strip() for x in input_dataframe[col].astype(str).values.tolist()
    -396        ]
    -397        col_size = sum(
    -398            len(x.encode())
    -399            for x in specific_column
    -400        ) / input_dataframe.shape[0]
    -401        output_storage_features[col] = col_size
    -402    return output_storage_features
    +            
    407def compute_feature_memory_consumption(input_dataframe: pd.DataFrame, args: Any) -> dict[str, set[str]]:
    +408    """An approximation of how much memory features take up"""
    +409    output_storage_features = defaultdict(set)
    +410    for col in input_dataframe.columns:
    +411        specific_column = [
    +412            str(x).strip() for x in input_dataframe[col].astype(str).values.tolist()
    +413        ]
    +414        col_size = sum(
    +415            len(x.encode())
    +416            for x in specific_column
    +417        ) / input_dataframe.shape[0]
    +418        output_storage_features[col] = col_size
    +419    return output_storage_features
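
The estimate is simply the mean UTF-8 byte length of the stringified, stripped values in a column, e.g.:

    column = ['short', 'a bit longer', 'x']
    avg_bytes = sum(len(str(v).strip().encode()) for v in column) / len(column)
    print(avg_bytes)  # 6.0 bytes per value on average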
     
    @@ -1513,7 +1526,7 @@

    def compute_value_counts(input_dataframe: pandas.core.frame.DataFrame, args: Any):
    @@ -1521,24 +1534,24 @@

    -
    405def compute_value_counts(input_dataframe: pd.DataFrame, args: Any):
    -406    """Update the count structure"""
    -407
    -408    global GLOBAL_RARE_VALUE_STORAGE
    -409    global IGNORED_VALUES
    -410
    -411    for column in input_dataframe.columns:
    -412        main_values = input_dataframe[column].values
    -413        for value in main_values:
    -414            if value not in IGNORED_VALUES:
    -415                GLOBAL_RARE_VALUE_STORAGE.update({(column, value): 1})
    -416
    -417    for key, val in GLOBAL_RARE_VALUE_STORAGE.items():
    -418        if val > args.rare_value_count_upper_bound:
    -419            IGNORED_VALUES.add(key)
    -420
    -421    for to_remove_val in IGNORED_VALUES:
    -422        del GLOBAL_RARE_VALUE_STORAGE[to_remove_val]
    +            
    422def compute_value_counts(input_dataframe: pd.DataFrame, args: Any):
    +423    """Update the count structure"""
    +424
    +425    global GLOBAL_RARE_VALUE_STORAGE
    +426    global IGNORED_VALUES
    +427
    +428    for column in input_dataframe.columns:
    +429        main_values = input_dataframe[column].values
    +430        for value in main_values:
    +431            if value not in IGNORED_VALUES:
    +432                GLOBAL_RARE_VALUE_STORAGE.update({(column, value): 1})
    +433
    +434    for key, val in GLOBAL_RARE_VALUE_STORAGE.items():
    +435        if val > args.rare_value_count_upper_bound:
    +436            IGNORED_VALUES.add(key)
    +437
    +438    for to_remove_val in IGNORED_VALUES:
    +439        del GLOBAL_RARE_VALUE_STORAGE[to_remove_val]
     
    @@ -1550,7 +1563,7 @@

    def compute_cardinalities( input_dataframe: pandas.core.frame.DataFrame, pbar: Any, max_unique_hist_constraint: int) -> None:
    @@ -1558,32 +1571,32 @@

    -
    425def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any, max_unique_hist_constraint: int) -> None:
    -426    """Compute cardinalities of features, incrementally"""
    -427
    -428    global GLOBAL_CARDINALITY_STORAGE
    -429    output_storage_card = defaultdict(set)
    -430    for enx, column in enumerate(input_dataframe):
    -431        output_storage_card[column] = set(input_dataframe[column].unique())
    -432        if column not in GLOBAL_CARDINALITY_STORAGE:
    -433            GLOBAL_CARDINALITY_STORAGE[column] = HyperLogLog(
    -434                HYPERLL_ERROR_BOUND,
    -435            )
    -436
    -437        if column not in GLOBAL_COUNTS_STORAGE:
    -438            GLOBAL_COUNTS_STORAGE[column] = PrimitiveConstrainedCounter(max_unique_hist_constraint)
    -439
    -440        [GLOBAL_COUNTS_STORAGE[column].add(value) for value in input_dataframe[column].values]
    -441
    -442        for unique_value in set(input_dataframe[column].unique()):
    -443            if unique_value:
    -444                GLOBAL_CARDINALITY_STORAGE[column].add(
    -445                    internal_hash(unique_value),
    -446                )
    -447
    -448        pbar.set_description(
    -449            f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
    -450        )
    +            
    442def compute_cardinalities(input_dataframe: pd.DataFrame, pbar: Any, max_unique_hist_constraint: int) -> None:
    +443    """Compute cardinalities of features, incrementally"""
    +444
    +445    global GLOBAL_CARDINALITY_STORAGE
    +446    output_storage_card = defaultdict(set)
    +447    for enx, column in enumerate(input_dataframe):
    +448        output_storage_card[column] = set(input_dataframe[column].unique())
    +449        if column not in GLOBAL_CARDINALITY_STORAGE:
    +450            GLOBAL_CARDINALITY_STORAGE[column] = HyperLogLog(
    +451                HYPERLL_ERROR_BOUND,
    +452            )
    +453
    +454        if column not in GLOBAL_COUNTS_STORAGE:
    +455            GLOBAL_COUNTS_STORAGE[column] = PrimitiveConstrainedCounter(max_unique_hist_constraint)
    +456
    +457        [GLOBAL_COUNTS_STORAGE[column].add(value) for value in input_dataframe[column].values]
    +458
    +459        for unique_value in set(input_dataframe[column].unique()):
    +460            if unique_value:
    +461                GLOBAL_CARDINALITY_STORAGE[column].add(
    +462                    internal_hash(unique_value),
    +463                )
    +464
    +465        pbar.set_description(
    +466            f'Computing cardinality (Hyperloglog update) {enx}/{input_dataframe.shape[1]}',
    +467        )
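
The HyperLogLog sketch bounds memory while counting distinct values across batches. Assuming an add/len-style interface like the one used above, an exact (memory-hungry) stand-in shows the incremental usage pattern:

    class ExactCounter:
        # Stand-in for the approximate HyperLogLog sketch: exact but unbounded in memory
        def __init__(self):
            self._seen = set()
        def add(self, value):
            self._seen.add(value)
        def __len__(self):
            return len(self._seen)

    card = ExactCounter()
    for batch in (['a', 'b'], ['b', 'c']):
        for value in batch:
            card.add(value)
    print(len(card))  # 3 distinct values seen across both batches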
     
    @@ -1595,7 +1608,7 @@

    def compute_bounds_increment( input_dataframe: pandas.core.frame.DataFrame, numeric_column_types: set[str]) -> dict[str, typing.Any]:
    @@ -1603,46 +1616,46 @@

    -
    453def compute_bounds_increment(
    -454    input_dataframe: pd.DataFrame, numeric_column_types: set[str],
    -455) -> dict[str, Any]:
    -456    all_features = input_dataframe.columns
    -457    numeric_column_types = set(numeric_column_types)
    -458    summary_object = {}
    -459    summary_storage: Any = {}
    -460    for feature in all_features:
    -461        if feature in numeric_column_types:
    -462            feature_vector = pd.to_numeric(
    -463                input_dataframe[feature], errors='coerce',
    -464            )
    -465            minimum = np.min(feature_vector)
    -466            maximum = np.max(feature_vector)
    -467            mean = np.mean(feature_vector)
    -468            summary_storage = NumericFeatureSummary(
    -469                feature, minimum, maximum, mean, len(
    -470                    np.unique(feature_vector),
    -471                ),
    -472            )
    -473            summary_object[feature] = summary_storage
    -474
    -475        else:
    -476            feature_vector = input_dataframe[feature].values
    -477            summary_storage = NominalFeatureSummary(
    -478                feature, len(np.unique(feature_vector)),
    -479            )
    -480            summary_object[feature] = summary_storage
    -481
    -482    return summary_object
    +            
    470def compute_bounds_increment(
    +471    input_dataframe: pd.DataFrame, numeric_column_types: set[str],
    +472) -> dict[str, Any]:
    +473    all_features = input_dataframe.columns
    +474    numeric_column_types = set(numeric_column_types)
    +475    summary_object = {}
    +476    summary_storage: Any = {}
    +477    for feature in all_features:
    +478        if feature in numeric_column_types:
    +479            feature_vector = pd.to_numeric(
    +480                input_dataframe[feature], errors='coerce',
    +481            )
    +482            minimum = np.min(feature_vector)
    +483            maximum = np.max(feature_vector)
    +484            mean = np.mean(feature_vector)
    +485            summary_storage = NumericFeatureSummary(
    +486                feature, minimum, maximum, mean, len(
    +487                    np.unique(feature_vector),
    +488                ),
    +489            )
    +490            summary_object[feature] = summary_storage
    +491
    +492        else:
    +493            feature_vector = input_dataframe[feature].values
    +494            summary_storage = NominalFeatureSummary(
    +495                feature, len(np.unique(feature_vector)),
    +496            )
    +497            summary_object[feature] = summary_storage
    +498
    +499    return summary_object
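
The numeric branch leans on errors='coerce', which turns unparseable entries into NaN instead of raising; pandas reductions then skip them by default:

    import numpy as np
    import pandas as pd

    feature_vector = pd.to_numeric(pd.Series(['1', '2', 'oops', '4']), errors='coerce')
    print(np.min(feature_vector), np.max(feature_vector), np.mean(feature_vector))
    # prints 1.0, 4.0 and ~2.33 - the 'oops' entry became NaN and was skipped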
     
    def compute_batch_ranking( line_tmp_storage: list[list[typing.Any]], numeric_column_types: set[str], args: Any, cpu_pool: Any, column_descriptions: list[str], logger: Any, pbar: Any) -> tuple[outrank.core_utils.BatchRankingSummary, dict[str, typing.Any], dict[str, set[str]], dict[str, set[str]]]:
    @@ -1650,97 +1663,97 @@

    -
    485def compute_batch_ranking(
    -486    line_tmp_storage: list[list[Any]],
    -487    numeric_column_types: set[str],
    -488    args: Any,
    -489    cpu_pool: Any,
    -490    column_descriptions: list[str],
    -491    logger: Any,
    -492    pbar: Any,
    -493) -> tuple[BatchRankingSummary, dict[str, Any], dict[str, set[str]], dict[str, set[str]]]:
    -494    """Enrich the feature space and compute the batch importances"""
    -495
    -496    input_dataframe = pd.DataFrame(line_tmp_storage)
    -497    input_dataframe.columns = column_descriptions
    -498    pbar.set_description('Control features')
    -499
    -500    if args.feature_set_focus:
    -501        if args.feature_set_focus == '_all_from_reference_JSON':
    -502            focus_set = extract_features_from_reference_JSON(
    -503                args.reference_model_JSON,
    -504            )
    -505
    -506        else:
    -507            focus_set = set(args.feature_set_focus.split(','))
    -508
    -509        focus_set.add(args.label_column)
    -510        focus_set = {x for x in focus_set if x in input_dataframe.columns}
    -511        input_dataframe = input_dataframe[list(focus_set)]
    +            
    502def compute_batch_ranking(
    +503    line_tmp_storage: list[list[Any]],
    +504    numeric_column_types: set[str],
    +505    args: Any,
    +506    cpu_pool: Any,
    +507    column_descriptions: list[str],
    +508    logger: Any,
    +509    pbar: Any,
    +510) -> tuple[BatchRankingSummary, dict[str, Any], dict[str, set[str]], dict[str, set[str]]]:
    +511    """Enrich the feature space and compute the batch importances"""
     512
    -513    if args.transformers != 'none':
    -514
    -515        pbar.set_description('Adding transformations')
    -516        input_dataframe = enrich_with_transformations(
    -517            input_dataframe, numeric_column_types, logger, args,
    -518        )
    -519
    -520    if args.explode_multivalue_features != 'False':
    -521        pbar.set_description('Constructing new features from multivalue ones')
    -522        input_dataframe = compute_expanded_multivalue_features(
    -523            input_dataframe, logger, args, pbar,
    -524        )
    +513    input_dataframe = pd.DataFrame(line_tmp_storage)
    +514    input_dataframe.columns = column_descriptions
    +515    pbar.set_description('Control features')
    +516
    +517    if args.feature_set_focus:
    +518        if args.feature_set_focus == '_all_from_reference_JSON':
    +519            focus_set = extract_features_from_reference_JSON(
    +520                args.reference_model_JSON,
    +521            )
    +522
    +523        else:
    +524            focus_set = set(args.feature_set_focus.split(','))
     525
    -526    if args.subfeature_mapping != 'False':
    -527        pbar.set_description('Constructing new (sub)features')
    -528        input_dataframe = compute_subfeatures(
    -529            input_dataframe, logger, args, pbar,
    -530        )
    +526        focus_set.add(args.label_column)
    +527        focus_set = {x for x in focus_set if x in input_dataframe.columns}
    +528        input_dataframe = input_dataframe[list(focus_set)]
    +529
    +530    if args.transformers != 'none':
     531
    -532    if args.interaction_order > 1 or args.reference_model_JSON:
    -533        pbar.set_description('Constructing new features')
    -534        input_dataframe = compute_combined_features(
    -535            input_dataframe, logger, args, pbar,
    -536        )
    -537
    -538    # in case of 3mr we compute the score of combinations against the target
    -539    if '3mr' in args.heuristic:
    -540        pbar.set_description(
    -541            'Constructing features for computing relations in 3mr',
    -542        )
    -543        input_dataframe = compute_combined_features(
    -544            input_dataframe, logger, args, pbar, True,
    -545        )
    -546
    -547    if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
    -548        pbar.set_description('Computing baseline features')
    -549        input_dataframe = include_noisy_features(input_dataframe, logger, args)
    -550
    -551    # Compute incremental statistic useful for data inspection/transformer generation
    -552    pbar.set_description('Computing coverage')
    -553    coverage_storage = compute_coverage(input_dataframe, args)
    -554    feature_memory_consumption = compute_feature_memory_consumption(
    -555        input_dataframe, args,
    -556    )
    -557    compute_cardinalities(input_dataframe, pbar, args.max_unique_hist_constraint)
    -558
    -559    if args.task == 'identify_rare_values':
    -560        compute_value_counts(input_dataframe, args)
    -561
    -562    bounds_storage = compute_bounds_increment(
    -563        input_dataframe, numeric_column_types,
    -564    )
    -565
    -566    pbar.set_description(
    -567        f'Computing ranks for {input_dataframe.shape[1]} features',
    -568    )
    -569
    -570    return (
    -571        mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
    -572        bounds_storage,
    -573        coverage_storage,
    -574        feature_memory_consumption,
    -575    )
    +532        pbar.set_description('Adding transformations')
    +533        input_dataframe = enrich_with_transformations(
    +534            input_dataframe, numeric_column_types, logger, args,
    +535        )
    +536
    +537    if args.explode_multivalue_features != 'False':
    +538        pbar.set_description('Constructing new features from multivalue ones')
    +539        input_dataframe = compute_expanded_multivalue_features(
    +540            input_dataframe, logger, args, pbar,
    +541        )
    +542
    +543    if args.subfeature_mapping != 'False':
    +544        pbar.set_description('Constructing new (sub)features')
    +545        input_dataframe = compute_subfeatures(
    +546            input_dataframe, logger, args, pbar,
    +547        )
    +548
    +549    if args.interaction_order > 1 or args.reference_model_JSON:
    +550        pbar.set_description('Constructing new features')
    +551        input_dataframe = compute_combined_features(
    +552            input_dataframe, args, pbar,
    +553        )
    +554
    +555    # in case of 3mr we compute the score of combinations against the target
    +556    if '3mr' in args.heuristic:
    +557        pbar.set_description(
    +558            'Constructing features for computing relations in 3mr',
    +559        )
    +560        input_dataframe = compute_combined_features(
    +561            input_dataframe, args, pbar, True,
    +562        )
    +563
    +564    if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
    +565        pbar.set_description('Computing baseline features')
    +566        input_dataframe = include_noisy_features(input_dataframe, logger, args)
    +567
    +568    # Compute incremental statistic useful for data inspection/transformer generation
    +569    pbar.set_description('Computing coverage')
    +570    coverage_storage = compute_coverage(input_dataframe, args)
    +571    feature_memory_consumption = compute_feature_memory_consumption(
    +572        input_dataframe, args,
    +573    )
    +574    compute_cardinalities(input_dataframe, pbar, args.max_unique_hist_constraint)
    +575
    +576    if args.task == 'identify_rare_values':
    +577        compute_value_counts(input_dataframe, args)
    +578
    +579    bounds_storage = compute_bounds_increment(
    +580        input_dataframe, numeric_column_types,
    +581    )
    +582
    +583    pbar.set_description(
    +584        f'Computing ranks for {input_dataframe.shape[1]} features',
    +585    )
    +586
    +587    return (
    +588        mixed_rank_graph(input_dataframe, args, cpu_pool, pbar),
    +589        bounds_storage,
    +590        coverage_storage,
    +591        feature_memory_consumption,
    +592    )
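Note how the enrichment stages above are gated by CLI flags, several of which are string-valued ('True'/'False') rather than boolean. A self-contained sketch of that gating logic, using a hypothetical args namespace with invented values:

    # Sketch: which enrichment stages would run for a given flag combination.
    from types import SimpleNamespace

    args = SimpleNamespace(
        transformers='none', explode_multivalue_features='False',
        subfeature_mapping='False', interaction_order=2, reference_model_JSON='',
        heuristic='MI-numba-randomized', include_noise_baseline_features='True',
    )
    stages = [
        ('transformations', args.transformers != 'none'),
        ('multivalue expansion', args.explode_multivalue_features != 'False'),
        ('subfeatures', args.subfeature_mapping != 'False'),
        ('combined features', args.interaction_order > 1 or bool(args.reference_model_JSON)),
        ('3mr combinations', '3mr' in args.heuristic),
        ('noise baselines', args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant'),
    ]
    for name, enabled in stages:
        print(f'{name}: {"on" if enabled else "off"}')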
     
    @@ -1748,43 +1761,11 @@

-
-
- def get_num_of_instances(fname: str) -> int:
-578def get_num_of_instances(fname: str) -> int:
    -579    """Count the number of lines in a file, fast - useful for progress logging"""
    -580
    -581    def _make_gen(reader):
    -582        while True:
    -583            b = reader(2**16)
    -584            if not b:
    -585                break
    -586            yield b
    -587
    -588    with open(fname, 'rb') as f:
    -589        count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
    -590    return count
    -

    Count the number of lines in a file, fast - useful for progress logging

def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pandas.core.frame.DataFrame:

@@ -1792,17 +1773,17 @@

    -
-593def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pd.DataFrame:
    -594    """A helper method that enables median-based aggregation after processing"""
    -595
    -596    importances_df = pd.DataFrame(importances_df_list)
    -597    if len(importances_df) == 0:
    -598        return None
    -599    importances_df.columns = ['FeatureA', 'FeatureB', 'Score']
    -600    grouped = importances_df.groupby(
    -601        ['FeatureA', 'FeatureB'],
    -602    ).median().reset_index()
    -603    return grouped
    +            
+595def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pd.DataFrame:
    +596    """A helper method that enables median-based aggregation after processing"""
    +597
    +598    importances_df = pd.DataFrame(importances_df_list)
    +599    if len(importances_df) == 0:
    +600        return None
    +601    importances_df.columns = ['FeatureA', 'FeatureB', 'Score']
    +602    grouped = importances_df.groupby(
    +603        ['FeatureA', 'FeatureB'],
    +604    ).median().reset_index()
    +605    return grouped
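Since the same (FeatureA, FeatureB) pair is scored once per minibatch, the grouping above collapses repeated pairs to their median score. A minimal illustration with invented scores:

    # Toy triplets from two minibatches; the duplicate pair collapses to its median.
    import pandas as pd

    triplets = [('f1', 'label', 0.1), ('f1', 'label', 0.3), ('f2', 'label', 0.2)]
    df = pd.DataFrame(triplets, columns=['FeatureA', 'FeatureB', 'Score'])
    print(df.groupby(['FeatureA', 'FeatureB']).median().reset_index())
    # f1-label scores 0.2 (median of 0.1 and 0.3), f2-label stays at 0.2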
     
    @@ -1814,7 +1795,7 @@

def checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None:

@@ -1822,12 +1803,12 @@

    -
-606def checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None:
    -607    """A helper which stores intermediary state - useful for longer runs"""
    -608
    -609    gdf = get_grouped_df(importances_batch)
    -610    if gdf is not None:
    -611        gdf.to_csv('ranking_checkpoint_tmp.tsv', sep='\t')
    +            
+608def checkpoint_importances_df(importances_batch: list[tuple[str, str, float]]) -> None:
    +609    """A helper which stores intermediary state - useful for longer runs"""
    +610
    +611    gdf = get_grouped_df(importances_batch)
    +612    if gdf is not None:
    +613        gdf.to_csv('ranking_checkpoint_tmp.tsv', sep='\t')
     
    @@ -1839,7 +1820,7 @@

def estimate_importances_minibatches(input_file: str, column_descriptions: list, fw_col_mapping: dict[str, str], numeric_column_types: set, batch_size: int = 100000, args: Any = None, data_encoding: str = 'utf-8', cpu_pool: Any = None, delimiter: str = '\t', feature_construction_mode: bool = False, logger: Any = None) -> tuple[list[dict[str, typing.Any]], typing.Any, dict[typing.Any, typing.Any], list[dict[str, typing.Any]], list[dict[str, set[str]]], collections.defaultdict[str, list[set[str]]], dict[str, typing.Any], dict[str, typing.Any], dict[str, typing.Any]]:

@@ -1847,149 +1828,149 @@

    -
-614def estimate_importances_minibatches(
    -615    input_file: str,
    -616    column_descriptions: list,
    -617    fw_col_mapping: dict[str, str],
    -618    numeric_column_types: set,
    -619    batch_size: int = 100000,
    -620    args: Any = None,
    -621    data_encoding: str = 'utf-8',
    -622    cpu_pool: Any = None,
    -623    delimiter: str = '\t',
    -624    feature_construction_mode: bool = False,
    -625    logger: Any = None,
    -626) -> tuple[list[dict[str, Any]], Any, dict[Any, Any], list[dict[str, Any]], list[dict[str, set[str]]], defaultdict[str, list[set[str]]], dict[str, Any], dict[str, Any], dict[str, Any]]:
    -627    """Interaction score estimator - suitable for example for csv-like input data types.
    -628    This type of data is normally a single large csv, meaning that minibatch processing needs to
    -629    happen during incremental handling of the file (that"s not the case for pre-separated ob data)
    -630    """
    -631
    -632    invalid_line_queue: Any = deque([], maxlen=2**5)
    +            
+616def estimate_importances_minibatches(
    +617    input_file: str,
    +618    column_descriptions: list,
    +619    fw_col_mapping: dict[str, str],
    +620    numeric_column_types: set,
    +621    batch_size: int = 100000,
    +622    args: Any = None,
    +623    data_encoding: str = 'utf-8',
    +624    cpu_pool: Any = None,
    +625    delimiter: str = '\t',
    +626    feature_construction_mode: bool = False,
    +627    logger: Any = None,
    +628) -> tuple[list[dict[str, Any]], Any, dict[Any, Any], list[dict[str, Any]], list[dict[str, set[str]]], defaultdict[str, list[set[str]]], dict[str, Any], dict[str, Any], dict[str, Any]]:
    +629    """Interaction score estimator - suitable for example for csv-like input data types.
    +630    This type of data is normally a single large csv, meaning that minibatch processing needs to
+631    happen during incremental handling of the file (that's not the case for pre-separated ob data)
    +632    """
     633
    -634    invalid_lines = 0
    -635    line_counter = 0
    -636
    -637    importances_df: list[Any] = []
    -638    line_tmp_storage = []
    -639    bounds_storage_batch = []
    -640    memory_storage_batch = []
    -641    step_timing_checkpoints = []
    -642
    -643    local_coverage_object = defaultdict(list)
    -644    local_pbar = tqdm.tqdm(
    -645        total=get_num_of_instances(input_file) - 1, position=0, disable=args.disable_tqdm == 'True',
    -646    )
    -647
    -648    file_name, file_extension = os.path.splitext(input_file)
    +634    invalid_line_queue: Any = deque([], maxlen=2**5)
    +635
    +636    invalid_lines = 0
    +637    line_counter = 0
    +638
    +639    importances_df: list[Any] = []
    +640    line_tmp_storage = []
    +641    bounds_storage_batch = []
    +642    memory_storage_batch = []
    +643    step_timing_checkpoints = []
    +644
    +645    local_coverage_object = defaultdict(list)
    +646    local_pbar = tqdm.tqdm(
    +647        total=get_num_of_instances(input_file) - 1, position=0, disable=args.disable_tqdm == 'True',
    +648    )
     649
    -650    if file_extension == '.gz':
    -651        file_stream = gzip.open(input_file, 'rt', encoding=data_encoding)
    -652
    -653    else:
    -654        file_stream = open(input_file, encoding=data_encoding)
    -655
    -656    file_stream.readline()
    +650    file_name, file_extension = os.path.splitext(input_file)
    +651
    +652    if file_extension == '.gz':
    +653        file_stream = gzip.open(input_file, 'rt', encoding=data_encoding)
    +654
    +655    else:
    +656        file_stream = open(input_file, encoding=data_encoding)
     657
    -658    local_pbar.set_description('Starting ranking computation')
    -659    for line in file_stream:
    -660        line_counter += 1
    -661        local_pbar.update(1)
    -662
    -663        if line_counter % args.subsampling != 0:
    -664            continue
    -665
    -666        parsed_line = generic_line_parser(
    -667            line, delimiter, args, fw_col_mapping, column_descriptions,
    -668        )
    -669
    -670        if len(parsed_line) == len(column_descriptions):
    -671            line_tmp_storage.append(parsed_line)
    -672
    -673        else:
    -674            invalid_line_queue.appendleft(str(parsed_line))
    -675            invalid_lines += 1
    -676
    -677        # Batches need to be processed on-the-fly
    -678        if len(line_tmp_storage) >= args.minibatch_size:
    -679
    -680            importances_batch, bounds_storage, coverage_storage, memory_storage = compute_batch_ranking(
    -681                line_tmp_storage,
    -682                numeric_column_types,
    -683                args,
    -684                cpu_pool,
    -685                column_descriptions,
    -686                logger,
    -687                local_pbar,
    -688            )
    -689
    -690            bounds_storage_batch.append(bounds_storage)
    -691            memory_storage_batch.append(memory_storage)
    -692            for k, v in coverage_storage.items():
    -693                local_coverage_object[k].append(v)
    -694
    -695            del coverage_storage
    +658    file_stream.readline()
    +659
    +660    local_pbar.set_description('Starting ranking computation')
    +661    for line in file_stream:
    +662        line_counter += 1
    +663        local_pbar.update(1)
    +664
    +665        if line_counter % args.subsampling != 0:
    +666            continue
    +667
    +668        parsed_line = generic_line_parser(
    +669            line, delimiter, args, fw_col_mapping, column_descriptions,
    +670        )
    +671
    +672        if len(parsed_line) == len(column_descriptions):
    +673            line_tmp_storage.append(parsed_line)
    +674
    +675        else:
    +676            invalid_line_queue.appendleft(str(parsed_line))
    +677            invalid_lines += 1
    +678
    +679        # Batches need to be processed on-the-fly
    +680        if len(line_tmp_storage) >= args.minibatch_size:
    +681
    +682            importances_batch, bounds_storage, coverage_storage, memory_storage = compute_batch_ranking(
    +683                line_tmp_storage,
    +684                numeric_column_types,
    +685                args,
    +686                cpu_pool,
    +687                column_descriptions,
    +688                logger,
    +689                local_pbar,
    +690            )
    +691
    +692            bounds_storage_batch.append(bounds_storage)
    +693            memory_storage_batch.append(memory_storage)
    +694            for k, v in coverage_storage.items():
    +695                local_coverage_object[k].append(v)
     696
    -697            line_tmp_storage = []
    -698            step_timing_checkpoints.append(importances_batch.step_times)
    -699            importances_df += importances_batch.triplet_scores
    -700
    -701            if args.heuristic != 'Constant':
    -702                local_pbar.set_description('Creating checkpoint')
    -703                checkpoint_importances_df(importances_df)
    -704
    -705    file_stream.close()
    +697            del coverage_storage
    +698
    +699            line_tmp_storage = []
    +700            step_timing_checkpoints.append(importances_batch.step_times)
    +701            importances_df += importances_batch.triplet_scores
    +702
    +703            if args.heuristic != 'Constant':
    +704                local_pbar.set_description('Creating checkpoint')
    +705                checkpoint_importances_df(importances_df)
     706
    -707    local_pbar.set_description('Parsing the remainder')
    -708    if invalid_lines > 0:
    -709        logger.info(
    -710            f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
    -711        )
    -712
    -713        invalid_lines_log = '\n INVALID_LINE ====> '.join(
    -714            list(invalid_line_queue)[0:5],
    -715        )
    -716        logger.info(
    -717            f'5 samples of invalid lines are printed below\n {invalid_lines_log}',
    -718        )
    -719
    -720    remaining_batch_size = len(line_tmp_storage)
    +707    file_stream.close()
    +708
    +709    local_pbar.set_description('Parsing the remainder')
    +710    if invalid_lines > 0:
    +711        logger.info(
    +712            f"Detected {invalid_lines} invalid lines. If this number is very high, it's possible your header is off - re-check your data/attribute-feature mappings please!",
    +713        )
    +714
    +715        invalid_lines_log = '\n INVALID_LINE ====> '.join(
    +716            list(invalid_line_queue)[0:5],
    +717        )
    +718        logger.info(
    +719            f'5 samples of invalid lines are printed below\n {invalid_lines_log}',
    +720        )
     721
    -722    if remaining_batch_size > 2**10:
    -723        line_tmp_storage = line_tmp_storage[: args.minibatch_size]
    -724        importances_batch, bounds_storage, coverage_storage, _ = compute_batch_ranking(
    -725            line_tmp_storage,
    -726            numeric_column_types,
    -727            args,
    -728            cpu_pool,
    -729            column_descriptions,
    -730            logger,
    -731            local_pbar,
    -732        )
    -733
    -734        for k, v in coverage_storage.items():
    -735            local_coverage_object[k].append(v)
    -736
    -737        step_timing_checkpoints.append(importances_batch.step_times)
    -738        importances_df += importances_batch.triplet_scores
    -739        bounds_storage = dict()
    -740        bounds_storage_batch.append(bounds_storage)
    -741        checkpoint_importances_df(importances_df)
    -742
    -743    local_pbar.set_description('Wrapping up')
    -744    local_pbar.close()
    -745
    -746    return (
    -747        step_timing_checkpoints,
    -748        get_grouped_df(importances_df),
    -749        GLOBAL_CARDINALITY_STORAGE.copy(),
    -750        bounds_storage_batch,
    -751        memory_storage_batch,
    -752        local_coverage_object,
    -753        GLOBAL_RARE_VALUE_STORAGE.copy(),
    -754        GLOBAL_PRIOR_COMB_COUNTS.copy(),
    -755        GLOBAL_COUNTS_STORAGE.copy(),
    -756    )
    +722    remaining_batch_size = len(line_tmp_storage)
    +723
    +724    if remaining_batch_size > 2**10:
    +725        line_tmp_storage = line_tmp_storage[: args.minibatch_size]
    +726        importances_batch, bounds_storage, coverage_storage, _ = compute_batch_ranking(
    +727            line_tmp_storage,
    +728            numeric_column_types,
    +729            args,
    +730            cpu_pool,
    +731            column_descriptions,
    +732            logger,
    +733            local_pbar,
    +734        )
    +735
    +736        for k, v in coverage_storage.items():
    +737            local_coverage_object[k].append(v)
    +738
    +739        step_timing_checkpoints.append(importances_batch.step_times)
    +740        importances_df += importances_batch.triplet_scores
    +741        bounds_storage = dict()
    +742        bounds_storage_batch.append(bounds_storage)
    +743        checkpoint_importances_df(importances_df)
    +744
    +745    local_pbar.set_description('Wrapping up')
    +746    local_pbar.close()
    +747
    +748    return (
    +749        step_timing_checkpoints,
    +750        get_grouped_df(importances_df),
    +751        GLOBAL_CARDINALITY_STORAGE.copy(),
    +752        bounds_storage_batch,
    +753        memory_storage_batch,
    +754        local_coverage_object,
    +755        GLOBAL_RARE_VALUE_STORAGE.copy(),
    +756        GLOBAL_PRIOR_COMB_COUNTS.copy(),
    +757        GLOBAL_COUNTS_STORAGE.copy(),
    +758    )
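Structurally, the function above is a buffered streaming loop: skip the header, accumulate parsed lines, flush a ranking batch whenever the buffer reaches args.minibatch_size, and handle the remainder after the loop. A condensed, self-contained sketch of that pattern (toy parser and sizes, none of the OutRank internals):

    # Skeleton of the buffered minibatch loop used above.
    import io

    stream = io.StringIO('header\n' + '\n'.join(f'row{i}\ta' for i in range(10)))
    stream.readline()                      # skip the header line

    minibatch_size, buffer, flushed = 4, [], []
    for line in stream:
        buffer.append(line.rstrip('\n').split('\t'))
        if len(buffer) >= minibatch_size:  # flush: rank the batch, keep its summaries
            flushed.append(len(buffer))
            buffer = []
    if buffer:                             # remainder is ranked after the loop
        flushed.append(len(buffer))
    print(flushed)                         # [4, 4, 2]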
     
    @@ -2183,4 +2164,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/core_selftest.html b/docs/outrank/core_selftest.html
index 47af91f..a1cca67 100644
--- a/docs/outrank/core_selftest.html
+++ b/docs/outrank/core_selftest.html
(regenerated pdoc page for outrank.core_selftest: the title, navigation, and footer hunks carry whitespace churn only)

diff --git a/docs/outrank/core_utils.html b/docs/outrank/core_utils.html
index 21ee6b7..c2dc080 100644
--- a/docs/outrank/core_utils.html
+++ b/docs/outrank/core_utils.html
@@ -174,6 +174,12 @@

    API Documentation

  • summarize_rare_counts
+ • is_prior_heuristic
+ • get_num_of_instances

@@ -590,7 +596,7 @@

(embedded module source of outrank.core_utils, renumbered: extract_features_from_reference_JSON gains the all_features parameter, and summarize_feature_bounds_for_transformers / summarize_rare_counts shift down by three lines - both changes are repeated in the per-function views below; two helpers are appended at the end of the module:)

+649def is_prior_heuristic(args: Any) -> bool:
+650    if '-prior' in args.heuristic and args.reference_model_JSON:
+651        return True
+652    return False
+653
+654
+655def get_num_of_instances(fname: str) -> int:
+656    """Count the number of lines in a file, fast - useful for progress logging"""
+657
+658    def _make_gen(reader):
+659        while True:
+660            b = reader(2**16)
+661            if not b:
+662                break
+663            yield b
+664
+665    with open(fname, 'rb') as f:
+666        count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
+667    return count
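The relocated line counter stays fast on large files because it never decodes the input: it reads raw 64 KiB chunks and counts newline bytes. A small self-check of the same idea (the temporary file and helper name are ours, not OutRank's):

    # Chunked newline counting, as in get_num_of_instances above.
    import tempfile

    def count_lines(fname: str) -> int:
        with open(fname, 'rb') as f:
            return sum(buf.count(b'\n') for buf in iter(lambda: f.raw.read(2**16), b''))

    with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as tmp:
        tmp.write('header\n' + 'a\tb\n' * 1000)
    print(count_lines(tmp.name))  # 1001 -> header plus 1000 data rows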

    @@ -848,17 +878,17 @@

['OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"', 'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).', 'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!', 'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).', 'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)', 'Visualization part also includes clustering - this might be very insightful!', 'By default OutRank includes feature cardinality and coverage in feature names (card; cov)', 'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.', 'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).', 'Give it as many threads as physically possible (--num_threads).', 'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.', 'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).', 'Your target can be any feature! (explaining one feature with others)', 'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).', 'Each feature is named as featureName(cardinality, coverage in percents) in the final files.', 'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.']

def write_json_dump_to_file(args: Any, config_name: str) -> None:

@@ -874,13 +904,13 @@

def internal_hash(input_obj: str) -> str:

@@ -930,80 +960,80 @@
DatasetInformationStorage(data_path: str, column_names: list[str], column_types: set[str], col_delimiter: str | None, encoding: str, fw_map: dict[str, str] | None)

data_path: str
column_names: list[str]
column_types: set[str]
col_delimiter: str | None
encoding: str
fw_map: dict[str, str] | None

    @@ -1037,69 +1067,69 @@

NumericFeatureSummary(feature_name: str, minimum: float, maximum: float, median: float, num_unique: int)

feature_name: str
minimum: float
maximum: float
median: float
num_unique: int

    @@ -1130,36 +1160,36 @@

NominalFeatureSummary(feature_name: str, num_unique: int)

feature_name: str
num_unique: int

    @@ -1190,43 +1220,43 @@

BatchRankingSummary(triplet_scores: list[tuple[str, str, float]], step_times: dict[str, typing.Any])

triplet_scores: list[tuple[str, str, float]]
step_times: dict[str, typing.Any]

def display_random_tip() -> None:

@@ -1246,13 +1276,13 @@

def get_dataset_info(args: Any):

@@ -1281,13 +1311,13 @@

def display_tool_name() -> None:

@@ -1326,13 +1356,13 @@

def parse_ob_line(line_string: str, delimiter: str = '\t', args: Any = None) -> list[str]:

@@ -1359,7 +1389,7 @@

def parse_ob_line_vw(line_string: str, delimiter: str, args: Any = None, fw_col_mapping=None, table_header=None, include_namespace_info=False) -> list[str | None]:

@@ -1419,7 +1449,7 @@

def parse_ob_csv_line(line_string: str, delimiter: str = ',', args: Any = None) -> list[str]:

@@ -1445,7 +1475,7 @@

def generic_line_parser(line_string: str, delimiter: str, args: Any = None, fw_col_mapping: Any = None, table_header: Any = None) -> list[typing.Any]:

@@ -1488,7 +1518,7 @@

def read_reference_json(json_path) -> dict[str, dict]:

@@ -1511,7 +1541,7 @@

def parse_namespace(namespace_path: str) -> tuple[set[str], dict[str, str]]:

@@ -1554,7 +1584,7 @@

def read_column_names(mapping_file: str) -> list[str]:

@@ -1579,7 +1609,7 @@

def parse_ob_vw_feature_information(data_path) -> DatasetInformationStorage:

@@ -1615,7 +1645,7 @@

def parse_ob_raw_feature_information(data_path) -> DatasetInformationStorage:

@@ -1670,7 +1700,7 @@

def parse_ob_feature_information(data_path) -> DatasetInformationStorage:

@@ -1707,7 +1737,7 @@

def parse_csv_with_description_information(data_path) -> DatasetInformationStorage:

@@ -1736,13 +1766,13 @@

def parse_csv_raw(data_path) -> DatasetInformationStorage:

@@ -1765,21 +1795,21 @@

def
-    extract_features_from_reference_JSON(json_path: str, combined_features_only=False) -> set[typing.Any]:
+    extract_features_from_reference_JSON(json_path: str, combined_features_only=False, all_features=False) -> set[typing.Any]:
    -
-397def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
    +            
+397def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, all_features = False) -> set[Any]:
     398    """Given a model's JSON, extract unique features"""
     399
     400    with open(json_path) as jp:
    @@ -1787,17 +1817,20 @@ 

 402
 403    unique_features = set()
 404    feature_space = content['desc'].get('features', [])
-405    fields_space = content['desc'].get('fields', [])
-406    joint_space = feature_space + fields_space
+405    if all_features:
+406        return set(feature_space)
 407
-408    if combined_features_only:
-409        return {feature for feature in feature_space if len(feature.split(','))>1}
+408    fields_space = content['desc'].get('fields', [])
+409    joint_space = feature_space + fields_space
 410
-411    for feature_tuple in joint_space:
-412        for individual_feature in feature_tuple.split(','):
-413            unique_features.add(individual_feature)
-414
-415    return unique_features
+411    if combined_features_only:
+412        return {feature for feature in feature_space if len(feature.split(','))>1}
+413
+414    for feature_tuple in joint_space:
+415        for individual_feature in feature_tuple.split(','):
+416            unique_features.add(individual_feature)
+417
+418    return unique_features
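The three extraction modes are easiest to see on a toy reference JSON (layout invented, matching only the keys the function reads); the commented lines give the expected results, assuming extract_features_from_reference_JSON is imported from outrank.core_utils:

    # Toy model JSON exercising all three extraction modes.
    import json, tempfile

    content = {'desc': {'features': ['f1', 'f2,f3'], 'fields': ['f4']}}
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
        json.dump(content, tmp)

    # extract_features_from_reference_JSON(tmp.name)                               -> {'f1', 'f2', 'f3', 'f4'}
    # extract_features_from_reference_JSON(tmp.name, combined_features_only=True)  -> {'f2,f3'}
    # extract_features_from_reference_JSON(tmp.name, all_features=True)            -> {'f1', 'f2,f3'}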

    @@ -1809,7 +1842,7 @@

    - + def summarize_feature_bounds_for_transformers( bounds_object_storage: Any, feature_types: list[str], task_name: str, label_name: str, granularity: int = 15, output_summary_table_only: bool = False): @@ -1817,178 +1850,178 @@

    -
-418def summarize_feature_bounds_for_transformers(
    -419    bounds_object_storage: Any,
    -420    feature_types: list[str],
    -421    task_name: str,
    -422    label_name: str,
    -423    granularity: int = 15,
    -424    output_summary_table_only: bool = False,
    -425):
    -426    """summarization auxilliary method for generating JSON-based specs"""
    -427
    -428    if bounds_object_storage is None:
    -429        logging.info('Bounds storage object is empty.')
    -430        exit()
    -431
    -432    final_storage = defaultdict(list)
    -433    for el in bounds_object_storage:
    -434        if isinstance(el, dict):
    -435            for k, v in el.items():
    -436                final_storage[k].append(v)
    -437
    -438    summary_table_rows = []
    -439    for k, v in final_storage.items():
    -440        # Conduct local aggregation + bound changes
    -441        if k in feature_types and k != label_name:
    -442            minima, maxima, medians, uniques = [], [], [], []
    -443            for feature_summary in v:
    -444                minima.append(feature_summary.minimum)
    -445                maxima.append(feature_summary.maximum)
    -446                medians.append(feature_summary.median)
    -447                uniques.append(feature_summary.num_unique)
    -448            summary_table_rows.append(
    -449                [
    -450                    k,
    -451                    round(np.min(minima), 2),
    -452                    round(np.max(maxima), 2),
    -453                    round(np.median(medians), 2),
    -454                    int(np.mean(uniques)),
    -455                ],
    -456            )
    -457
    -458    if len(summary_table_rows) == 0:
    -459        logging.info('No numeric features to summarize.')
    -460        return None
    -461
    -462    summary_table: pd.Dataframe = pd.DataFrame(summary_table_rows)
    -463    summary_table.columns = [
    -464        'Feature',
    -465        'Minimum',
    -466        'Maximum',
    -467        'Median',
    -468        'Num avg. unique (batch)',
    -469    ]
    -470
    -471    if output_summary_table_only:
    -472        return summary_table
    +            
+421def summarize_feature_bounds_for_transformers(
    +422    bounds_object_storage: Any,
    +423    feature_types: list[str],
    +424    task_name: str,
    +425    label_name: str,
    +426    granularity: int = 15,
    +427    output_summary_table_only: bool = False,
    +428):
+429    """summarization auxiliary method for generating JSON-based specs"""
    +430
    +431    if bounds_object_storage is None:
    +432        logging.info('Bounds storage object is empty.')
    +433        exit()
    +434
    +435    final_storage = defaultdict(list)
    +436    for el in bounds_object_storage:
    +437        if isinstance(el, dict):
    +438            for k, v in el.items():
    +439                final_storage[k].append(v)
    +440
    +441    summary_table_rows = []
    +442    for k, v in final_storage.items():
    +443        # Conduct local aggregation + bound changes
    +444        if k in feature_types and k != label_name:
    +445            minima, maxima, medians, uniques = [], [], [], []
    +446            for feature_summary in v:
    +447                minima.append(feature_summary.minimum)
    +448                maxima.append(feature_summary.maximum)
    +449                medians.append(feature_summary.median)
    +450                uniques.append(feature_summary.num_unique)
    +451            summary_table_rows.append(
    +452                [
    +453                    k,
    +454                    round(np.min(minima), 2),
    +455                    round(np.max(maxima), 2),
    +456                    round(np.median(medians), 2),
    +457                    int(np.mean(uniques)),
    +458                ],
    +459            )
    +460
    +461    if len(summary_table_rows) == 0:
    +462        logging.info('No numeric features to summarize.')
    +463        return None
    +464
+465    summary_table: pd.DataFrame = pd.DataFrame(summary_table_rows)
    +466    summary_table.columns = [
    +467        'Feature',
    +468        'Minimum',
    +469        'Maximum',
    +470        'Median',
    +471        'Num avg. unique (batch)',
    +472    ]
     473
    -474    if len(summary_table) == 0:
    -475        logging.info('Summary table empty, skipping transformer generation ..')
    -476        return
    -477
    -478    if task_name == 'feature_summary_transformers':
    -479        transformers_per_feature = defaultdict(list)
    +474    if output_summary_table_only:
    +475        return summary_table
    +476
    +477    if len(summary_table) == 0:
    +478        logging.info('Summary table empty, skipping transformer generation ..')
    +479        return
     480
    -481        # Take care of weights first -> range is pre-defined
    -482        for k, v in final_storage.items():
    -483            if label_name in k or 'dummy' in k:
    -484                continue
    -485
    -486            weight_template = {
    -487                'feature': k,
    -488                'src_features': [k],
    -489                'transformations': ['Weight'],
    -490                'weights': [0, 0.5, 1.5, 2, 3, 10],
    -491            }
    -492            transformers_per_feature[k].append(weight_template)
    -493
    -494        # Consider numeric transformations - pairs and single ones
    -495        for enx, row in summary_table.iterrows():
    -496            if row.Feature == 'dummy':
    -497                continue
    -498            try:
    -499                actual_range = (
    -500                    np.arange(
    -501                        row['Minimum'],
    -502                        row['Maximum'],
    -503                        (row['Maximum'] - row['Minimum']) / granularity,
    -504                    )
    -505                    .round(2)
    -506                    .tolist()
    -507                )
    -508                binner_template = {
    -509                    'feature': f'{row.Feature}',
    -510                    'src_features': [row.Feature],
    -511                    'transformations': [
    -512                        'BinnerSqrt',
    -513                        'BinnerLog',
    -514                        'BinnerSqrtPlain',
    -515                        'BinnerLogPlain',
    -516                    ],
    -517                    'n': actual_range,
    -518                    'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
    -519                }
    -520
    -521            except Exception as es:
    -522                logging.info(
    -523                    f'\U0001F631 Encountered {es}. The problematic feature is: {row}, skipping transformer for this feature ..',
    -524                )
    -525
    -526            transformers_per_feature[row.Feature].append(binner_template)
    -527
    -528            # We want the full loop here, due to asymmetry of transformation(s)
    -529            for enx_second, row_second in summary_table.iterrows():
    -530                if enx_second < enx:
    -531                    continue
    -532
    -533                # The n values are defined based on maxima of the second feature
    -534                if row_second.Feature != row.Feature:
    -535                    n_bound = round(row_second['Median'] + row['Median'], 2)
    -536                    max_bound = round(
    -537                        min(row_second['Maximum'], row['Maximum']), 2,
    -538                    )
    -539                    min_bound = round(
    -540                        row_second['Minimum'] + row['Minimum'], 2,
    +481    if task_name == 'feature_summary_transformers':
    +482        transformers_per_feature = defaultdict(list)
    +483
    +484        # Take care of weights first -> range is pre-defined
    +485        for k, v in final_storage.items():
    +486            if label_name in k or 'dummy' in k:
    +487                continue
    +488
    +489            weight_template = {
    +490                'feature': k,
    +491                'src_features': [k],
    +492                'transformations': ['Weight'],
    +493                'weights': [0, 0.5, 1.5, 2, 3, 10],
    +494            }
    +495            transformers_per_feature[k].append(weight_template)
    +496
    +497        # Consider numeric transformations - pairs and single ones
    +498        for enx, row in summary_table.iterrows():
    +499            if row.Feature == 'dummy':
    +500                continue
    +501            try:
    +502                actual_range = (
    +503                    np.arange(
    +504                        row['Minimum'],
    +505                        row['Maximum'],
    +506                        (row['Maximum'] - row['Minimum']) / granularity,
    +507                    )
    +508                    .round(2)
    +509                    .tolist()
    +510                )
    +511                binner_template = {
    +512                    'feature': f'{row.Feature}',
    +513                    'src_features': [row.Feature],
    +514                    'transformations': [
    +515                        'BinnerSqrt',
    +516                        'BinnerLog',
    +517                        'BinnerSqrtPlain',
    +518                        'BinnerLogPlain',
    +519                    ],
    +520                    'n': actual_range,
    +521                    'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
    +522                }
    +523
    +524            except Exception as es:
    +525                logging.info(
    +526                    f'\U0001F631 Encountered {es}. The problematic feature is: {row}, skipping transformer for this feature ..',
    +527                )
    +528
    +529            transformers_per_feature[row.Feature].append(binner_template)
    +530
    +531            # We want the full loop here, due to asymmetry of transformation(s)
    +532            for enx_second, row_second in summary_table.iterrows():
    +533                if enx_second < enx:
    +534                    continue
    +535
    +536                # The n values are defined based on maxima of the second feature
    +537                if row_second.Feature != row.Feature:
    +538                    n_bound = round(row_second['Median'] + row['Median'], 2)
    +539                    max_bound = round(
    +540                        min(row_second['Maximum'], row['Maximum']), 2,
     541                    )
    -542                    range_spectrum = sorted(
    -543                        list(
    -544                            {
    -545                                0.0,
    -546                                min_bound,
    -547                                n_bound / 10,
    -548                                n_bound / 5,
    -549                                n_bound,
    -550                                max_bound,
    -551                            },
    -552                        ),
    -553                    )
    -554
    -555                    range_spectrum = [x for x in range_spectrum if x >= 0]
    -556                    binner_pair_template = {
    -557                        'feature': f'{row.Feature}Ratio{row_second.Feature}',
    -558                        'src_features': [row.Feature, row_second.Feature],
    -559                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
    -560                        'n': range_spectrum,
    -561                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
    -562                    }
    -563
    -564                    binner_pair_template_second = {
    -565                        'feature': f'{row_second.Feature}Ratio{row.Feature}',
    -566                        'src_features': [row_second.Feature, row.Feature],
    -567                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
    -568                        'n': range_spectrum,
    -569                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
    -570                    }
    -571
    -572                    transformers_per_feature[row.Feature].append(
    -573                        binner_pair_template,
    -574                    )
    +542                    min_bound = round(
    +543                        row_second['Minimum'] + row['Minimum'], 2,
    +544                    )
    +545                    range_spectrum = sorted(
    +546                        list(
    +547                            {
    +548                                0.0,
    +549                                min_bound,
    +550                                n_bound / 10,
    +551                                n_bound / 5,
    +552                                n_bound,
    +553                                max_bound,
    +554                            },
    +555                        ),
    +556                    )
    +557
    +558                    range_spectrum = [x for x in range_spectrum if x >= 0]
    +559                    binner_pair_template = {
    +560                        'feature': f'{row.Feature}Ratio{row_second.Feature}',
    +561                        'src_features': [row.Feature, row_second.Feature],
    +562                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
    +563                        'n': range_spectrum,
    +564                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
    +565                    }
    +566
    +567                    binner_pair_template_second = {
    +568                        'feature': f'{row_second.Feature}Ratio{row.Feature}',
    +569                        'src_features': [row_second.Feature, row.Feature],
    +570                        'transformations': ['BinnerLogRatioPlain', 'BinnerLogRatio'],
    +571                        'n': range_spectrum,
    +572                        'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
    +573                    }
    +574
     575                    transformers_per_feature[row.Feature].append(
    -576                        binner_pair_template_second,
    +576                        binner_pair_template,
     577                    )
    -578
    -579        binner_templates = []
    -580        for k, v in transformers_per_feature.items():
    -581            for transformer_struct in v:
    -582                binner_templates.append(transformer_struct)
    -583
    -584        logging.info(
    -585            f'Generated {len(binner_templates)} transformation search specifications.\n',
    -586        )
    -587        namespace_full = f'"random_grid_feature_transform": {json.dumps(binner_templates)}, "random_grid_epochs": 512'
    -588        logging.info('Generated transformations below:\n')
    -589        print(namespace_full)
    +578                    transformers_per_feature[row.Feature].append(
    +579                        binner_pair_template_second,
    +580                    )
    +581
    +582        binner_templates = []
    +583        for k, v in transformers_per_feature.items():
    +584            for transformer_struct in v:
    +585                binner_templates.append(transformer_struct)
    +586
    +587        logging.info(
    +588            f'Generated {len(binner_templates)} transformation search specifications.\n',
    +589        )
    +590        namespace_full = f'"random_grid_feature_transform": {json.dumps(binner_templates)}, "random_grid_epochs": 512'
    +591        logging.info('Generated transformations below:\n')
    +592        print(namespace_full)
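The hunk above emits one transformation search specification per numeric feature (plus one per ordered feature pair). Below is a minimal sketch of the single-feature template logic outside the diff, assuming a pandas summary frame with the Minimum/Maximum/Median columns used above; unlike the patched code, which appends the template unconditionally after the except block, the sketch skips a row whose range computation fails:

import numpy as np
import pandas as pd

# Hypothetical one-row summary table; column names mirror the hunk above.
summary_table = pd.DataFrame(
    [{'Feature': 'f1', 'Minimum': 0.0, 'Maximum': 8.0, 'Median': 2.0}],
)
granularity = 4  # stand-in for the granularity constant used above

for _, row in summary_table.iterrows():
    try:
        step = (row['Maximum'] - row['Minimum']) / granularity
        actual_range = np.arange(row['Minimum'], row['Maximum'], step).round(2).tolist()
        binner_template = {
            'feature': row.Feature,
            'src_features': [row.Feature],
            'transformations': ['BinnerSqrt', 'BinnerLog', 'BinnerSqrtPlain', 'BinnerLogPlain'],
            'n': actual_range,
            'resolutions': [0.1, 2, 4, 8, 16, 32, 64, 128],
        }
    except Exception:
        continue  # skipping here avoids appending a stale template for a failed row
    print(binner_template)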
     
    @@ -2000,7 +2033,7 @@

 def summarize_rare_counts(term_counter: Any, args: Any, cardinality_object: Any, object_info: DatasetInformationStorage) -> None:

@@ -2008,59 +2041,59 @@

-592def summarize_rare_counts(
    -593    term_counter: Any,
    -594    args: Any,
    -595    cardinality_object: Any,
    -596    object_info: DatasetInformationStorage,
    -597) -> None:
    -598    """Write rare values"""
    -599
    -600    out_df_rows = []
    -601    logging.info(
    -602        f'Rare value summary (freq <= {args.rare_value_count_upper_bound}) follows ..',
    -603    )
    -604
    -605    for namespace_tuple, count in term_counter.items():
    -606        namespace, value = namespace_tuple
    -607        out_df_rows.append([namespace, value, count])
    -608    out_df: pd.DataFrame = pd.DataFrame(out_df_rows)
    -609    out_df.columns = ['Namespace', 'value', 'Count']
    -610    out_df.to_csv(
    -611        os.path.join(args.output_folder, 'rare_values.tsv'), sep='\t', index=False,
    -612    )
    -613    logging.info(f'Wrote rare values to {args.output_folder}/rare_values.tsv')
    -614
    -615    overall_rare_counts = Counter(out_df.Namespace.values)
    -616    sorted_counts = sorted(
    -617        overall_rare_counts.items(), key=lambda pair: pair[1], reverse=True,
    -618    )
    -619    for k, v in sorted_counts:
    -620        logging.info(f'Namespace: {k} ---- Rare values observed: {v}')
    -621
    -622    final_df_rows = []
    -623    for k, v in sorted_counts:
    -624        cardinality = len(cardinality_object[k])
    -625        rare_proportion = np.round(100 * (v / cardinality), 2)
    -626        col_type = 'nominal'
    -627        if k in object_info.column_types:
    -628            col_type = 'numeric'
    -629        final_df_rows.append(
    -630            {
    -631                'rare_proportion': rare_proportion,
    -632                'feature_type': col_type,
    -633                'feature_name': k,
    -634            },
    -635        )
    -636
    -637    final_df: pd.DataFrame = pd.DataFrame(final_df_rows)
    -638    final_df = final_df.sort_values(by=['rare_proportion'])
    -639    logging.info(
    -640        f'Wrote feature sparsity summary to {args.output_folder}/feature_sparsity_summary.tsv',
    -641    )
    -642    final_df.to_csv(
    -643        f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
+595def summarize_rare_counts(
    +596    term_counter: Any,
    +597    args: Any,
    +598    cardinality_object: Any,
    +599    object_info: DatasetInformationStorage,
    +600) -> None:
    +601    """Write rare values"""
    +602
    +603    out_df_rows = []
    +604    logging.info(
    +605        f'Rare value summary (freq <= {args.rare_value_count_upper_bound}) follows ..',
    +606    )
    +607
    +608    for namespace_tuple, count in term_counter.items():
    +609        namespace, value = namespace_tuple
    +610        out_df_rows.append([namespace, value, count])
    +611    out_df: pd.DataFrame = pd.DataFrame(out_df_rows)
    +612    out_df.columns = ['Namespace', 'value', 'Count']
    +613    out_df.to_csv(
    +614        os.path.join(args.output_folder, 'rare_values.tsv'), sep='\t', index=False,
    +615    )
    +616    logging.info(f'Wrote rare values to {args.output_folder}/rare_values.tsv')
    +617
    +618    overall_rare_counts = Counter(out_df.Namespace.values)
    +619    sorted_counts = sorted(
    +620        overall_rare_counts.items(), key=lambda pair: pair[1], reverse=True,
    +621    )
    +622    for k, v in sorted_counts:
    +623        logging.info(f'Namespace: {k} ---- Rare values observed: {v}')
    +624
    +625    final_df_rows = []
    +626    for k, v in sorted_counts:
    +627        cardinality = len(cardinality_object[k])
    +628        rare_proportion = np.round(100 * (v / cardinality), 2)
    +629        col_type = 'nominal'
    +630        if k in object_info.column_types:
    +631            col_type = 'numeric'
    +632        final_df_rows.append(
    +633            {
    +634                'rare_proportion': rare_proportion,
    +635                'feature_type': col_type,
    +636                'feature_name': k,
    +637            },
    +638        )
    +639
    +640    final_df: pd.DataFrame = pd.DataFrame(final_df_rows)
    +641    final_df = final_df.sort_values(by=['rare_proportion'])
    +642    logging.info(
    +643        f'Wrote feature sparsity summary to {args.output_folder}/feature_sparsity_summary.tsv',
     644    )
    +645    final_df.to_csv(
    +646        f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
    +647    )
     
    @@ -2068,6 +2101,59 @@

+
+    def is_prior_heuristic(args: Any) -> bool:
+
    650def is_prior_heuristic(args: Any) -> bool:
    +651    if '-prior' in args.heuristic and args.reference_model_JSON:
    +652        return True
    +653    return False
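For reference, the helper returns True only when both conditions hold; a tiny self-contained sketch (the heuristic string is a stand-in - any value containing '-prior' behaves the same):

from types import SimpleNamespace
from typing import Any

def is_prior_heuristic(args: Any) -> bool:  # copied from the listing above
    if '-prior' in args.heuristic and args.reference_model_JSON:
        return True
    return False

print(is_prior_heuristic(SimpleNamespace(heuristic='MI-prior', reference_model_JSON='ref.json')))  # True
print(is_prior_heuristic(SimpleNamespace(heuristic='MI-prior', reference_model_JSON='')))          # False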
    +
+    def get_num_of_instances(fname: str) -> int:
+
    656def get_num_of_instances(fname: str) -> int:
    +657    """Count the number of lines in a file, fast - useful for progress logging"""
    +658
    +659    def _make_gen(reader):
    +660        while True:
    +661            b = reader(2**16)
    +662            if not b:
    +663                break
    +664            yield b
    +665
    +666    with open(fname, 'rb') as f:
    +667        count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
    +668    return count
+
+    Count the number of lines in a file, fast - useful for progress logging
+
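A short usage sketch; the import path is the one task_instance_ranking (further below) uses for this helper:

from outrank.core_utils import get_num_of_instances

with open('example.tsv', 'w', encoding='utf-8') as handle:
    handle.write('a\tb\n1\t2\n3\t4\n')

print(get_num_of_instances('example.tsv'))  # 3 - newlines counted in 64 KiB chunks, no line objects built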
\ No newline at end of file
diff --git a/docs/outrank/feature_transformations.html b/docs/outrank/feature_transformations.html
index 77c10c6..c84e722 100644
--- a/docs/outrank/feature_transformations.html
+++ b/docs/outrank/feature_transformations.html
@@ -3,7 +3,7 @@
 outrank.feature_transformations API documentation
@@ -47,10 +47,10 @@

    Submodules

    outrank.feature_transformations

\ No newline at end of file
diff --git a/docs/outrank/feature_transformations/feature_transformer_vault.html b/docs/outrank/feature_transformations/feature_transformer_vault.html
index 3219c64..fbd2281 100644
--- a/docs/outrank/feature_transformations/feature_transformer_vault.html
+++ b/docs/outrank/feature_transformations/feature_transformer_vault.html
@@ -3,7 +3,7 @@
 outrank.feature_transformations.feature_transformer_vault API documentation
@@ -51,7 +51,7 @@

    API Documentation

    outrank.feature_transformations.feature_transformer_vault

@@ -60,15 +60,14 @@

 2
 3from outrank.feature_transformations.feature_transformer_vault.default_transformers import DEFAULT_TRANSFORMERS
 4from outrank.feature_transformations.feature_transformer_vault.default_transformers import MINIMAL_TRANSFORMERS
- 5from outrank.feature_transformations.feature_transformer_vault.fw_transformers import (
- 6    FW_TRANSFORMERS,
- 7)
- 8
- 9_tr_global_namespace = {
-10    'default': DEFAULT_TRANSFORMERS,
-11    'minimal': MINIMAL_TRANSFORMERS,
-12    'fw-transformers': FW_TRANSFORMERS,
-13}
+ 5from outrank.feature_transformations.feature_transformer_vault.fw_transformers import \
+ 6    FW_TRANSFORMERS
+ 7
+ 8_tr_global_namespace = {
+ 9    'default': DEFAULT_TRANSFORMERS,
+10    'minimal': MINIMAL_TRANSFORMERS,
+11    'fw-transformers': FW_TRANSFORMERS,
+12}
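Only the import style changes here; the vault still maps preset names to transformer dictionaries. A sketch of the intended lookup, assuming _tr_global_namespace is importable from the package:

from outrank.feature_transformations.feature_transformer_vault import _tr_global_namespace

preset = _tr_global_namespace.get('minimal', {})  # 'default' and 'fw-transformers' are the other keys
print(len(preset))  # 4 expressions in the minimal preset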

    @@ -256,4 +255,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html b/docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html
index e9db800..c3fa662 100644
--- a/docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html
+++ b/docs/outrank/feature_transformations/feature_transformer_vault/default_transformers.html
@@ -3,7 +3,7 @@
 outrank.feature_transformations.feature_transformer_vault.default_transformers API documentation
@@ -52,50 +52,51 @@

    API Documentation

    outrank.feature_transformations.feature_transformer_vault.default_transformers

     1# Some boilerplate transformations people tend to use
      2from __future__ import annotations
    - 3MINIMAL_TRANSFORMERS = {
    - 4    '_tr_sqrt': 'np.sqrt(X)',
    - 5    '_tr_log(x+1)': 'np.log(X + 1)',
    - 6    '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))',
    - 7    '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)',
    - 8}
    - 9
    -10DEFAULT_TRANSFORMERS = {
    -11    '_tr_sqrt': 'np.sqrt(X)',
    -12    '_tr_log(x+1)': 'np.log(X + 1)',
    -13    '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))',
    -14    '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)',
    -15    '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))',
    -16    '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))',
    -17    '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)',
    -18    '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)',
    -19    '_tr_nonzero': 'np.where(X != 0, 1, 0)',
    -20    '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)',
    -21}
    -22
    -23if __name__ == '__main__':
    -24    import numpy as np
    -25
    -26    # generate some input (call it X)
    -27    X = np.random.random(100)
    -28
    -29    # get some transformer
    -30    some_transformer = DEFAULT_TRANSFORMERS.get('_tr_nonzero')
    -31
    -32    if some_transformer is None:
    -33        some_transformer = ''
    -34
    -35    # evaluate to get output
    -36    output = eval(some_transformer)
    -37
    -38    # check output somehow
    -39    print(output)
    + 3
    + 4MINIMAL_TRANSFORMERS = {
    + 5    '_tr_sqrt': 'np.sqrt(X)',
    + 6    '_tr_log(x+1)': 'np.log(X + 1)',
    + 7    '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))',
    + 8    '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)',
    + 9}
    +10
    +11DEFAULT_TRANSFORMERS = {
    +12    '_tr_sqrt': 'np.sqrt(X)',
    +13    '_tr_log(x+1)': 'np.log(X + 1)',
    +14    '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))',
    +15    '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)',
    +16    '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))',
    +17    '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))',
    +18    '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)',
    +19    '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)',
    +20    '_tr_nonzero': 'np.where(X != 0, 1, 0)',
    +21    '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)',
    +22}
    +23
    +24if __name__ == '__main__':
    +25    import numpy as np
    +26
    +27    # generate some input (call it X)
    +28    X = np.random.random(100)
    +29
    +30    # get some transformer
    +31    some_transformer = DEFAULT_TRANSFORMERS.get('_tr_nonzero')
    +32
    +33    if some_transformer is None:
    +34        some_transformer = ''
    +35
    +36    # evaluate to get output
    +37    output = eval(some_transformer)
    +38
    +39    # check output somehow
    +40    print(output)
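The __main__ demo above evaluates one expression; the same eval pattern extends to a whole preset. A minimal sketch:

import numpy as np

from outrank.feature_transformations.feature_transformer_vault.default_transformers import MINIMAL_TRANSFORMERS

X = np.random.random(100)  # every expression string operates on a name X
for name, expression in MINIMAL_TRANSFORMERS.items():
    transformed = eval(expression)  # eval is the application mechanism shown in the demo above
    print(name, transformed[:3])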
     
    @@ -106,11 +107,11 @@

 {'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)'}

    @@ -119,11 +120,11 @@

 {'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)'}

    @@ -309,4 +310,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html b/docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html
index a3e5a60..5e5b619 100644
--- a/docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html
+++ b/docs/outrank/feature_transformations/feature_transformer_vault/fw_transformers.html
@@ -3,7 +3,7 @@
 outrank.feature_transformations.feature_transformer_vault.fw_transformers API documentation
@@ -55,7 +55,7 @@

    API Documentation

    outrank.feature_transformations.feature_transformer_vault.fw_transformers

@@ -64,39 +64,38 @@

 2
 3import numpy as np
 4
- 5from outrank.feature_transformations.feature_transformer_vault.default_transformers import (
- 6    DEFAULT_TRANSFORMERS,
- 7)
- 8
- 9FW_TRANSFORMERS = DEFAULT_TRANSFORMERS.copy()
-10resolution_range = [1, 10, 50, 100]
-11greater_than_range = [1, 2, 4, 8, 16, 32, 64, 96]
-12
-13for resolution in resolution_range:
-14    for greater_than in greater_than_range:
-15        FW_TRANSFORMERS[f'_tr_fw_sqrt_res_{resolution}_gt_{greater_than}'] = (
-16            f'np.where(X < {greater_than}, '
-17            f'X, '
-18            f'np.where(X>{greater_than} ,'
-19            f'np.round(np.sqrt(X-{greater_than})*{resolution},0), 0))'
-20        )
-21
-22        FW_TRANSFORMERS[
-23            f'_tr_fw_log_res_{resolution}_gt_{greater_than}'
-24        ] = f'np.where(X <{greater_than}, X, np.where(X >{greater_than}, np.round(np.log(X-{greater_than})*{resolution},0), 0))'
-25
-26for resolution in resolution_range:
-27    for greater_than in [np.divide(x, 100) for x in greater_than_range]:
-28        FW_TRANSFORMERS[
-29            f'_tr_fw_prob_sqrt_res_{resolution}_gt_{greater_than}'
-30        ] = f'np.where(X < {greater_than}, X, np.where(X>{greater_than}, np.round(np.sqrt(X-{greater_than})*{resolution},0), 0))'
-31
-32        FW_TRANSFORMERS[
-33            f'_tr_fw_prob_log_res_{resolution}_gt_{greater_than}'
-34        ] = f'np.where(X <{greater_than},X, np.where(X>{greater_than}, np.round(np.log(X-{greater_than})*{resolution},0), 0))'
-35
-36if __name__ == '__main__':
-37    print(len(FW_TRANSFORMERS))
+ 5from outrank.feature_transformations.feature_transformer_vault.default_transformers import \
+ 6    DEFAULT_TRANSFORMERS
+ 7
+ 8FW_TRANSFORMERS = DEFAULT_TRANSFORMERS.copy()
+ 9resolution_range = [1, 10, 50, 100]
+10greater_than_range = [1, 2, 4, 8, 16, 32, 64, 96]
+11
+12for resolution in resolution_range:
+13    for greater_than in greater_than_range:
+14        FW_TRANSFORMERS[f'_tr_fw_sqrt_res_{resolution}_gt_{greater_than}'] = (
+15            f'np.where(X < {greater_than}, '
+16            f'X, '
+17            f'np.where(X>{greater_than} ,'
+18            f'np.round(np.sqrt(X-{greater_than})*{resolution},0), 0))'
+19        )
+20
+21        FW_TRANSFORMERS[
+22            f'_tr_fw_log_res_{resolution}_gt_{greater_than}'
+23        ] = f'np.where(X <{greater_than}, X, np.where(X >{greater_than}, np.round(np.log(X-{greater_than})*{resolution},0), 0))'
+24
+25for resolution in resolution_range:
+26    for greater_than in [np.divide(x, 100) for x in greater_than_range]:
+27        FW_TRANSFORMERS[
+28            f'_tr_fw_prob_sqrt_res_{resolution}_gt_{greater_than}'
+29        ] = f'np.where(X < {greater_than}, X, np.where(X>{greater_than}, np.round(np.sqrt(X-{greater_than})*{resolution},0), 0))'
+30
+31        FW_TRANSFORMERS[
+32            f'_tr_fw_prob_log_res_{resolution}_gt_{greater_than}'
+33        ] = f'np.where(X <{greater_than},X, np.where(X>{greater_than}, np.round(np.log(X-{greater_than})*{resolution},0), 0))'
+34
+35if __name__ == '__main__':
+36    print(len(FW_TRANSFORMERS))
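Each generated entry is again a string over X; a sketch evaluating one of the piecewise square-root variants (key format taken from the loops above):

import numpy as np

from outrank.feature_transformations.feature_transformer_vault.fw_transformers import FW_TRANSFORMERS

X = np.array([0.5, 2.0, 20.0])
expression = FW_TRANSFORMERS['_tr_fw_sqrt_res_10_gt_4']  # keep X below the threshold, bin sqrt(X - 4) * 10 above it
print(eval(expression))  # [ 0.5  2.  40.]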

    @@ -107,11 +106,11 @@

 [rendered value of FW_TRANSFORMERS elided: the ten DEFAULT_TRANSFORMERS entries plus every generated '_tr_fw_(prob_)sqrt/log_res_{resolution}_gt_{greater_than}' expression produced by the loops above]

    @@ -119,11 +118,11 @@

 resolution_range = [1, 10, 50, 100]

    @@ -131,11 +130,11 @@

 greater_than_range = [1, 2, 4, 8, 16, 32, 64, 96]

    @@ -321,4 +320,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/feature_transformations/ranking_transformers.html b/docs/outrank/feature_transformations/ranking_transformers.html
index 458b822..f87af0b 100644
--- a/docs/outrank/feature_transformations/ranking_transformers.html
+++ b/docs/outrank/feature_transformations/ranking_transformers.html
@@ -3,7 +3,7 @@
 outrank.feature_transformations.ranking_transformers API documentation
@@ -88,7 +88,7 @@

    API Documentation

    outrank.feature_transformations.ranking_transformers

@@ -268,7 +268,7 @@

 class FeatureTransformerNoise:

@@ -334,23 +334,23 @@

 noise_preset

 def construct_new_features(self, dataframe: pandas.core.frame.DataFrame, label_column=None):

@@ -421,7 +421,7 @@

 class FeatureTransformerGeneric:

@@ -525,12 +525,12 @@

 FeatureTransformerGeneric(numeric_column_names: set[str], preset: str = 'default')

@@ -565,57 +565,57 @@

 numeric_column_names
 constructed_feature_names: set[str]
 max_maj_support
 nan_prop_support

 def get_vals(self, tmp_df: pandas.core.frame.DataFrame, col_name: str) -> Any:

@@ -632,13 +632,13 @@

 def construct_baseline_features(self, dataframe: Any) -> pandas.core.frame.DataFrame:

@@ -661,13 +661,13 @@

 def construct_new_features(self, dataframe: Any) -> pandas.core.frame.DataFrame:

@@ -724,7 +724,7 @@

@@ -911,4 +911,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/task_generators.html b/docs/outrank/task_generators.html
index 17abb63..17df4af 100644
--- a/docs/outrank/task_generators.html
+++ b/docs/outrank/task_generators.html
@@ -3,7 +3,7 @@
 outrank.task_generators API documentation
@@ -52,7 +52,7 @@

    API Documentation

    outrank.task_generators

@@ -111,17 +111,17 @@

 logger = <Logger syn-logger (DEBUG)>

 def outrank_task_generate_data_set(args):

@@ -346,4 +346,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/task_instance_ranking.html b/docs/outrank/task_instance_ranking.html
new file mode 100644
index 0000000..49c7a7e
--- /dev/null
+++ b/docs/outrank/task_instance_ranking.html
@@ -0,0 +1,521 @@
+ outrank.task_instance_ranking API documentation
+
+outrank.task_instance_ranking
+
      1from __future__ import annotations
    +  2
    +  3import gzip
    +  4import os
    +  5from collections import Counter
    +  6from collections import defaultdict
    +  7from typing import Any
    +  8
    +  9import numpy as np
    + 10import pandas as pd
    + 11import tqdm
    + 12
    + 13from outrank.core_utils import generic_line_parser
    + 14from outrank.core_utils import get_dataset_info
    + 15from outrank.core_utils import get_num_of_instances
    + 16
    + 17try:
    + 18    import matplotlib.pyplot as plt
    + 19except:
    + 20    pass
    + 21
    + 22
    + 23def shannon_ent(string: str) -> float:
    + 24    counts = Counter(string)
    + 25    frequencies = ((i / len(string)) for i in counts.values())
    + 26    return -np.sum(f * np.log2(f) for f in frequencies)
    + 27
    + 28
    + 29def compute_entropy_avg(line: list) -> float:
    + 30    joint_ent = 0
    + 31    for field in line:
    + 32        joint_ent += shannon_ent(field)
    + 33    return joint_ent
    + 34
    + 35
    + 36def score_line(line):
    + 37    nan_prop = line.count('') / len(line)
    + 38    out_struct = {}
    + 39    out_struct['empty_string_prop'] = nan_prop
    + 40    out_struct['empty_dict'] = line.count('{}') / len(line)
    + 41    out_struct['all_empty'] = (line.count('{}') + line.count('')) / len(line)
    + 42    out_struct['all_zero'] = line.count('0') / len(line)
    + 43    for j in [30, 60, 100, 200, 300]:
    + 44        out_struct[f'all_more_{j}_chars'] = len(
    + 45            [x for x in line if len(x) > j], ) / len(line)
    + 46    out_struct['row_entropy'] = compute_entropy_avg(line)
    + 47    return out_struct
    + 48
    + 49
    + 50def outrank_task_rank_instances(args: Any) -> None:
    + 51
    + 52    data_encoding = 'utf-8'
    + 53    delimiter = '\t'
    + 54    dataset_info = get_dataset_info(args)
    + 55    local_pbar = tqdm.tqdm(
    + 56        total=get_num_of_instances(dataset_info.data_path) - 1,
    + 57        position=0,
    + 58        disable=args.disable_tqdm == 'True',
    + 59    )
    + 60    local_pbar.set_description('Starting ranking computation')
    + 61
    + 62    _, file_extension = os.path.splitext(dataset_info.data_path)
    + 63
    + 64    if file_extension == '.gz':
    + 65        file_stream = gzip.open(
    + 66            dataset_info.data_path,
    + 67            'rt',
    + 68            encoding=data_encoding,
    + 69        )
    + 70
    + 71    else:
    + 72        file_stream = open(dataset_info.data_path, encoding=data_encoding)
    + 73    line_counter = 0
    + 74    out_scores_lab = defaultdict(list)
    + 75
    + 76    for line in file_stream:
    + 77        line_counter += 1
    + 78        local_pbar.update(1)
    + 79
    + 80        parsed_line = generic_line_parser(
    + 81            line,
    + 82            delimiter,
    + 83            args,
    + 84            dataset_info.fw_map,
    + 85            dataset_info.column_names,
    + 86        )
    + 87
    + 88        if line_counter > 100_000:
    + 89            break
    + 90        out_scores_lab[line[0]].append(score_line(parsed_line))
    + 91
    + 92    for label, out_scores in out_scores_lab.items():
    + 93        out_df = pd.DataFrame(out_scores)
    + 94        os.makedirs(args.output_folder, exist_ok=True)
    + 95        for col in out_df.columns:
    + 96            sorted_vals = out_df[col].sort_values()
    + 97            plt.figure(figsize=(5, 5), dpi=300)
    + 98            plt.title(col + f' label: {label}')
    + 99            plt.hist(
    +100                x=sorted_vals * 100,
    +101                color='black',
    +102                density=True,
    +103                bins=100,
    +104            )
    +105            if 'entropy' not in col:
    +106                plt.xlabel('Proportion of namespaces (%)')
    +107            else:
    +108                plt.xlabel('Row entropy')
    +109            plt.ylabel('Density')
    +110            plt.tight_layout()
    +111            fname = f'distPlot{col}_{label}.pdf'
    +112            plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
    +113            plt.cla()
    +114            plt.clf()
    +
+    def shannon_ent(string: str) -> float:
+
+24def shannon_ent(string: str) -> float:
    +25    counts = Counter(string)
    +26    frequencies = ((i / len(string)) for i in counts.values())
    +27    return -np.sum(f * np.log2(f) for f in frequencies)
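A worked check of the definition above - 'aab' has symbol frequencies 2/3 and 1/3, so H = -(2/3 * log2(2/3) + 1/3 * log2(1/3)) ≈ 0.918 bits:

from outrank.task_instance_ranking import shannon_ent  # module path from this page

print(round(shannon_ent('aab'), 3))  # 0.918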
    +
+    def compute_entropy_avg(line: list) -> float:
+
+30def compute_entropy_avg(line: list) -> float:
    +31    joint_ent = 0
    +32    for field in line:
    +33        joint_ent += shannon_ent(field)
    +34    return joint_ent
    +
+    def score_line(line):
+
+37def score_line(line):
    +38    nan_prop = line.count('') / len(line)
    +39    out_struct = {}
    +40    out_struct['empty_string_prop'] = nan_prop
    +41    out_struct['empty_dict'] = line.count('{}') / len(line)
    +42    out_struct['all_empty'] = (line.count('{}') + line.count('')) / len(line)
    +43    out_struct['all_zero'] = line.count('0') / len(line)
    +44    for j in [30, 60, 100, 200, 300]:
    +45        out_struct[f'all_more_{j}_chars'] = len(
    +46            [x for x in line if len(x) > j], ) / len(line)
    +47    out_struct['row_entropy'] = compute_entropy_avg(line)
    +48    return out_struct
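A sketch of one scored row, using a toy parsed line (dictionary keys as defined above):

from outrank.task_instance_ranking import score_line  # module path from this page

line = ['1', '', '{}', 'clickthrough']
scores = score_line(line)
print(scores['empty_string_prop'])  # 0.25 - one empty field out of four
print(scores['all_empty'])          # 0.5  - '' and '{}' combined
print(sorted(scores))               # the all_more_*_chars buckets plus row_entropy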
    +
+    def outrank_task_rank_instances(args: Any) -> None:
+
+ 51def outrank_task_rank_instances(args: Any) -> None:
    + 52
    + 53    data_encoding = 'utf-8'
    + 54    delimiter = '\t'
    + 55    dataset_info = get_dataset_info(args)
    + 56    local_pbar = tqdm.tqdm(
    + 57        total=get_num_of_instances(dataset_info.data_path) - 1,
    + 58        position=0,
    + 59        disable=args.disable_tqdm == 'True',
    + 60    )
    + 61    local_pbar.set_description('Starting ranking computation')
    + 62
    + 63    _, file_extension = os.path.splitext(dataset_info.data_path)
    + 64
    + 65    if file_extension == '.gz':
    + 66        file_stream = gzip.open(
    + 67            dataset_info.data_path,
    + 68            'rt',
    + 69            encoding=data_encoding,
    + 70        )
    + 71
    + 72    else:
    + 73        file_stream = open(dataset_info.data_path, encoding=data_encoding)
    + 74    line_counter = 0
    + 75    out_scores_lab = defaultdict(list)
    + 76
    + 77    for line in file_stream:
    + 78        line_counter += 1
    + 79        local_pbar.update(1)
    + 80
    + 81        parsed_line = generic_line_parser(
    + 82            line,
    + 83            delimiter,
    + 84            args,
    + 85            dataset_info.fw_map,
    + 86            dataset_info.column_names,
    + 87        )
    + 88
    + 89        if line_counter > 100_000:
    + 90            break
    + 91        out_scores_lab[line[0]].append(score_line(parsed_line))
    + 92
    + 93    for label, out_scores in out_scores_lab.items():
    + 94        out_df = pd.DataFrame(out_scores)
    + 95        os.makedirs(args.output_folder, exist_ok=True)
    + 96        for col in out_df.columns:
    + 97            sorted_vals = out_df[col].sort_values()
    + 98            plt.figure(figsize=(5, 5), dpi=300)
    + 99            plt.title(col + f' label: {label}')
    +100            plt.hist(
    +101                x=sorted_vals * 100,
    +102                color='black',
    +103                density=True,
    +104                bins=100,
    +105            )
    +106            if 'entropy' not in col:
    +107                plt.xlabel('Proportion of namespaces (%)')
    +108            else:
    +109                plt.xlabel('Row entropy')
    +110            plt.ylabel('Density')
    +111            plt.tight_layout()
    +112            fname = f'distPlot{col}_{label}.pdf'
    +113            plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
    +114            plt.cla()
    +115            plt.clf()
    +
+
\ No newline at end of file
diff --git a/docs/outrank/task_ranking.html b/docs/outrank/task_ranking.html
index ac5e3a0..f6176a9 100644
--- a/docs/outrank/task_ranking.html
+++ b/docs/outrank/task_ranking.html
@@ -3,7 +3,7 @@
 outrank.task_ranking API documentation
@@ -49,7 +49,7 @@

    API Documentation

    outrank.task_ranking

@@ -362,7 +362,7 @@

 def outrank_task_conduct_ranking(args: Any) -> None:

@@ -640,7 +640,7 @@

    @@ -826,4 +826,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/task_selftest.html b/docs/outrank/task_selftest.html
index f36e78e..cbbfeb9 100644
--- a/docs/outrank/task_selftest.html
+++ b/docs/outrank/task_selftest.html
@@ -3,7 +3,7 @@
 outrank.task_selftest API documentation
@@ -52,7 +52,7 @@

    API Documentation

    outrank.task_selftest

@@ -81,16 +81,16 @@

 22        'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
 23    )
 24    subprocess.run(
-25        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;',
+25        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
 26        shell=True,
 27    )
 28
 29    dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t')
 30
 31    logger.info("Verifying output's properties ..")
-32    assert dfx.shape[0] == 120
+32    assert dfx.shape[0] == 201
 33    assert dfx.shape[1] == 3
-34    assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)'
+34    assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)'
 35
 36    to_remove = ['ranking_outputs', 'test_data_synthetic']
 37    for path in to_remove:

@@ -99,6 +99,10 @@

 40            shutil.rmtree(path)
 41
 42    logger.info('All tests passed, OutRank seems in shape \N{winking face}')
+43
+44
+45if __name__ == '__main__':
+46    conduct_self_test()
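With the new __main__ guard, the suite can presumably also be launched as a module; a hedged sketch (module path inferred from this docs page) - note it shells out to the outrank CLI and writes and removes files in the working directory:

import runpy

runpy.run_module('outrank.task_selftest', run_name='__main__')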

    @@ -108,17 +112,17 @@

 logger = <Logger syn-logger (DEBUG)>

 def conduct_self_test():

@@ -132,16 +136,16 @@

 [the per-function view of conduct_self_test repeats the hunk above, shifted by one line, with the same three changes]

    @@ -339,4 +343,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/task_summary.html b/docs/outrank/task_summary.html
index 4c65d91..871d1ed 100644
--- a/docs/outrank/task_summary.html
+++ b/docs/outrank/task_summary.html
@@ -3,7 +3,7 @@
 outrank.task_summary API documentation
@@ -49,7 +49,7 @@

    API Documentation

    outrank.task_summary

@@ -93,43 +93,44 @@

 37
 38    min_score = np.min(final_df[f'Score {args.heuristic}'].values)
 39    max_score = np.max(final_df[f'Score {args.heuristic}'].values)
-40    final_df[f'Score {args.heuristic}'] = (
-41        final_df[f'Score {args.heuristic}'] - min_score
-42    ) / (max_score - min_score)
-43    logging.info(f'Storing summary files to {args.output_folder}')
-44    pd.set_option('display.max_rows', None, 'display.max_columns', None)
-45    singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
-46    final_df = final_df.reset_index(drop=True)
-47    final_df.to_csv(singles_path, sep='\t')
-48
-49    if args.interaction_order > 1:
-50        feature_store = defaultdict(list)
-51        for enx, row in final_df.iterrows():
-52            fname = row['Feature']
-53            score = row[f'Score {args.heuristic}']
-54            if 'AND' in fname:
-55                for el in fname.split('-')[0].split(' AND '):
-56                    feature_store[el].append(score)
-57
-58        final_aggregate_df = []
-59        for k, v in feature_store.items():
-60            final_aggregate_df.append(
-61                {
-62                    'Feature': k,
-63                    f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(
-64                        v,
-65                    ),
-66                },
-67            )
-68        final_aggregate_df = pd.DataFrame(final_aggregate_df)
-69        final_aggregate_df.to_csv(
-70            os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t',
-71        )
-72
-73    final_df = final_df[final_df['Feature'].str.contains('_tr_')]
-74    final_df.to_csv(
-75        singles_path.replace('.tsv', '_transformers_only_imp.tsv'), sep='\t',
-76    )
+40    if "MI" in args.heuristic:
+41        final_df[f'Score {args.heuristic}'] = (
+42            final_df[f'Score {args.heuristic}'] - min_score
+43        ) / (max_score - min_score)
+44    logging.info(f'Storing summary files to {args.output_folder}')
+45    pd.set_option('display.max_rows', None, 'display.max_columns', None)
+46    singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
+47    final_df = final_df.reset_index(drop=True)
+48    final_df.to_csv(singles_path, sep='\t')
+49
+50    if args.interaction_order > 1:
+51        feature_store = defaultdict(list)
+52        for enx, row in final_df.iterrows():
+53            fname = row['Feature']
+54            score = row[f'Score {args.heuristic}']
+55            if 'AND' in fname:
+56                for el in fname.split('-')[0].split(' AND '):
+57                    feature_store[el].append(score)
+58
+59        final_aggregate_df = []
+60        for k, v in feature_store.items():
+61            final_aggregate_df.append(
+62                {
+63                    'Feature': k,
+64                    f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(
+65                        v,
+66                    ),
+67                },
+68            )
+69        final_aggregate_df = pd.DataFrame(final_aggregate_df)
+70        final_aggregate_df.to_csv(
+71            os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t',
+72        )
+73
+74    final_df = final_df[final_df['Feature'].str.contains('_tr_')]
+75    final_df.to_csv(
+76        singles_path.replace('.tsv', '_transformers_only_imp.tsv'), sep='\t',
+77    )

@@ -137,7 +138,7 @@

 def outrank_task_result_summary(args):

@@ -172,47 +173,48 @@

 [the per-function view of outrank_task_result_summary repeats the hunk above, shifted by one line, with the same guarded normalization change]
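The substantive change in this hunk: min-max rescaling of the score column now runs only for mutual-information heuristics. A minimal sketch of the guard, with a stand-in heuristic name:

import pandas as pd

scores = pd.Series([1.0, 2.0, 3.0])
heuristic = 'MI-numba'  # stand-in; non-MI heuristics keep their native scale

if 'MI' in heuristic:
    scores = (scores - scores.min()) / (scores.max() - scores.min())
print(scores.tolist())  # [0.0, 0.5, 1.0]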

    @@ -398,4 +400,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/task_visualization.html b/docs/outrank/task_visualization.html
index 157be8c..4b606b4 100644
--- a/docs/outrank/task_visualization.html
+++ b/docs/outrank/task_visualization.html
@@ -3,7 +3,7 @@
 outrank.task_visualization API documentation
@@ -49,7 +49,7 @@

    API Documentation

    outrank.task_visualization

@@ -87,7 +87,7 @@

 def outrank_task_visualize_results(args):

@@ -112,7 +112,7 @@

    @@ -298,4 +298,4 @@

} });
\ No newline at end of file
diff --git a/docs/outrank/visualizations.html b/docs/outrank/visualizations.html
index 0d8575a..00e2bb1 100644
--- a/docs/outrank/visualizations.html
+++ b/docs/outrank/visualizations.html
@@ -3,7 +3,7 @@
 outrank.visualizations API documentation
@@ -46,10 +46,10 @@

    Submodules

    outrank.visualizations

\ No newline at end of file
diff --git a/docs/outrank/visualizations/ranking_visualization.html b/docs/outrank/visualizations/ranking_visualization.html
index 8c8d06d..0a06918 100644
--- a/docs/outrank/visualizations/ranking_visualization.html
+++ b/docs/outrank/visualizations/ranking_visualization.html
@@ -3,7 +3,7 @@
 outrank.visualizations.ranking_visualization API documentation
@@ -58,7 +58,7 @@

    API Documentation

    outrank.visualizations.ranking_visualization

@@ -410,7 +410,7 @@

 def visualize_hierarchical_clusters(triplet_dataframe: pandas.core.frame.DataFrame, output_folder: str, image_format: str = 'png', max_num_clusters: int = 100) -> None:

@@ -579,7 +579,7 @@

 def visualize_heatmap(triplets: pandas.core.frame.DataFrame, output_folder: str, image_format: str) -> None:

@@ -634,13 +634,13 @@

 def visualize_barplots(triplets: pandas.core.frame.DataFrame, output_folder: str, reference_json: str, image_format: str, label: str, heuristic: str) -> None:

@@ -749,13 +749,13 @@

 def visualize_all(triplets: pandas.core.frame.DataFrame, output_folder: str, label: str = '', reference_json: str = '', image_format: str = 'png', heuristic: str = 'MI') -> None:

@@ -977,4 +977,4 @@

    } }); - + \ No newline at end of file diff --git a/docs/search.js b/docs/search.js index 62f1434..871f608 100644 --- a/docs/search.js +++ b/docs/search.js @@ -1,6 +1,6 @@ window.pdocSearch = (function(){ /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();oWelcome to OutRank's documentation!

    \n\n

    All functions/methods can be searched for (search bar on the left).

    \n\n

    This tool enables fast screening of feature-feature interactions. Its purpose is to give the user fast insight into potential redundancies/anomalies in the data.\nIt is implemented to operate in _mini batches_: it traverses the raw data incrementally, refining the rankings as it goes along. The core operation, interaction ranking, outputs triplets that look as follows:

    \n\n
    featureA    featureB    0.512\nfeatureA    featureC    0.125\n
    \n\n

    Setup

    \n\n
    \n
    pip install outrank\n
    \n
    \n\n

    and test a minimal cycle with

    \n\n
    \n
    outrank --task selftest\n
    \n
    \n\n

    If this passes, you can be fairly certain OutRank will perform as intended. OutRank's primary use case is as a CLI tool; begin exploring with

    \n\n
    \n
    outrank --help\n
    \n
    \n\n

    Example use cases

    \n\n
      \n
    • A minimal showcase of performing feature ranking on a generic CSV is demonstrated with this example.

    • \n
    • More examples demonstrating OutRank's capabilities are also available.

    • \n
    \n\n

    OutRank as a Python library

    \n\n

    Once installed, _OutRank_ can be used like any other Python library. For example, generic feature ranking algorithms can be accessed as

    \n\n
    \n
    import numpy as np\nfrom outrank.algorithms.feature_ranking.ranking_mi_numba import (\n    mutual_info_estimator_numba,\n)\n\n# Some synthetic minimal data (Numpy vectors)\na = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32)\n\nlowest = np.array(np.random.permutation(a), dtype=np.int32)\nmedium = np.array([1, 1, 0, 0, 1, 1, 1, 1], dtype=np.int32)\nhigh = np.array([1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32)\n\nlowest_score = mutual_info_estimator_numba(\n    a, lowest, np.float32(1.0), False,\n)\nmedium_score = mutual_info_estimator_numba(\n    a, medium, np.float32(1.0), False,\n)\nhigh_score = mutual_info_estimator_numba(\n    a, high, np.float32(1.0), False,\n)\n\nscores = [lowest_score, medium_score, high_score]\nsorted_score_indices = np.argsort(scores)\nassert np.array_equal(sorted_score_indices, np.array([0, 1, 2]))\n
    \n
    \n"}, "outrank.algorithms": {"fullname": "outrank.algorithms", "modulename": "outrank.algorithms", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.feature_ranking": {"fullname": "outrank.algorithms.feature_ranking", "modulename": "outrank.algorithms.feature_ranking", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "numba_unique", "kind": "function", "doc": "

    Identify unique elements in an array, fast

    \n", "signature": "(a):", "funcdef": "def"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_conditional_entropy", "kind": "function", "doc": "

    \n", "signature": "(\tY_classes,\tclass_values,\tclass_var_shape,\tinitial_prob,\tnonzero_counts):", "funcdef": "def"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_entropies", "kind": "function", "doc": "

    Core entropy computation function

    \n", "signature": "(X, Y, all_events, f_values, f_value_counts, cardinality_correction):", "funcdef": "def"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "stratified_subsampling", "kind": "function", "doc": "

    \n", "signature": "(Y, X, approximation_factor, _f_values_X):", "funcdef": "def"}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "mutual_info_estimator_numba", "kind": "function", "doc": "

    Core estimator logic. Compute unique elements, subset if required

    \n", "signature": "(Y, X, approximation_factor=1.0, cardinality_correction=False):", "funcdef": "def"}, "outrank.algorithms.importance_estimator": {"fullname": "outrank.algorithms.importance_estimator", "modulename": "outrank.algorithms.importance_estimator", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.importance_estimator.logger": {"fullname": "outrank.algorithms.importance_estimator.logger", "modulename": "outrank.algorithms.importance_estimator", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, "outrank.algorithms.importance_estimator.sklearn_MI": {"fullname": "outrank.algorithms.importance_estimator.sklearn_MI", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_MI", "kind": "function", "doc": "

    \n", "signature": "(vector_first: Any, vector_second: Any) -> float:", "funcdef": "def"}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"fullname": "outrank.algorithms.importance_estimator.sklearn_surrogate", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_surrogate", "kind": "function", "doc": "

    \n", "signature": "(vector_first: Any, vector_second: Any, surrogate_model: str) -> float:", "funcdef": "def"}, "outrank.algorithms.importance_estimator.numba_mi": {"fullname": "outrank.algorithms.importance_estimator.numba_mi", "modulename": "outrank.algorithms.importance_estimator", "qualname": "numba_mi", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio):", "funcdef": "def"}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"fullname": "outrank.algorithms.importance_estimator.sklearn_mi_adj", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_mi_adj", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second):", "funcdef": "def"}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_pairwise", "kind": "function", "doc": "

    A method for parallel importance estimation. As interaction scoring is independent, individual scores can be computed in parallel.

    \n", "signature": "(combination, args, tmp_df):", "funcdef": "def"}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"fullname": "outrank.algorithms.importance_estimator.rank_features_3MR", "modulename": "outrank.algorithms.importance_estimator", "qualname": "rank_features_3MR", "kind": "function", "doc": "

    \n", "signature": "(\trelevance_dict: dict[str, float],\tredundancy_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\trelational_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\tstrategy: str = 'median',\talpha: float = 1,\tbeta: float = 1) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_nonmyopic", "kind": "function", "doc": "

    \n", "signature": "(args: Any, tmp_df: pandas.core.frame.DataFrame):", "funcdef": "def"}, "outrank.algorithms.sketches": {"fullname": "outrank.algorithms.sketches", "modulename": "outrank.algorithms.sketches", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_cms": {"fullname": "outrank.algorithms.sketches.counting_cms", "modulename": "outrank.algorithms.sketches.counting_cms", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"fullname": "outrank.algorithms.sketches.counting_cms.cms_hash", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "cms_hash", "kind": "function", "doc": "

    \n", "signature": "(x, seed, width):", "funcdef": "def"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch", "kind": "class", "doc": "

    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.

    \n"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.__init__", "kind": "function", "doc": "

    \n", "signature": "(depth=6, width=32768, M=None)"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.depth", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.width", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.width", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.hash_seeds", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.M", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.M", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.add", "kind": "function", "doc": "

    \n", "signature": "(self, x, delta=1):", "funcdef": "def"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst, delta=1):", "funcdef": "def"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.query", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.query", "kind": "function", "doc": "

    \n", "signature": "(self, x):", "funcdef": "def"}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.get_matrix", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, "outrank.algorithms.sketches.counting_counters_ordinary": {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter", "kind": "class", "doc": "

    A memory-efficient counter with an upper bound on the number of tracked values (backed by a collections.Counter).

    \n"}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.__init__", "kind": "function", "doc": "

    \n", "signature": "(bound: int = 30000)"}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.max_bound_thr", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.default_counter", "kind": "variable", "doc": "

    \n", "annotation": ": collections.Counter"}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst):", "funcdef": "def"}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.add", "kind": "function", "doc": "

    \n", "signature": "(self, val):", "funcdef": "def"}, "outrank.algorithms.sketches.counting_ultiloglog": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "kind": "module", "doc": "

    This module implements a probabilistic data structure that can estimate the cardinality of large multisets in a single pass using little auxiliary memory

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache", "kind": "class", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.__init__", "kind": "function", "doc": "

    \n", "signature": "(error_rate=0.005)"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.p", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.m", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_set", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_size", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.width", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.hll_flag", "kind": "variable", "doc": "

    \n"}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.add", "kind": "function", "doc": "

    \n", "signature": "(self, value):", "funcdef": "def"}, "outrank.algorithms.synthetic_data_generators": {"fullname": "outrank.algorithms.synthetic_data_generators", "modulename": "outrank.algorithms.synthetic_data_generators", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "kind": "module", "doc": "

    \n"}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "qualname": "generate_random_matrix", "kind": "function", "doc": "

    \n", "signature": "(num_features=100, size=20000):", "funcdef": "def"}, "outrank.core_ranking": {"fullname": "outrank.core_ranking", "modulename": "outrank.core_ranking", "kind": "module", "doc": "

    \n"}, "outrank.core_ranking.logger": {"fullname": "outrank.core_ranking.logger", "modulename": "outrank.core_ranking", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"fullname": "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_CARDINALITY_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"fullname": "outrank.core_ranking.GLOBAL_COUNTS_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_COUNTS_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"fullname": "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_RARE_VALUE_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]", "default_value": "Counter()"}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"fullname": "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_PRIOR_COMB_COUNTS", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, int]", "default_value": "Counter()"}, "outrank.core_ranking.IGNORED_VALUES": {"fullname": "outrank.core_ranking.IGNORED_VALUES", "modulename": "outrank.core_ranking", "qualname": "IGNORED_VALUES", "kind": "variable", "doc": "

    \n", "default_value": "set()"}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"fullname": "outrank.core_ranking.HYPERLL_ERROR_BOUND", "modulename": "outrank.core_ranking", "qualname": "HYPERLL_ERROR_BOUND", "kind": "variable", "doc": "

    \n", "default_value": "0.02"}, "outrank.core_ranking.MAX_FEATURES_3MR": {"fullname": "outrank.core_ranking.MAX_FEATURES_3MR", "modulename": "outrank.core_ranking", "qualname": "MAX_FEATURES_3MR", "kind": "variable", "doc": "

    \n", "default_value": "10000"}, "outrank.core_ranking.prior_combinations_sample": {"fullname": "outrank.core_ranking.prior_combinations_sample", "modulename": "outrank.core_ranking", "qualname": "prior_combinations_sample", "kind": "function", "doc": "

    Make sure only the relevant subspace of combinations is selected, based on prior counts

    \n", "signature": "(\tcombinations: list[tuple[typing.Any, ...]],\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, "outrank.core_ranking.get_combinations_from_columns": {"fullname": "outrank.core_ranking.get_combinations_from_columns", "modulename": "outrank.core_ranking", "qualname": "get_combinations_from_columns", "kind": "function", "doc": "

    Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope

    \n", "signature": "(\tall_columns: pandas.core.indexes.base.Index,\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, "outrank.core_ranking.mixed_rank_graph": {"fullname": "outrank.core_ranking.mixed_rank_graph", "modulename": "outrank.core_ranking", "qualname": "mixed_rank_graph", "kind": "function", "doc": "

    Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tcpu_pool: Any,\tpbar: Any) -> outrank.core_utils.BatchRankingSummary:", "funcdef": "def"}, "outrank.core_ranking.enrich_with_transformations": {"fullname": "outrank.core_ranking.enrich_with_transformations", "modulename": "outrank.core_ranking", "qualname": "enrich_with_transformations", "kind": "function", "doc": "

    Construct a collection of new features based on pre-defined transformations/rules

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnum_col_types: set[str],\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_combined_features": {"fullname": "outrank.core_ranking.compute_combined_features", "modulename": "outrank.core_ranking", "qualname": "compute_combined_features", "kind": "function", "doc": "

    Compute higher-order features via an xxhash-based trick.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any,\tis_3mr: bool = False) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_expanded_multivalue_features": {"fullname": "outrank.core_ranking.compute_expanded_multivalue_features", "modulename": "outrank.core_ranking", "qualname": "compute_expanded_multivalue_features", "kind": "function", "doc": "

    Compute a one-hot encoded feature space based on each designated multivalue feature. E.g., a feature with value \"a,b,c\" becomes three features, whose values indicate the presence of a given value in the chosen multivalue feature.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_subfeatures": {"fullname": "outrank.core_ranking.compute_subfeatures", "modulename": "outrank.core_ranking", "qualname": "compute_subfeatures", "kind": "function", "doc": "

    Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.\n->: one-sided construction - every value from the left-side feature is taken as-is; values from the right-side feature are considered separately.\n<->: two-sided construction - values from both sides are combined: each value from A is combined with each value from B, forming |A|*|B| new features (one-hot encoded)

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.include_noisy_features": {"fullname": "outrank.core_ranking.include_noisy_features", "modulename": "outrank.core_ranking", "qualname": "include_noisy_features", "kind": "function", "doc": "

    Add randomized features that serve as a sanity check

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.compute_coverage": {"fullname": "outrank.core_ranking.compute_coverage", "modulename": "outrank.core_ranking", "qualname": "compute_coverage", "kind": "function", "doc": "

    Compute coverage of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, "outrank.core_ranking.compute_feature_memory_consumption": {"fullname": "outrank.core_ranking.compute_feature_memory_consumption", "modulename": "outrank.core_ranking", "qualname": "compute_feature_memory_consumption", "kind": "function", "doc": "

    An approximation of how much memory features take up

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, "outrank.core_ranking.compute_value_counts": {"fullname": "outrank.core_ranking.compute_value_counts", "modulename": "outrank.core_ranking", "qualname": "compute_value_counts", "kind": "function", "doc": "

    Update the count structure

    \n", "signature": "(input_dataframe: pandas.core.frame.DataFrame, args: Any):", "funcdef": "def"}, "outrank.core_ranking.compute_cardinalities": {"fullname": "outrank.core_ranking.compute_cardinalities", "modulename": "outrank.core_ranking", "qualname": "compute_cardinalities", "kind": "function", "doc": "

    Compute cardinalities of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tpbar: Any,\tmax_unique_hist_constraint: int) -> None:", "funcdef": "def"}, "outrank.core_ranking.compute_bounds_increment": {"fullname": "outrank.core_ranking.compute_bounds_increment", "modulename": "outrank.core_ranking", "qualname": "compute_bounds_increment", "kind": "function", "doc": "

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnumeric_column_types: set[str]) -> dict[str, typing.Any]:", "funcdef": "def"}, "outrank.core_ranking.compute_batch_ranking": {"fullname": "outrank.core_ranking.compute_batch_ranking", "modulename": "outrank.core_ranking", "qualname": "compute_batch_ranking", "kind": "function", "doc": "

    Enrich the feature space and compute the batch importances

    \n", "signature": "(\tline_tmp_storage: list[list[typing.Any]],\tnumeric_column_types: set[str],\targs: Any,\tcpu_pool: Any,\tcolumn_descriptions: list[str],\tlogger: Any,\tpbar: Any) -> tuple[outrank.core_utils.BatchRankingSummary, dict[str, typing.Any], dict[str, set[str]], dict[str, set[str]]]:", "funcdef": "def"}, "outrank.core_ranking.get_num_of_instances": {"fullname": "outrank.core_ranking.get_num_of_instances", "modulename": "outrank.core_ranking", "qualname": "get_num_of_instances", "kind": "function", "doc": "

    Count the number of lines in a file, fast - useful for progress logging

    \n", "signature": "(fname: str) -> int:", "funcdef": "def"}, "outrank.core_ranking.get_grouped_df": {"fullname": "outrank.core_ranking.get_grouped_df", "modulename": "outrank.core_ranking", "qualname": "get_grouped_df", "kind": "function", "doc": "

    A helper method that enables median-based aggregation after processing

    \n", "signature": "(\timportances_df_list: list[tuple[str, str, float]]) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.core_ranking.checkpoint_importances_df": {"fullname": "outrank.core_ranking.checkpoint_importances_df", "modulename": "outrank.core_ranking", "qualname": "checkpoint_importances_df", "kind": "function", "doc": "

    A helper which stores intermediate state - useful for longer runs

    \n", "signature": "(importances_batch: list[tuple[str, str, float]]) -> None:", "funcdef": "def"}, "outrank.core_ranking.estimate_importances_minibatches": {"fullname": "outrank.core_ranking.estimate_importances_minibatches", "modulename": "outrank.core_ranking", "qualname": "estimate_importances_minibatches", "kind": "function", "doc": "

    Interaction score estimator - suitable, for example, for CSV-like input data types.\nThis type of data is normally a single large CSV, meaning that minibatch processing needs to\nhappen during incremental handling of the file (that's not the case for pre-separated ob data)

    \n", "signature": "(\tinput_file: str,\tcolumn_descriptions: list,\tfw_col_mapping: dict[str, str],\tnumeric_column_types: set,\tbatch_size: int = 100000,\targs: Any = None,\tdata_encoding: str = 'utf-8',\tcpu_pool: Any = None,\tdelimiter: str = '\\t',\tfeature_construction_mode: bool = False,\tlogger: Any = None) -> tuple[list[dict[str, typing.Any]], typing.Any, dict[typing.Any, typing.Any], list[dict[str, typing.Any]], list[dict[str, set[str]]], collections.defaultdict[str, list[set[str]]], dict[str, typing.Any], dict[str, typing.Any], dict[str, typing.Any]]:", "funcdef": "def"}, "outrank.core_selftest": {"fullname": "outrank.core_selftest", "modulename": "outrank.core_selftest", "kind": "module", "doc": "

    \n"}, "outrank.core_utils": {"fullname": "outrank.core_utils", "modulename": "outrank.core_utils", "kind": "module", "doc": "

    \n"}, "outrank.core_utils.pro_tips": {"fullname": "outrank.core_utils.pro_tips", "modulename": "outrank.core_utils", "qualname": "pro_tips", "kind": "variable", "doc": "

    \n", "default_value": "['OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"', 'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).', 'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!', 'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).', 'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)', 'Visualization part also includes clustering - this might be very insightful!', 'By default OutRank includes feature cardinality and coverage in feature names (card; cov)', 'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.', 'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).', 'Give it as many threads as physically possible (--num_threads).', 'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.', 'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).', 'Your target can be any feature! (explaining one feature with others)', 'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).', 'Each feature is named as featureName(cardinality, coverage in percents) in the final files.', 'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.']"}, "outrank.core_utils.write_json_dump_to_file": {"fullname": "outrank.core_utils.write_json_dump_to_file", "modulename": "outrank.core_utils", "qualname": "write_json_dump_to_file", "kind": "function", "doc": "

    \n", "signature": "(args: Any, config_name: str) -> None:", "funcdef": "def"}, "outrank.core_utils.internal_hash": {"fullname": "outrank.core_utils.internal_hash", "modulename": "outrank.core_utils", "qualname": "internal_hash", "kind": "function", "doc": "

    A generic internal hash used throughout the ranking procedure - the seed is hardcoded for reproducibility

    \n", "signature": "(input_obj: str) -> str:", "funcdef": "def"}, "outrank.core_utils.DatasetInformationStorage": {"fullname": "outrank.core_utils.DatasetInformationStorage", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage", "kind": "class", "doc": "

    A generic class for holding properties of a given type of dataset

    \n"}, "outrank.core_utils.DatasetInformationStorage.__init__": {"fullname": "outrank.core_utils.DatasetInformationStorage.__init__", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdata_path: str,\tcolumn_names: list[str],\tcolumn_types: set[str],\tcol_delimiter: str | None,\tencoding: str,\tfw_map: dict[str, str] | None)"}, "outrank.core_utils.DatasetInformationStorage.data_path": {"fullname": "outrank.core_utils.DatasetInformationStorage.data_path", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.data_path", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, "outrank.core_utils.DatasetInformationStorage.column_names": {"fullname": "outrank.core_utils.DatasetInformationStorage.column_names", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_names", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, "outrank.core_utils.DatasetInformationStorage.column_types": {"fullname": "outrank.core_utils.DatasetInformationStorage.column_types", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_types", "kind": "variable", "doc": "

    \n", "annotation": ": set[str]"}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"fullname": "outrank.core_utils.DatasetInformationStorage.col_delimiter", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.col_delimiter", "kind": "variable", "doc": "

    \n", "annotation": ": str | None"}, "outrank.core_utils.DatasetInformationStorage.encoding": {"fullname": "outrank.core_utils.DatasetInformationStorage.encoding", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.encoding", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"fullname": "outrank.core_utils.DatasetInformationStorage.fw_map", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.fw_map", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, str] | None"}, "outrank.core_utils.NumericFeatureSummary": {"fullname": "outrank.core_utils.NumericFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary", "kind": "class", "doc": "

    A generic class storing numeric feature statistics

    \n"}, "outrank.core_utils.NumericFeatureSummary.__init__": {"fullname": "outrank.core_utils.NumericFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tfeature_name: str,\tminimum: float,\tmaximum: float,\tmedian: float,\tnum_unique: int)"}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"fullname": "outrank.core_utils.NumericFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, "outrank.core_utils.NumericFeatureSummary.minimum": {"fullname": "outrank.core_utils.NumericFeatureSummary.minimum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.minimum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, "outrank.core_utils.NumericFeatureSummary.maximum": {"fullname": "outrank.core_utils.NumericFeatureSummary.maximum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.maximum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, "outrank.core_utils.NumericFeatureSummary.median": {"fullname": "outrank.core_utils.NumericFeatureSummary.median", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.median", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"fullname": "outrank.core_utils.NumericFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, "outrank.core_utils.NominalFeatureSummary": {"fullname": "outrank.core_utils.NominalFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary", "kind": "class", "doc": "

    A generic class storing nominal feature statistics

    \n"}, "outrank.core_utils.NominalFeatureSummary.__init__": {"fullname": "outrank.core_utils.NominalFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(feature_name: str, num_unique: int)"}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"fullname": "outrank.core_utils.NominalFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"fullname": "outrank.core_utils.NominalFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, "outrank.core_utils.BatchRankingSummary": {"fullname": "outrank.core_utils.BatchRankingSummary", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary", "kind": "class", "doc": "

    A generic class representing batched ranking results

    \n"}, "outrank.core_utils.BatchRankingSummary.__init__": {"fullname": "outrank.core_utils.BatchRankingSummary.__init__", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttriplet_scores: list[tuple[str, str, float]],\tstep_times: dict[str, typing.Any])"}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"fullname": "outrank.core_utils.BatchRankingSummary.triplet_scores", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.triplet_scores", "kind": "variable", "doc": "

    \n", "annotation": ": list[tuple[str, str, float]]"}, "outrank.core_utils.BatchRankingSummary.step_times": {"fullname": "outrank.core_utils.BatchRankingSummary.step_times", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.step_times", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]"}, "outrank.core_utils.display_random_tip": {"fullname": "outrank.core_utils.display_random_tip", "modulename": "outrank.core_utils", "qualname": "display_random_tip", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, "outrank.core_utils.get_dataset_info": {"fullname": "outrank.core_utils.get_dataset_info", "modulename": "outrank.core_utils", "qualname": "get_dataset_info", "kind": "function", "doc": "

    \n", "signature": "(args: Any):", "funcdef": "def"}, "outrank.core_utils.display_tool_name": {"fullname": "outrank.core_utils.display_tool_name", "modulename": "outrank.core_utils", "qualname": "display_tool_name", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, "outrank.core_utils.parse_ob_line": {"fullname": "outrank.core_utils.parse_ob_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_line", "kind": "function", "doc": "

    Outbrain line parsing - generic TSVs

    \n", "signature": "(line_string: str, delimiter: str = '\\t', args: Any = None) -> list[str]:", "funcdef": "def"}, "outrank.core_utils.parse_ob_line_vw": {"fullname": "outrank.core_utils.parse_ob_line_vw", "modulename": "outrank.core_utils", "qualname": "parse_ob_line_vw", "kind": "function", "doc": "

    Parse a sparse vw line into a pandas df with pre-defined namespace

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping=None,\ttable_header=None,\tinclude_namespace_info=False) -> list[str | None]:", "funcdef": "def"}, "outrank.core_utils.parse_ob_csv_line": {"fullname": "outrank.core_utils.parse_ob_csv_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_csv_line", "kind": "function", "doc": "

    Data can have commas within JSON field dumps

    \n", "signature": "(line_string: str, delimiter: str = ',', args: Any = None) -> list[str]:", "funcdef": "def"}, "outrank.core_utils.generic_line_parser": {"fullname": "outrank.core_utils.generic_line_parser", "modulename": "outrank.core_utils", "qualname": "generic_line_parser", "kind": "function", "doc": "

    A generic method aimed at parsing data from different sources.

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping: Any = None,\ttable_header: Any = None) -> list[typing.Any]:", "funcdef": "def"}, "outrank.core_utils.read_reference_json": {"fullname": "outrank.core_utils.read_reference_json", "modulename": "outrank.core_utils", "qualname": "read_reference_json", "kind": "function", "doc": "

    A helper method for reading a JSON

    \n", "signature": "(json_path) -> dict[str, dict]:", "funcdef": "def"}, "outrank.core_utils.parse_namespace": {"fullname": "outrank.core_utils.parse_namespace", "modulename": "outrank.core_utils", "qualname": "parse_namespace", "kind": "function", "doc": "

    Parse the feature namespace for type awareness

    \n", "signature": "(namespace_path: str) -> tuple[set[str], dict[str, str]]:", "funcdef": "def"}, "outrank.core_utils.read_column_names": {"fullname": "outrank.core_utils.read_column_names", "modulename": "outrank.core_utils", "qualname": "read_column_names", "kind": "function", "doc": "

    Read the column header

    \n", "signature": "(mapping_file: str) -> list[str]:", "funcdef": "def"}, "outrank.core_utils.parse_ob_vw_feature_information": {"fullname": "outrank.core_utils.parse_ob_vw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_vw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_ob_raw_feature_information": {"fullname": "outrank.core_utils.parse_ob_raw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_raw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_ob_feature_information": {"fullname": "outrank.core_utils.parse_ob_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_csv_with_description_information": {"fullname": "outrank.core_utils.parse_csv_with_description_information", "modulename": "outrank.core_utils", "qualname": "parse_csv_with_description_information", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.parse_csv_raw": {"fullname": "outrank.core_utils.parse_csv_raw", "modulename": "outrank.core_utils", "qualname": "parse_csv_raw", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, "outrank.core_utils.extract_features_from_reference_JSON": {"fullname": "outrank.core_utils.extract_features_from_reference_JSON", "modulename": "outrank.core_utils", "qualname": "extract_features_from_reference_JSON", "kind": "function", "doc": "

    Given a model's JSON, extract unique features

    \n", "signature": "(json_path: str, combined_features_only=False) -> set[typing.Any]:", "funcdef": "def"}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"fullname": "outrank.core_utils.summarize_feature_bounds_for_transformers", "modulename": "outrank.core_utils", "qualname": "summarize_feature_bounds_for_transformers", "kind": "function", "doc": "

    Summarization auxiliary method for generating JSON-based specs

    \n", "signature": "(\tbounds_object_storage: Any,\tfeature_types: list[str],\ttask_name: str,\tlabel_name: str,\tgranularity: int = 15,\toutput_summary_table_only: bool = False):", "funcdef": "def"}, "outrank.core_utils.summarize_rare_counts": {"fullname": "outrank.core_utils.summarize_rare_counts", "modulename": "outrank.core_utils", "qualname": "summarize_rare_counts", "kind": "function", "doc": "

    Write rare values

    \n", "signature": "(\tterm_counter: Any,\targs: Any,\tcardinality_object: Any,\tobject_info: outrank.core_utils.DatasetInformationStorage) -> None:", "funcdef": "def"}, "outrank.feature_transformations": {"fullname": "outrank.feature_transformations", "modulename": "outrank.feature_transformations", "kind": "module", "doc": "

    \n"}, "outrank.feature_transformations.feature_transformer_vault": {"fullname": "outrank.feature_transformations.feature_transformer_vault", "modulename": "outrank.feature_transformations.feature_transformer_vault", "kind": "module", "doc": "

    \n"}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "kind": "module", "doc": "

    \n"}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "MINIMAL_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)'}"}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "DEFAULT_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)'}"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "kind": "module", "doc": "

    \n"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "FW_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)', '_tr_fw_sqrt_res_1_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*1,0), 0))', '_tr_fw_log_res_1_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*1,0), 0))', '_tr_fw_log_res_1_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*1,0), 0))', '_tr_fw_log_res_1_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*1,0), 0))', '_tr_fw_log_res_1_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*1,0), 0))', '_tr_fw_log_res_1_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*1,0), 0))', '_tr_fw_log_res_1_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*1,0), 0))', '_tr_fw_log_res_1_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*1,0), 0))', '_tr_fw_log_res_1_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*1,0), 0))', '_tr_fw_sqrt_res_10_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*10,0), 0))', '_tr_fw_log_res_10_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*10,0), 0))', '_tr_fw_log_res_10_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*10,0), 0))', '_tr_fw_log_res_10_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*10,0), 0))', '_tr_fw_log_res_10_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*10,0), 0))', '_tr_fw_log_res_10_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*10,0), 0))', '_tr_fw_log_res_10_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*10,0), 0))', '_tr_fw_log_res_10_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*10,0), 0))', '_tr_fw_log_res_10_gt_96': 'np.where(X <96, X, 
np.where(X >96, np.round(np.log(X-96)*10,0), 0))', '_tr_fw_sqrt_res_50_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*50,0), 0))', '_tr_fw_log_res_50_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*50,0), 0))', '_tr_fw_log_res_50_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*50,0), 0))', '_tr_fw_log_res_50_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*50,0), 0))', '_tr_fw_log_res_50_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*50,0), 0))', '_tr_fw_log_res_50_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*50,0), 0))', '_tr_fw_log_res_50_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*50,0), 0))', '_tr_fw_log_res_50_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*50,0), 0))', '_tr_fw_log_res_50_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*50,0), 0))', '_tr_fw_sqrt_res_100_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*100,0), 0))', '_tr_fw_log_res_100_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*100,0), 0))', '_tr_fw_log_res_100_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*100,0), 0))', '_tr_fw_log_res_100_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*100,0), 0))', '_tr_fw_log_res_100_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*100,0), 0))', '_tr_fw_log_res_100_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*100,0), 0))', '_tr_fw_log_res_100_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*100,0), 0))', '_tr_fw_log_res_100_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*100,0), 0))', '_tr_fw_log_res_100_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*100,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.02': 
'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*1,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*10,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.02': 'np.where(X <0.02,X, 
np.where(X>0.02, np.round(np.log(X-0.02)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*50,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*100,0), 0))'}"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "resolution_range", "kind": "variable", "doc": "
    \n", "default_value": "[1, 10, 50, 100]"}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "greater_than_range", "kind": "variable", "doc": "
    \n", "default_value": "[1, 2, 4, 8, 16, 32, 64, 96]"}, "outrank.feature_transformations.ranking_transformers": {"fullname": "outrank.feature_transformations.ranking_transformers", "modulename": "outrank.feature_transformations.ranking_transformers", "kind": "module", "doc": "
    \n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise", "kind": "class", "doc": "
    \n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.noise_preset", "kind": "variable", "doc": "
    \n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.construct_new_features", "kind": "function", "doc": "
    Generate a few standard noise distributions
    \n", "signature": "(self, dataframe: pandas.core.frame.DataFrame, label_column=None):", "funcdef": "def"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric", "kind": "class", "doc": "
    \n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.__init__", "kind": "function", "doc": "
    \n", "signature": "(numeric_column_names: set[str], preset: str = 'default')"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.numeric_column_names", "kind": "variable", "doc": "
    \n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.constructed_feature_names", "kind": "variable", "doc": "
    \n", "annotation": ": set[str]"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.max_maj_support", "kind": "variable", "doc": "
    \n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.nan_prop_support", "kind": "variable", "doc": "
    \n"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.get_vals", "kind": "function", "doc": "
    \n", "signature": "(self, tmp_df: pandas.core.frame.DataFrame, col_name: str) -> Any:", "funcdef": "def"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_baseline_features", "kind": "function", "doc": "
    \n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_new_features", "kind": "function", "doc": "
    \n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, "outrank.task_generators": {"fullname": "outrank.task_generators", "modulename": "outrank.task_generators", "kind": "module", "doc": "
    \n"}, "outrank.task_generators.logger": {"fullname": "outrank.task_generators.logger", "modulename": "outrank.task_generators", "qualname": "logger", "kind": "variable", "doc": "
    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, "outrank.task_generators.outrank_task_generate_data_set": {"fullname": "outrank.task_generators.outrank_task_generate_data_set", "modulename": "outrank.task_generators", "qualname": "outrank_task_generate_data_set", "kind": "function", "doc": "
    Core method for generating data sets
    \n", "signature": "(args):", "funcdef": "def"}, "outrank.task_ranking": {"fullname": "outrank.task_ranking", "modulename": "outrank.task_ranking", "kind": "module", "doc": "
    \n"}, "outrank.task_ranking.outrank_task_conduct_ranking": {"fullname": "outrank.task_ranking.outrank_task_conduct_ranking", "modulename": "outrank.task_ranking", "qualname": "outrank_task_conduct_ranking", "kind": "function", "doc": "
    \n", "signature": "(args: Any) -> None:", "funcdef": "def"}, "outrank.task_selftest": {"fullname": "outrank.task_selftest", "modulename": "outrank.task_selftest", "kind": "module", "doc": "
    \n"}, "outrank.task_selftest.logger": {"fullname": "outrank.task_selftest.logger", "modulename": "outrank.task_selftest", "qualname": "logger", "kind": "variable", "doc": "
    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, "outrank.task_selftest.conduct_self_test": {"fullname": "outrank.task_selftest.conduct_self_test", "modulename": "outrank.task_selftest", "qualname": "conduct_self_test", "kind": "function", "doc": "
    \n", "signature": "():", "funcdef": "def"}, "outrank.task_summary": {"fullname": "outrank.task_summary", "modulename": "outrank.task_summary", "kind": "module", "doc": "
    \n"}, "outrank.task_summary.outrank_task_result_summary": {"fullname": "outrank.task_summary.outrank_task_result_summary", "modulename": "outrank.task_summary", "qualname": "outrank_task_result_summary", "kind": "function", "doc": "
    \n", "signature": "(args):", "funcdef": "def"}, "outrank.task_visualization": {"fullname": "outrank.task_visualization", "modulename": "outrank.task_visualization", "kind": "module", "doc": "
    \n"}, "outrank.task_visualization.outrank_task_visualize_results": {"fullname": "outrank.task_visualization.outrank_task_visualize_results", "modulename": "outrank.task_visualization", "qualname": "outrank_task_visualize_results", "kind": "function", "doc": "
    \n", "signature": "(args):", "funcdef": "def"}, "outrank.visualizations": {"fullname": "outrank.visualizations", "modulename": "outrank.visualizations", "kind": "module", "doc": "
    \n"}, "outrank.visualizations.ranking_visualization": {"fullname": "outrank.visualizations.ranking_visualization", "modulename": "outrank.visualizations.ranking_visualization", "kind": "module", "doc": "
    \n"}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"fullname": "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_hierarchical_clusters", "kind": "function", "doc": "
    A method for visualization of hierarchical clusters w.r.t. different linkage functions
    \n", "signature": "(\ttriplet_dataframe: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str = 'png',\tmax_num_clusters: int = 100) -> None:", "funcdef": "def"}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"fullname": "outrank.visualizations.ranking_visualization.visualize_heatmap", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_heatmap", "kind": "function", "doc": "
    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str) -> None:", "funcdef": "def"}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"fullname": "outrank.visualizations.ranking_visualization.visualize_barplots", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_barplots", "kind": "function", "doc": "
    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\treference_json: str,\timage_format: str,\tlabel: str,\theuristic: str) -> None:", "funcdef": "def"}, "outrank.visualizations.ranking_visualization.visualize_all": {"fullname": "outrank.visualizations.ranking_visualization.visualize_all", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_all", "kind": "function", "doc": "
    A method for visualization of the obtained feature interaction maps.
    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\tlabel: str = '',\treference_json: str = '',\timage_format: str = 'png',\theuristic: str = 'MI') -> None:", "funcdef": "def"}}, "docInfo": {"outrank": {"qualname": 0, "fullname": 1, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 896}, "outrank.algorithms": {"qualname": 0, "fullname": 2, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking": {"qualname": 0, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"qualname": 0, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"qualname": 2, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 9}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"qualname": 3, "fullname": 10, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"qualname": 2, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 41, "bases": 0, "doc": 6}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"qualname": 2, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 30, "bases": 0, "doc": 3}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"qualname": 4, "fullname": 11, "annotation": 0, "default_value": 0, "signature": 39, "bases": 0, "doc": 11}, "outrank.algorithms.importance_estimator": {"qualname": 0, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.logger": {"qualname": 1, "fullname": 5, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.sklearn_MI": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 31, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.numba_mi": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 31, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"qualname": 3, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 18, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"qualname": 4, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 22, "bases": 0, "doc": 21}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"qualname": 3, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 204, "bases": 0, "doc": 3}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"qualname": 4, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 3}, "outrank.algorithms.sketches": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms": {"qualname": 0, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, 
"outrank.algorithms.sketches.counting_cms.cms_hash": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 21, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"qualname": 1, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 19}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 34, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_counters_ordinary": {"qualname": 0, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"qualname": 1, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 19}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"qualname": 3, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 21, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"qualname": 4, "fullname": 10, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"qualname": 3, "fullname": 9, "annotation": 3, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"qualname": 3, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"qualname": 2, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog": {"qualname": 0, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 26}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"qualname": 1, "fullname": 6, "annotation": 0, 
"default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"qualname": 2, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.algorithms.synthetic_data_generators": {"qualname": 0, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"qualname": 0, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"qualname": 3, "fullname": 10, "annotation": 0, "default_value": 0, "signature": 27, "bases": 0, "doc": 3}, "outrank.core_ranking": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.logger": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"qualname": 3, "fullname": 6, "annotation": 5, "default_value": 1, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"qualname": 3, "fullname": 6, "annotation": 5, "default_value": 1, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"qualname": 4, "fullname": 7, "annotation": 4, "default_value": 2, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"qualname": 4, "fullname": 7, "annotation": 4, "default_value": 2, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.IGNORED_VALUES": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 2, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 2, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.MAX_FEATURES_3MR": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 1, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_ranking.prior_combinations_sample": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 75, "bases": 0, "doc": 15}, 
"outrank.core_ranking.get_combinations_from_columns": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 74, "bases": 0, "doc": 16}, "outrank.core_ranking.mixed_rank_graph": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 81, "bases": 0, "doc": 19}, "outrank.core_ranking.enrich_with_transformations": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 92, "bases": 0, "doc": 13}, "outrank.core_ranking.compute_combined_features": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 103, "bases": 0, "doc": 11}, "outrank.core_ranking.compute_expanded_multivalue_features": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 84, "bases": 0, "doc": 41}, "outrank.core_ranking.compute_subfeatures": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 84, "bases": 0, "doc": 70}, "outrank.core_ranking.include_noisy_features": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 73, "bases": 0, "doc": 11}, "outrank.core_ranking.compute_coverage": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 63, "bases": 0, "doc": 7}, "outrank.core_ranking.compute_feature_memory_consumption": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 63, "bases": 0, "doc": 10}, "outrank.core_ranking.compute_value_counts": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 6}, "outrank.core_ranking.compute_cardinalities": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 61, "bases": 0, "doc": 7}, "outrank.core_ranking.compute_bounds_increment": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 72, "bases": 0, "doc": 3}, "outrank.core_ranking.compute_batch_ranking": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 197, "bases": 0, "doc": 11}, "outrank.core_ranking.get_num_of_instances": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 19, "bases": 0, "doc": 15}, "outrank.core_ranking.get_grouped_df": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 59, "bases": 0, "doc": 12}, "outrank.core_ranking.checkpoint_importances_df": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 42, "bases": 0, "doc": 12}, "outrank.core_ranking.estimate_importances_minibatches": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 418, "bases": 0, "doc": 48}, "outrank.core_selftest": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.pro_tips": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 303, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.write_json_dump_to_file": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 30, "bases": 0, "doc": 3}, "outrank.core_utils.internal_hash": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 20, "bases": 0, "doc": 17}, "outrank.core_utils.DatasetInformationStorage": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 14}, 
"outrank.core_utils.DatasetInformationStorage.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 111, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.data_path": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.column_names": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.column_types": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"qualname": 3, "fullname": 6, "annotation": 4, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.encoding": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"qualname": 3, "fullname": 6, "annotation": 5, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 9}, "outrank.core_utils.NumericFeatureSummary.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 61, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.minimum": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.maximum": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.median": {"qualname": 2, "fullname": 5, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NominalFeatureSummary": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 9}, "outrank.core_utils.NominalFeatureSummary.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"qualname": 3, "fullname": 6, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.BatchRankingSummary": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 9}, "outrank.core_utils.BatchRankingSummary.__init__": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 67, "bases": 0, "doc": 3}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"qualname": 3, "fullname": 6, "annotation": 4, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.core_utils.BatchRankingSummary.step_times": {"qualname": 3, "fullname": 6, "annotation": 4, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, 
"outrank.core_utils.display_random_tip": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 10, "bases": 0, "doc": 3}, "outrank.core_utils.get_dataset_info": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 16, "bases": 0, "doc": 3}, "outrank.core_utils.display_tool_name": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 10, "bases": 0, "doc": 3}, "outrank.core_utils.parse_ob_line": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 68, "bases": 0, "doc": 7}, "outrank.core_utils.parse_ob_line_vw": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 100, "bases": 0, "doc": 15}, "outrank.core_utils.parse_ob_csv_line": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 63, "bases": 0, "doc": 10}, "outrank.core_utils.generic_line_parser": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 100, "bases": 0, "doc": 13}, "outrank.core_utils.read_reference_json": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 27, "bases": 0, "doc": 9}, "outrank.core_utils.parse_namespace": {"qualname": 2, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 46, "bases": 0, "doc": 9}, "outrank.core_utils.read_column_names": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 6}, "outrank.core_utils.parse_ob_vw_feature_information": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 9}, "outrank.core_utils.parse_ob_raw_feature_information": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 9}, "outrank.core_utils.parse_ob_feature_information": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 9}, "outrank.core_utils.parse_csv_with_description_information": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.core_utils.parse_csv_raw": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 26, "bases": 0, "doc": 3}, "outrank.core_utils.extract_features_from_reference_JSON": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 43, "bases": 0, "doc": 10}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 100, "bases": 0, "doc": 10}, "outrank.core_utils.summarize_rare_counts": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 67, "bases": 0, "doc": 5}, "outrank.feature_transformations": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault": {"qualname": 0, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"qualname": 0, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 56, "signature": 0, "bases": 0, "doc": 3}, 
"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 173, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"qualname": 0, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 4589, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"qualname": 2, "fullname": 10, "annotation": 0, "default_value": 4, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"qualname": 3, "fullname": 11, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers": {"qualname": 0, "fullname": 5, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"qualname": 1, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 47, "bases": 0, "doc": 8}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"qualname": 1, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 43, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"qualname": 4, "fullname": 9, "annotation": 2, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"qualname": 3, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 51, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"qualname": 4, "fullname": 9, "annotation": 0, "default_value": 0, "signature": 39, "bases": 0, "doc": 3}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"qualname": 4, "fullname": 9, 
"annotation": 0, "default_value": 0, "signature": 39, "bases": 0, "doc": 3}, "outrank.task_generators": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_generators.logger": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_generators.outrank_task_generate_data_set": {"qualname": 5, "fullname": 8, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 8}, "outrank.task_ranking": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_ranking.outrank_task_conduct_ranking": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 19, "bases": 0, "doc": 3}, "outrank.task_selftest": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_selftest.logger": {"qualname": 1, "fullname": 4, "annotation": 0, "default_value": 8, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_selftest.conduct_self_test": {"qualname": 3, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 7, "bases": 0, "doc": 3}, "outrank.task_summary": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_summary.outrank_task_result_summary": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 3}, "outrank.task_visualization": {"qualname": 0, "fullname": 3, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.task_visualization.outrank_task_visualize_results": {"qualname": 4, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 11, "bases": 0, "doc": 3}, "outrank.visualizations": {"qualname": 0, "fullname": 2, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization": {"qualname": 0, "fullname": 4, "annotation": 0, "default_value": 0, "signature": 0, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"qualname": 3, "fullname": 7, "annotation": 0, "default_value": 0, "signature": 91, "bases": 0, "doc": 15}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 59, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 93, "bases": 0, "doc": 3}, "outrank.visualizations.ranking_visualization.visualize_all": {"qualname": 2, "fullname": 6, "annotation": 0, "default_value": 0, "signature": 135, "bases": 0, "doc": 13}}, "length": 164, "save": true}, "index": {"qualname": {"root": {"3": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}}, "df": 2}}}, "docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, 
"outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 8, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3, "b": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}}, "df": 3}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 1, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}}, "df": 7}}}}}}}}}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 3, "s": {"docs": 
{"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 4, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "w": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 2}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3}}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 11}}}}, "b": {"docs": {"outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 2}}}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 2}}}}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": 
{"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 3, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 1}}}}}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}}, "df": 10}}}}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}}, "df": 1}}, "s": {"docs": {"outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 4}}}}, "l": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_coverage": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": 
{"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 3}}, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}, "e": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}, "x": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}}, "df": 1}}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": 
{"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}}, "df": 3}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}}, "df": 1}}}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}, "y": {"docs": {"outrank.task_summary.outrank_task_result_summary": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 2}}}}}}, "k": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 3}}}}}}, "e": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}}, "df": 1}}}, "t": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}, "l": {"docs": {}, "df": 0, "f": {"docs": {"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "m": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}}, "df": 2, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}}, "i": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 3, "x": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "x": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 2}}}}, "x": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 3, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 1}, "j": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 2, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": 
{"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 4}}}}}}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 8}}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 4}}}}}}}}}}, "g": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 4}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}}}}, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "j": {"docs": {"outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, 
"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 5}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 8}, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}}}, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}}, "df": 4}}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}}}}}, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, 
"outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 9, "r": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}}, "df": 6}}}}}}}}}}}}}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 2}}}, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "p": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}}, "df": 2}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 2}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}, "w": {"docs": {"outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 2}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 2}}, "f": {"docs": {}, "df": 0, "e": 
{"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}}}}}, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_summary.outrank_task_result_summary": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 1}}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 8, "s": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 9}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 3}}}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 9}}}}}}}}}}}}}}}}}}}}}}}}, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1}}}, "w": {"docs": {"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}}, "df": 3}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 9}}}}}}}}}}}, "l": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, 
"df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {"outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}}, "df": 1}}}}}}}, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}}, "df": 1}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}}, "df": 2}}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 1}}}}}}}}}}, "f": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 8}}}}}}}}}}}}}}}}}}}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.display_random_tip": 
{"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 2}}}}}}}, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}}, "df": 2}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 2}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 2}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1}}}}}, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 3, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}}, "df": 1}}}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 2}}}}}}, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}}, "df": 1}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}}, "df": 1}, "a": {"docs": {}, "df": 0, "n": {"docs": 
{"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 4}}}}}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 1}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 4}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}, "s": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 1}}}, "w": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}}, "df": 2}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, 
"outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 5}}}}}}}}}, "o": {"docs": {}, "df": 0, "f": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}, "b": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 6}, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 4}}}}}}}, "j": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 3}}}}}}, "fullname": {"root": {"3": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}}, "df": 2}}}, "docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 8, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms": {"tf": 1}, "outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, 
"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.algorithms.sketches": {"tf": 1}, "outrank.algorithms.sketches.counting_cms": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}, "outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_ranking": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, 
"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_selftest": {"tf": 1}, "outrank.core_utils": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1}, "outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": 
{"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.feature_transformations": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.task_generators": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1.4142135623730951}, "outrank.task_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.4142135623730951}, "outrank.task_selftest": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}, "outrank.task_summary": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.4142135623730951}, "outrank.task_visualization": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1.4142135623730951}, "outrank.visualizations": {"tf": 1}, "outrank.visualizations.ranking_visualization": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 164}}}}}}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": 
{"outrank.algorithms.sketches.counting_counters_ordinary": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}}, "df": 7}}}}}}}, "f": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}, "b": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 6}}, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms": {"tf": 1}, "outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.algorithms.sketches": {"tf": 1}, "outrank.algorithms.sketches.counting_cms": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, 
"outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}, "outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 50}}}}}}}}, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}, "d": {"docs": {}, "df": 0, "j": {"docs": {"outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 5}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, 
"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 36, "s": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 9}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, 
"df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 3}}}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 9}}}}}}}}}}}}}}}}}}}}}}}}, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1}}}, "w": {"docs": {"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 5}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.feature_ranking": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1.4142135623730951}, 
"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1.4142135623730951}, "outrank.core_ranking": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.task_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization": {"tf": 1}, 
"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 54}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 2}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}, "w": {"docs": {"outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 2}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 2}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}}}}}, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_summary.outrank_task_result_summary": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}}, "df": 2, "i": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 9, "x": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 
1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}}, "df": 1}}}}}}, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "x": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 2}}}}, "x": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 3, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {"outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 1}, "j": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3, "b": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}}, "df": 7}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 1, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": 
{}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}}, "df": 7}}}}}}}}}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 2}}}, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 3, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 4, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "w": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 2}}}, "u": {"docs": 
{}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3}}}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 10}}}}}}}}}, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1}, "outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, 
"outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 45}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 11}}}}, "b": {"docs": {"outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 2}}}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 2}}}}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 3, "e": {"docs": {}, "df": 0, "d": {"docs": 
{"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 1}}}}}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_cms": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 29}}}, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, 
"outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}}, "df": 10}}}}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}}, "df": 7}}}, "s": {"docs": {"outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 4}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_selftest": {"tf": 1}, "outrank.core_utils": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1}, "outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 
1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 73}}, "l": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_coverage": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_cms": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1.4142135623730951}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, 
"outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}}, "df": 12}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 3}}, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 10}}, "e": {"docs": 
{"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}, "x": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}}, "df": 1}}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}}, "df": 3}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}}, "df": 1}}}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}, "y": {"docs": {"outrank.task_summary": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.4142135623730951}}, "df": 2}}}}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 2}}}}}}, "k": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": 
{"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 3}}}}}, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches": {"tf": 1}, "outrank.algorithms.sketches.counting_cms": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 30}}}}}}}, "e": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}}, "df": 1}}}, "t": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}, "l": {"docs": {}, "df": 0, "f": {"docs": {"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_selftest": {"tf": 1}, "outrank.task_selftest": {"tf": 1}, 
"outrank.task_selftest.logger": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 4}}}}}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 1}}}, "y": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 3}}}}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 2, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 4}}}}}}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 8}}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": 
{}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator": {"tf": 1}, "outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}}, "df": 9, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 4}}}}}}}}}}, "g": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 4}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 8}, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.task_generators": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 6}}}, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": 
{"tf": 1}}, "df": 2}}}, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}}, "df": 4}}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}}}}}, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 9, "r": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, 
"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}}, "df": 6}}}}}}}}}}}}}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 2}}}, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "p": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}}, "df": 1}}}}}}, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}}, "df": 3}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 9}}}}}}}}}}}, "l": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {"outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}}, "df": 1}}}}}}}, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1}}, "df": 1}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": 
{"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 4}}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 1}}}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.synthetic_data_generators": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 5, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.get_dataset_info": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 8}}}}}}}}}}}}}}}}}}}}}}}}, "f": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 2}}}}}}}, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1}}, "df": 2}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, 
"outrank.core_utils.parse_csv_with_description_information": {"tf": 1}}, "df": 2}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1}}, "df": 2}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1}}}}}, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 3, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 4}}}}}}}}}}}}}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}}, "df": 1}}}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 2}}}}}}, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}}, "df": 1}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1}}, "df": 1}, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 
1}, "outrank.feature_transformations": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 23}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 8, "s": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, 
"outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}, "outrank.feature_transformations.ranking_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 21}}}}}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.display_random_tip": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.display_tool_name": {"tf": 1}}, "df": 1}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.task_generators": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1.4142135623730951}, "outrank.task_ranking": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.4142135623730951}, "outrank.task_selftest": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}, "outrank.task_selftest.conduct_self_test": {"tf": 1}, "outrank.task_summary": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.4142135623730951}, "outrank.task_visualization": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1.4142135623730951}}, "df": 12}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": 
{"outrank.task_selftest.conduct_self_test": {"tf": 1}}, "df": 1}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1}}}, "s": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 1}}, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 8}}}}, "w": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}}, "df": 2}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.task_visualization": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}, "outrank.visualizations.ranking_visualization": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 7, "s": {"docs": {"outrank.visualizations": {"tf": 1}, "outrank.visualizations.ranking_visualization": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 6}}}}}}, "e": {"docs": {"outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 5}}}}}}}}}, "j": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 3}}}}}}, "annotation": {"root": {"docs": 
{"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1.4142135623730951}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 21, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}}, "df": 1}}}}}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1}}, "df": 1}}}}}}}, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}}, "df": 3}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 3}}}}}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1.4142135623730951}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1.4142135623730951}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 5}}}, "t": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": 
{"outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1}}, "df": 4}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1}}, "df": 3}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 7}}, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1}}, "df": 2}}}}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1}}, "df": 1}}}, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "[": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 1}}}}}}}}}}}}}}, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1}}, "df": 2}}}}, "f": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1}}, "df": 4}}}}}}}, "default_value": {"root": {"0": {"1": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 1}, "2": {"docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 2}, "4": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 1}, "8": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}}, "df": 1}, "docs": {"outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": 
{"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 22.715633383201094}}, "df": 3}, "1": {"0": {"0": {"0": {"0": {"docs": {"outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8.06225774829855}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 3}, "docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 2}, "6": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {"outrank.core_utils.pro_tips": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 2.8284271247461903}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 10.198039027185569}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 6}, "2": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.830951894845301}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 3}, "3": {"2": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "4": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "5": {"0": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "6": {"4": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "8": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 5.656854249492381}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "9": {"6": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 
8}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1.4142135623730951}, "outrank.core_ranking.logger": {"tf": 1.4142135623730951}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2.8284271247461903}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 4.795831523312719}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 12.288205727444508}, "outrank.task_generators.logger": {"tf": 1.4142135623730951}, "outrank.task_selftest.logger": {"tf": 1.4142135623730951}}, "df": 13, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.313708498984761}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 6}, "o": {"docs": {}, "df": 0, "g": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 3.1622776601683795}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.74734012447073}}, "df": 3, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1.4142135623730951}, "outrank.core_ranking.logger": {"tf": 1.4142135623730951}, "outrank.task_generators.logger": {"tf": 1.4142135623730951}, "outrank.task_selftest.logger": {"tf": 1.4142135623730951}}, "df": 4}}}, "*": {"1": {"0": {"0": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}}}}}}, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "s": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1}, 
"outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 4}}, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.IGNORED_VALUES": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "q": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, 
"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 2.6457513110645907}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.61895003862225}}, "df": 3}}}}, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 4}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "f": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "v": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 2, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 2}}}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "g": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.logger": {"tf": 1}, "outrank.core_ranking.logger": {"tf": 1}, "outrank.core_utils.pro_tips": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 16}, "outrank.task_generators.logger": {"tf": 1}, "outrank.task_selftest.logger": {"tf": 1}}, "df": 6}, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "c": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "o": {"docs": {}, "df": 0, "u": {"docs": {}, 
"df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1}}, "df": 2}}}}}, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "v": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "x": {"2": {"7": {"docs": {"outrank.core_utils.pro_tips": {"tf": 5.656854249492381}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 4}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 6.324555320336759}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 23.49468024894146}}, "df": 4}, "docs": {}, "df": 0}, "docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2.449489742783178}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 4.795831523312719}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 23.130067012440755}}, "df": 3, "+": {"1": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 3}, "docs": {}, "df": 0}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1}}}}}}, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "f": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 3.872983346207417}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "o": 
{"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "w": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.313708498984761}}, "df": 2}}, "b": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, "u": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "x": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1, "r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1}, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "s": 
{"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1}, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "b": {"docs": {}, "df": 0, "s": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 2.8284271247461903}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 2.8284271247461903}}, "df": 3}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "n": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}, "x": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.4142135623730951}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.4142135623730951}}, "df": 2}}, "i": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}}}}}, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": 
{}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}, "o": {"docs": {}, "df": 0, "w": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "b": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "n": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2}}}}}}, "p": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2.449489742783178}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 4.58257569495584}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 23.08679276123039}}, "df": 3}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "k": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2.23606797749979}}, "df": 1}}}}, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}}}, "s": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.313708498984761}}, "df": 1}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": 
{"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.445523142259598}}, "df": 2}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "w": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}}, "df": 1}}}}, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "o": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 2}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 3.1622776601683795}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 11.74734012447073}}, "df": 3, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}}}}}, "w": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": 
{"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 16.0312195418814}}, "df": 2}}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "w": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1}}, "df": 2, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {"outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 8}}, "df": 1}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, 
"df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1.4142135623730951}}, "df": 1}}}}, "k": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "p": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}, "y": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {"outrank.core_utils.pro_tips": {"tf": 2}}, "df": 1, "r": {"docs": {"outrank.core_utils.pro_tips": {"tf": 1}}, "df": 1}}}}}}, "signature": {"root": {"0": {"0": {"5": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 2}, "1": {"0": {"0": {"0": {"0": {"0": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 2}, "docs": {}, "df": 0}, "5": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}, "docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}}, "df": 4}, "2": {"0": {"0": {"0": {"0": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "3": {"0": {"0": {"0": {"0": {"docs": 
{"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "2": {"7": {"6": {"8": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "9": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2}, "outrank.core_utils.parse_ob_line": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 2.8284271247461903}}, "df": 7}, "docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}, "6": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}}, "df": 1}, "8": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}, "docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 3.1622776601683795}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 5.5677643628300215}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 5.477225575051661}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 4.795831523312719}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 5.477225575051661}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 4.898979485566356}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 5.656854249492381}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 4.69041575982343}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 3.7416573867739413}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 4.242640687119285}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 12.806248474865697}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 5.830951894845301}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 4.242640687119285}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 5.291502622129181}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 4.69041575982343}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 4.69041575982343}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 3.7416573867739413}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 3.1622776601683795}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 4.242640687119285}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 3.7416573867739413}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 3.7416573867739413}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 3.4641016151377544}, 
"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 3.7416573867739413}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 4.69041575982343}, "outrank.core_ranking.prior_combinations_sample": {"tf": 8}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 7.810249675906654}, "outrank.core_ranking.mixed_rank_graph": {"tf": 8}, "outrank.core_ranking.enrich_with_transformations": {"tf": 8.54400374531753}, "outrank.core_ranking.compute_combined_features": {"tf": 9.1104335791443}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 8.246211251235321}, "outrank.core_ranking.compute_subfeatures": {"tf": 8.246211251235321}, "outrank.core_ranking.include_noisy_features": {"tf": 7.681145747868608}, "outrank.core_ranking.compute_coverage": {"tf": 7.14142842854285}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 7.14142842854285}, "outrank.core_ranking.compute_value_counts": {"tf": 5.830951894845301}, "outrank.core_ranking.compute_cardinalities": {"tf": 6.855654600401044}, "outrank.core_ranking.compute_bounds_increment": {"tf": 7.54983443527075}, "outrank.core_ranking.compute_batch_ranking": {"tf": 12.449899597988733}, "outrank.core_ranking.get_num_of_instances": {"tf": 4}, "outrank.core_ranking.get_grouped_df": {"tf": 6.855654600401044}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 5.830951894845301}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 18.16590212458495}, "outrank.core_utils.write_json_dump_to_file": {"tf": 4.898979485566356}, "outrank.core_utils.internal_hash": {"tf": 4}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 9.38083151964686}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 7}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 4.47213595499958}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 7.3484692283495345}, "outrank.core_utils.display_random_tip": {"tf": 3}, "outrank.core_utils.get_dataset_info": {"tf": 3.7416573867739413}, "outrank.core_utils.display_tool_name": {"tf": 3}, "outrank.core_utils.parse_ob_line": {"tf": 7.416198487095663}, "outrank.core_utils.parse_ob_line_vw": {"tf": 8.831760866327848}, "outrank.core_utils.parse_ob_csv_line": {"tf": 7.14142842854285}, "outrank.core_utils.generic_line_parser": {"tf": 8.94427190999916}, "outrank.core_utils.read_reference_json": {"tf": 4.69041575982343}, "outrank.core_utils.parse_namespace": {"tf": 6.082762530298219}, "outrank.core_utils.read_column_names": {"tf": 4.58257569495584}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_ob_feature_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 4.47213595499958}, "outrank.core_utils.parse_csv_raw": {"tf": 4.47213595499958}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 5.744562646538029}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 8.774964387392123}, "outrank.core_utils.summarize_rare_counts": {"tf": 7.211102550927978}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 6.164414002968976}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 5.744562646538029}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": 
{"tf": 6.324555320336759}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 5.656854249492381}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 5.656854249492381}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 3.1622776601683795}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 4}, "outrank.task_selftest.conduct_self_test": {"tf": 2.6457513110645907}, "outrank.task_summary.outrank_task_result_summary": {"tf": 3.1622776601683795}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 3.1622776601683795}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 8.366600265340756}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 6.855654600401044}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 8.602325267042627}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 10.295630140987}}, "df": 80, "a": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}}, "df": 1, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 2}, "p": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}}}}}}}}}}}}, "n": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 2.449489742783178}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1.7320508075688772}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1.4142135623730951}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.7320508075688772}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 2.449489742783178}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 3.3166247903554}, 
"outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 2}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}}, "df": 33}}, "r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.get_dataset_info": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1}}, "df": 26}}}}, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 4}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1.4142135623730951}}, "df": 1, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": 
{"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 2}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}}}}}}}, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 30}}, "m": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": 
{"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 6, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 6, "s": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}}}, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}}}, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 3}}}}}}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}}, "df": 1, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}}, "df": 2, "s": {"docs": 
{"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}}, "df": 3}}}}, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1.4142135623730951}}, "df": 4}}}}}}, "s": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 4}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}}, "df": 1}}, "l": {"docs": {}, "df": 0, "f": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}}, "df": 11}}, "t": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 10}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": 
{"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_batch_ranking": {"tf": 2.6457513110645907}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 3.7416573867739413}, "outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 2.6457513110645907}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_line": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1.7320508075688772}, "outrank.core_utils.generic_line_parser": {"tf": 1.4142135623730951}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 2}, "outrank.core_utils.read_column_names": {"tf": 1.4142135623730951}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 2.23606797749979}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 2.23606797749979}}, "df": 32, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, 
"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 2}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 1}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}}, "t": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 8}, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "x": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}}, "df": 13}}}, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}}}}, "f": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 2}}}, "s": {"docs": 
{"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}}}}}}}}}, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}}, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "b": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 23}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, 
"outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 9}}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}}, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 6}}}, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 2}}}, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1.4142135623730951}, "outrank.core_utils.display_random_tip": {"tf": 1}, "outrank.core_utils.display_tool_name": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 2}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1.7320508075688772}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 19}}}, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 5, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 4}}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.write_json_dump_to_file": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 5, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 2, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 2}}}}}}}}}, "x": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1}}, "df": 6}, "e": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}}, "df": 2}}}}}}}}, "f": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}}, "df": 2, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}}}}, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 6}}}}, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 
1}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1}}, "df": 4}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 2}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1}, "outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.7320508075688772}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 7}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 22}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 2}}}}}}}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}, "w": 
{"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 4}, "o": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}}}, "m": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}}, "df": 1, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1, "l": {"docs": {"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1}}, "df": 1}}}}, "i": {"docs": {"outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 2, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}}, "df": 2}}}}}, "a": {"docs": {}, "df": 0, "x": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 2, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}}, "df": 1}}}}}, "p": {"docs": {"outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}}, "df": 1, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}}, "df": 4}}}}}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 3}}}}}}}, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, 
"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 2}}}}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {"outrank.algorithms.importance_estimator.numba_mi": {"tf": 1}}, "df": 1}}, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}}}}, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}}}}}}, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 2}}}}}}}}}, "t": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 2, "m": {"docs": {}, "df": 0, "p": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 4}}, "u": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.4142135623730951}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 9}}}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 2.449489742783178}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2.8284271247461903}, 
"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 9}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 6}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 3}}}}}}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 3}}}, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "f": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}}, "df": 4}, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 2.449489742783178}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2.8284271247461903}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 10}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 
1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}}, "df": 7, "f": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_combined_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}, "outrank.core_ranking.include_noisy_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_coverage": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_value_counts": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_cardinalities": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 22}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 6}}}}}}}}}}}}}}}}}}}}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}}, "df": 1}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}}, "df": 2}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, 
"t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 6}}}}}}}, "s": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}}}}}}}}, "f": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1}}, "df": 1, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}}}}}, "b": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 3}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 2}}}}}}}}}}}}}}}}}}}, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1}}, "df": 2}}}}}, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1}}, "df": 2}}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 
1.4142135623730951}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.get_grouped_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 2.23606797749979}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 14}}, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 5}}}, "o": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 7}}}}}, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 8}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 5}}}}}, "b": {"docs": {}, "df": 0, "j": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": 
{"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1.4142135623730951}}, "df": 2}}}}}, "n": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 2}}}}, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1}, "outrank.core_utils.parse_csv_raw": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 8}}}, "f": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1}}, "df": 3}}}}}}, "j": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4}}}}, "g": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}}}}}}}}, "bases": {"root": {"docs": {}, "df": 0}}, "doc": {"root": {"0": {"docs": {"outrank": {"tf": 4}}, "df": 1}, "1": {"2": {"5": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {"outrank": {"tf": 4.358898943540674}}, "df": 1}, "2": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "5": {"1": {"2": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}, "docs": {"outrank": {"tf": 24.20743687382041}, "outrank.algorithms": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking.ranking_mi_numba": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling": {"tf": 1.7320508075688772}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.logger": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.sklearn_MI": {"tf": 1.7320508075688772}, 
"outrank.algorithms.importance_estimator.sklearn_surrogate": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.numba_mi": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.sklearn_mi_adj": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.rank_features_3MR": {"tf": 1.7320508075688772}, "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.cms_hash": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.width": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.M": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.add": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.query": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_counters_ordinary": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1.4142135623730951}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add": {"tf": 1.7320508075688772}, 
"outrank.algorithms.synthetic_data_generators": {"tf": 1.7320508075688772}, "outrank.algorithms.synthetic_data_generators.generator_naive": {"tf": 1.7320508075688772}, "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix": {"tf": 1.7320508075688772}, "outrank.core_ranking": {"tf": 1.7320508075688772}, "outrank.core_ranking.logger": {"tf": 1.7320508075688772}, "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE": {"tf": 1.7320508075688772}, "outrank.core_ranking.GLOBAL_COUNTS_STORAGE": {"tf": 1.7320508075688772}, "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE": {"tf": 1.7320508075688772}, "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS": {"tf": 1.7320508075688772}, "outrank.core_ranking.IGNORED_VALUES": {"tf": 1.7320508075688772}, "outrank.core_ranking.HYPERLL_ERROR_BOUND": {"tf": 1.7320508075688772}, "outrank.core_ranking.MAX_FEATURES_3MR": {"tf": 1.7320508075688772}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1.4142135623730951}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_combined_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 2.23606797749979}, "outrank.core_ranking.include_noisy_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_coverage": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_value_counts": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_cardinalities": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_bounds_increment": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_num_of_instances": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_grouped_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_selftest": {"tf": 1.7320508075688772}, "outrank.core_utils": {"tf": 1.7320508075688772}, "outrank.core_utils.pro_tips": {"tf": 1.7320508075688772}, "outrank.core_utils.write_json_dump_to_file": {"tf": 1.7320508075688772}, "outrank.core_utils.internal_hash": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.data_path": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.column_names": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.column_types": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.col_delimiter": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.encoding": {"tf": 1.7320508075688772}, "outrank.core_utils.DatasetInformationStorage.fw_map": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1.4142135623730951}, "outrank.core_utils.NumericFeatureSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.feature_name": {"tf": 1.7320508075688772}, 
"outrank.core_utils.NumericFeatureSummary.minimum": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.maximum": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.median": {"tf": 1.7320508075688772}, "outrank.core_utils.NumericFeatureSummary.num_unique": {"tf": 1.7320508075688772}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1.4142135623730951}, "outrank.core_utils.NominalFeatureSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.NominalFeatureSummary.feature_name": {"tf": 1.7320508075688772}, "outrank.core_utils.NominalFeatureSummary.num_unique": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary": {"tf": 1.4142135623730951}, "outrank.core_utils.BatchRankingSummary.__init__": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary.triplet_scores": {"tf": 1.7320508075688772}, "outrank.core_utils.BatchRankingSummary.step_times": {"tf": 1.7320508075688772}, "outrank.core_utils.display_random_tip": {"tf": 1.7320508075688772}, "outrank.core_utils.get_dataset_info": {"tf": 1.7320508075688772}, "outrank.core_utils.display_tool_name": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_line": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1.4142135623730951}, "outrank.core_utils.generic_line_parser": {"tf": 1.7320508075688772}, "outrank.core_utils.read_reference_json": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_namespace": {"tf": 1.4142135623730951}, "outrank.core_utils.read_column_names": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_csv_with_description_information": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_csv_raw": {"tf": 1.7320508075688772}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1.4142135623730951}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1.4142135623730951}, "outrank.core_utils.summarize_rare_counts": {"tf": 1.4142135623730951}, "outrank.feature_transformations": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range": {"tf": 1.7320508075688772}, "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise": {"tf": 1.7320508075688772}, 
"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1.4142135623730951}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features": {"tf": 1.7320508075688772}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features": {"tf": 1.7320508075688772}, "outrank.task_generators": {"tf": 1.7320508075688772}, "outrank.task_generators.logger": {"tf": 1.7320508075688772}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1.4142135623730951}, "outrank.task_ranking": {"tf": 1.7320508075688772}, "outrank.task_ranking.outrank_task_conduct_ranking": {"tf": 1.7320508075688772}, "outrank.task_selftest": {"tf": 1.7320508075688772}, "outrank.task_selftest.logger": {"tf": 1.7320508075688772}, "outrank.task_selftest.conduct_self_test": {"tf": 1.7320508075688772}, "outrank.task_summary": {"tf": 1.7320508075688772}, "outrank.task_summary.outrank_task_result_summary": {"tf": 1.7320508075688772}, "outrank.task_visualization": {"tf": 1.7320508075688772}, "outrank.task_visualization.outrank_task_visualize_results": {"tf": 1.7320508075688772}, "outrank.visualizations": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1.4142135623730951}, "outrank.visualizations.ranking_visualization.visualize_heatmap": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization.visualize_barplots": {"tf": 1.7320508075688772}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1.7320508075688772}}, "df": 164, "w": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 4}}}}, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {"outrank": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, 
"outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 6, "i": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}}}, "t": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "o": {"docs": {"outrank": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 5, "o": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 2.449489742783178}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1.4142135623730951}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.read_column_names": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 14}, "i": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 4}}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}}, "df": 4}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": 
{"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "/": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}}}}}}}}}}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "k": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "k": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}, "w": {"docs": {}, "df": 0, "o": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}}, "df": 1}}, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 3, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank": {"tf": 3.1622776601683795}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}, "b": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {"outrank": {"tf": 1.4142135623730951}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 6, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 2, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}, "f": {"docs": {"outrank": {"tf": 1.4142135623730951}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_coverage": 
{"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 18}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2}}}}}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}, "b": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 4, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}}}}}}}, "s": {"docs": {"outrank": {"tf": 1.7320508075688772}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 4, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "p": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "s": {"docs": {"outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 1}}, "l": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2}}}}}}, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": 
{"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1, "d": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 2.8284271247461903}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 2}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}}}, "y": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank": {"tf": 1}}, "df": 1, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}}}}}}, "b": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}}, "df": 2}}, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}, "k": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 
0, "h": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 2}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}}, "df": 2}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}}, "df": 2}}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "d": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}}, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 1, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 2}}, "r": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {"outrank": {"tf": 1.7320508075688772}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, 
"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 9, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}}, "df": 1}}}}}}, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}}}}, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 2}}}}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "y": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 2}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "f": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 2}}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}}}}}}}}}}, "a": {"docs": {"outrank": {"tf": 3.1622776601683795}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, 
"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.7320508075688772}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1.4142135623730951}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1.4142135623730951}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1.4142135623730951}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 28, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2}, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}, "s": {"docs": {}, "df": 0, "o": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "g": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "m": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1}}}}}}}}}, "s": {"docs": {"outrank": {"tf": 2.6457513110645907}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 3, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 2, "d": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 3}, "y": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 3}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "y": {"docs": {"outrank": {"tf": 2.23606797749979}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}}, "df": 2}}}, "g": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}}, "df": 
1}}}}}, "o": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}, "c": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}, "u": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 1}}}}}}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}, "|": {"docs": {}, "df": 0, "*": {"docs": {}, "df": 0, "|": {"docs": {}, "df": 0, "b": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}, "d": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "g": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}}}}}}}, "f": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 1}}}}, "w": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 1}}}}}}}}}, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "/": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "o": 
{"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}}}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank": {"tf": 1.4142135623730951}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.7320508075688772}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 13, "m": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 3}}, "l": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1.7320508075688772}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 2.23606797749979}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1.7320508075688772}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 2}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 11, "a": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1}, "b": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "c": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "s": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.include_noisy_features": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 8}}}}}}, "w": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": 
{"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 2}, "outrank.core_utils.generic_line_parser": {"tf": 1}}, "df": 3}}}, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"3": {"2": {"docs": {"outrank": {"tf": 1.7320508075688772}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 1}}, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 2}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 3}, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}, "l": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 1}}}}}}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 4}, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}}}}}}}}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, 
"outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 8, "d": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 2}}}}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.core_ranking.compute_value_counts": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1.7320508075688772}}, "df": 1}}}}}}}}, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}}, "l": {"docs": {"outrank.core_utils.read_column_names": {"tf": 1}}, "df": 1, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}}, "df": 1}}}}}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_coverage": {"tf": 1}}, "df": 1}}}}}}}, "y": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 4}}}, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": 
{}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}, "s": {"docs": {}, "df": 0, "v": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1.4142135623730951}}, "df": 2}}, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}}, "b": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2, "e": {"docs": {"outrank": {"tf": 2}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 3, "g": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}, "outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_combined_features": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 10}}}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}, "t": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}, "o": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "k": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "w": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 2}}, "df": 1}}}}, "g": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}, "g": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}, "n": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 
0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1}}}}}, "t": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}, "k": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}, "k": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}, "b": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 2}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}}}}, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 1}}}}}, "x": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1.7320508075688772}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}}}, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank": {"tf": 2}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 3}}, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": 
{"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}}}}}}}, "f": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2}}}}}}}}, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 2}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1.4142135623730951}, "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 6, "t": {"3": {"2": {"docs": {"outrank": {"tf": 2}}, "df": 1}, "docs": {}, "df": 0}, "docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2}}}}}}}, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}, "o": {"docs": {"outrank": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 2}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}, "c": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, 
"a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.compute_coverage": {"tf": 1}, "outrank.core_ranking.compute_cardinalities": {"tf": 1}}, "df": 3}}}}}}}}}}}, "f": {"docs": {}, "df": 0, "o": {"docs": {"outrank": {"tf": 2}}, "df": 1}}, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1}}}, "v": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}}, "e": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}}, "df": 1}}}}}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "t": {"docs": {"outrank": {"tf": 1.7320508075688772}}, "df": 1, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "s": {"docs": {"outrank": {"tf": 2}, "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.prior_combinations_sample": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 6}, "m": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2}}}}}, "s": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}}}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {"outrank": {"tf": 1}}, "df": 1, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.compute_batch_ranking": {"tf": 1}}, "df": 2}}}}}}}}}}, "f": {"docs": {"outrank": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 2}, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}}, "df": 1}}}}}}}}, "p": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 
1}}}}}}, "o": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}, "i": {"docs": {}, "df": 0, "p": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1.4142135623730951}}, "df": 1}}}}}, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.parse_ob_line": {"tf": 1}}, "df": 1}}}, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 3, "r": {"docs": {"outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 3}}}}, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "w": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}, "outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 3, "t": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "y": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}}}}}}}, "g": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}}}, "c": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 2}}}}}, "d": {"docs": {}, "df": 0, 
"u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}}, "df": 1}}}}}}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "f": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {"outrank": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}, "m": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}, "y": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1}}}}}}, "g": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}}, "df": 1, "n": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 3}}}}, "o": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank": {"tf": 1.4142135623730951}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.DatasetInformationStorage": {"tf": 1}, "outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}, "outrank.core_utils.parse_ob_line": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.parse_ob_vw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_raw_feature_information": {"tf": 1}, "outrank.core_utils.parse_ob_feature_information": {"tf": 1}}, "df": 11}}, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}}, "df": 2}}}, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}}}}, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}}}}, "u": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1.4142135623730951}}, "df": 1, "r": {"docs": {"outrank": {"tf": 1}}, "df": 1}, "d": {"docs": {"outrank": {"tf": 1}, "outrank.core_utils.internal_hash": 
{"tf": 1}}, "df": 2}, "f": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 2}}}}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 3}}}}, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique": {"tf": 1}, "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 3}}}}}, "p": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_value_counts": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "/": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}}}}}}}}}}}}}}}, "f": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, "q": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba": {"tf": 1}}, "df": 1}}}}}}, "l": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}}, "df": 1}}}}, "p": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 1}}}}}}}}}}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 1}}}}}, "a": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_utils.read_column_names": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.read_reference_json": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "w": {"docs": 
{"outrank": {"tf": 1}}, "df": 1}, "n": {"docs": {}, "df": 0, "k": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank": {"tf": 2.23606797749979}, "outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_utils.internal_hash": {"tf": 1}, "outrank.core_utils.BatchRankingSummary": {"tf": 1}}, "df": 4, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}, "d": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "m": {"docs": {"outrank": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.include_noisy_features": {"tf": 1}}, "df": 1}}}}}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 1}}}, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.checkpoint_importances_df": {"tf": 1}}, "df": 1}}}}, "m": {"docs": {}, "df": 0, "i": {"docs": {"outrank": {"tf": 1}}, "df": 1, "n": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2, "i": {"docs": {"outrank": {"tf": 1}}, "df": 1, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 1.7320508075688772}}, "df": 1}}}, "b": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}}, "x": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "d": {"docs": {"outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}}, "d": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "e": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "l": {"docs": {"outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}}, "df": 1}}}}, "u": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank": {"tf": 2}}, "df": 1}}}, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}, "l": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "s": {"docs": {"outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 1}}}}, "v": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}}, "df": 1}}}}}}}}, "c": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}}}, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {"outrank": {"tf": 
2}}, "df": 1}}, "a": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}}, "df": 1}}}}, "t": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {"outrank.algorithms.importance_estimator.get_importances_estimate_pairwise": {"tf": 1}, "outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_utils.generic_line_parser": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}, "outrank.task_generators.outrank_task_generate_data_set": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 8}}}}, "m": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "y": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}, "outrank.algorithms.sketches.counting_ultiloglog": {"tf": 1}}, "df": 3}}}}, "a": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}, "a": {"docs": {}, "df": 0, "k": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.prior_combinations_sample": {"tf": 1}}, "df": 1}}, "p": {"docs": {}, "df": 0, "s": {"docs": {"outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 1}}}}, "y": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "u": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}, "h": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "p": {"docs": {"outrank": {"tf": 1}}, "df": 1, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.get_grouped_df": {"tf": 1}, "outrank.core_ranking.checkpoint_importances_df": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}}, "df": 3}}}}, "u": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_ranking.get_combinations_from_columns": {"tf": 1}, "outrank.core_ranking.mixed_rank_graph": {"tf": 1}}, "df": 2}}}}}}}, "r": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}, "a": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_utils.read_column_names": {"tf": 1}}, "df": 1}}}}}, "i": {"docs": {}, "df": 0, "g": {"docs": {}, "df": 0, "h": {"docs": {"outrank": {"tf": 2}}, "df": 1, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}}, "df": 1}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": 
{"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2}}}}}, "p": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "n": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}, "r": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.internal_hash": {"tf": 1}}, "df": 1}}}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}}, "df": 1}}}, "o": {"docs": {}, "df": 0, "t": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}, "w": {"docs": {"outrank.core_ranking.compute_feature_memory_consumption": {"tf": 1}}, "df": 1}, "l": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "n": {"docs": {}, "df": 0, "g": {"docs": {"outrank.core_utils.DatasetInformationStorage": {"tf": 1}}, "df": 1}}}}}}}, "n": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "b": {"docs": {}, "df": 0, "a": {"docs": {"outrank": {"tf": 2.23606797749979}, "outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 3}, "e": {"docs": {}, "df": 0, "r": {"docs": {"outrank.core_ranking.get_num_of_instances": {"tf": 1}}, "df": 1}}}, "p": {"docs": {}, "df": 0, "y": {"docs": {"outrank": {"tf": 1}}, "df": 1}}, "e": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "c": {"docs": {"outrank.core_utils.NumericFeatureSummary": {"tf": 1}, "outrank.core_utils.NominalFeatureSummary": {"tf": 1}}, "df": 2}}}}}}, "p": {"docs": {"outrank": {"tf": 3.872983346207417}}, "df": 1}, "e": {"docs": {}, "df": 0, "w": {"docs": {"outrank.core_ranking.enrich_with_transformations": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}}, "df": 2}, "e": {"docs": {}, "df": 0, "d": {"docs": {}, "df": 0, "s": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "y": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}}}}}}, "t": {"docs": {"outrank.core_ranking.estimate_importances_minibatches": {"tf": 1}}, "df": 1}, "i": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "e": {"docs": {"outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features": {"tf": 1}}, "df": 1}}}}, "a": {"docs": {}, "df": 0, "m": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "p": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}, "outrank.core_utils.parse_namespace": {"tf": 1}}, "df": 2}}}}}}}}}, "v": {"docs": {}, "df": 0, "e": {"docs": {}, "df": 0, "c": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "r": {"docs": {}, "df": 0, "s": {"docs": {"outrank": {"tf": 1}}, "df": 1}}}}}}, 
"i": {"docs": {}, "df": 0, "a": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}, "s": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "z": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "t": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters": {"tf": 1}, "outrank.visualizations.ranking_visualization.visualize_all": {"tf": 1}}, "df": 2}}}}}}}}}}}}, "a": {"docs": {}, "df": 0, "l": {"docs": {}, "df": 0, "u": {"docs": {}, "df": 0, "e": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1.4142135623730951}, "outrank.core_ranking.compute_subfeatures": {"tf": 1.4142135623730951}}, "df": 2, "s": {"docs": {"outrank.core_ranking.compute_expanded_multivalue_features": {"tf": 1}, "outrank.core_ranking.compute_subfeatures": {"tf": 1}, "outrank.core_utils.summarize_rare_counts": {"tf": 1}}, "df": 3}}}}}, "w": {"docs": {"outrank.core_utils.parse_ob_line_vw": {"tf": 1}}, "df": 1}}, "j": {"docs": {}, "df": 0, "i": {"docs": {}, "df": 0, "t": {"docs": {"outrank.algorithms.sketches.counting_cms.CountMinSketch": {"tf": 1}, "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter": {"tf": 1}}, "df": 2}}, "s": {"docs": {}, "df": 0, "o": {"docs": {}, "df": 0, "n": {"docs": {"outrank.core_utils.parse_ob_csv_line": {"tf": 1}, "outrank.core_utils.read_reference_json": {"tf": 1}, "outrank.core_utils.extract_features_from_reference_JSON": {"tf": 1}, "outrank.core_utils.summarize_feature_bounds_for_transformers": {"tf": 1}}, "df": 4}}}}, "x": {"docs": {}, "df": 0, "x": {"docs": {}, "df": 0, "h": {"docs": {}, "df": 0, "a": {"docs": {}, "df": 0, "s": {"docs": {}, "df": 0, "h": {"docs": {"outrank.core_ranking.compute_combined_features": {"tf": 1}}, "df": 1}}}}}}}}}, "pipeline": ["trimmer"], "_isPrebuiltIndex": true}; + /** pdoc search index */const docs = [{"fullname": "outrank", "modulename": "outrank", "kind": "module", "doc": "

    Welcome to OutRank's documentation!

    \n\n

    All functions/methods can be searched for via the search bar on the left.

    \n\n

    This tool enables fast screening of feature-feature interactions. Its purpose is to give the user quick insight into potential redundancies/anomalies in the data.\nIt is implemented to operate in _mini batches_: it traverses the raw data incrementally, refining the rankings as it goes along. The core operation, interaction ranking, outputs triplets that look as follows:

    \n\n
    featureA    featureB    0.512\nfeatureA    featureC    0.125\n
    \n\n
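    Conceptually, the mini-batch traversal can be sketched as follows. This is an illustrative sketch only — the file name, chunk size, and toy scorer are assumptions, not OutRank's actual implementation: stream the raw data in chunks and keep a running average of each pair's score.

    import itertools
    import pandas as pd

    running = {}  # (featureA, featureB) -> (score_sum, n_batches)

    def toy_score(x, y):
        # Stand-in for a real estimator such as mutual information
        return abs(pd.factorize(x)[0].mean() - pd.factorize(y)[0].mean())

    for batch in pd.read_csv('data.csv', chunksize=10_000):
        for a, b in itertools.combinations(batch.columns, 2):
            total, n = running.get((a, b), (0.0, 0))
            running[(a, b)] = (total + toy_score(batch[a], batch[b]), n + 1)

    # Emit (featureA, featureB, score) triplets, best first
    triplets = sorted(
        ((a, b, total / n) for (a, b), (total, n) in running.items()),
        key=lambda t: -t[2],
    )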

    Setup

    \n\n
    \n
    pip install outrank\n
    \n
    \n\n

    and test a minimal cycle with

    \n\n
    \n
    outrank --task selftest\n
    \n
    \n\n

    If this passes, you can be reasonably certain OutRank will perform as intended. OutRank's primary use case is as a CLI tool; begin exploring with

    \n\n
    \n
    outrank --help\n
    \n
    \n\n

    Example use cases

    \n\n
      \n
    • A minimal showcase of feature ranking on a generic CSV is demonstrated in this example.

    • \n
    • More examples demonstrating OutRank's capabilities are also available.

    • \n
    \n\n

    OutRank as a Python library

    \n\n

    Once installed, _OutRank_ can be used like any other Python library. For example, generic feature ranking algorithms can be accessed as

    \n\n
    \n
    import numpy as np\n\nfrom outrank.algorithms.feature_ranking.ranking_mi_numba import (\n    mutual_info_estimator_numba,\n)\n\n# Some synthetic minimal data (NumPy vectors)\na = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32)\n\nlowest = np.array(np.random.permutation(a), dtype=np.int32)\nmedium = np.array([1, 1, 0, 0, 1, 1, 1, 1], dtype=np.int32)\nhigh = np.array([1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32)\n\nlowest_score = mutual_info_estimator_numba(\n    a, lowest, np.float32(1.0), False,\n)\nmedium_score = mutual_info_estimator_numba(\n    a, medium, np.float32(1.0), False,\n)\nhigh_score = mutual_info_estimator_numba(\n    a, high, np.float32(1.0), False,\n)\n\nscores = [lowest_score, medium_score, high_score]\nsorted_score_indices = np.argsort(scores)\n\n# Scores should be ordered lowest < medium < high\nassert np.array_equal(sorted_score_indices, np.array([0, 1, 2]))\n
    \n
    \n\n
    \n\n

    Creating a simple dataset

    \n\n
    \n
    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification\n\ncc = CategoricalClassification()\n\n# Creates a simple dataset of 10 features, 10k samples, with feature cardinality of all features being 35\nX = cc.generate_data(10, \n                     10000, \n                     cardinality=35, \n                     ensure_rep=True, \n                     random_values=True, \n                     low=0, \n                     high=40)\n\n# Creates target labels via clustering\ny = cc.generate_labels(X, n=2, class_relation='cluster')\n
    \n
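    Continuing the example above (an illustrative sketch; the file and column names are assumptions), the generated matrix and labels can be written to a CSV that the CLI ranker can then consume:

    import pandas as pd

    df = pd.DataFrame(X, columns=[f'f{i}' for i in range(X.shape[1])])
    df['label'] = y
    df.to_csv('synthetic.csv', index=False)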
    \n"}, {"fullname": "outrank.algorithms", "modulename": "outrank.algorithms", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking", "modulename": "outrank.algorithms.feature_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "numba_unique", "kind": "function", "doc": "

    Identify unique elements in an array, fast

    \n", "signature": "(a):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_conditional_entropy", "kind": "function", "doc": "

    \n", "signature": "(\tY_classes,\tclass_values,\tclass_var_shape,\tinitial_prob,\tnonzero_counts):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_entropies", "kind": "function", "doc": "

    Core entropy computation function
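
    For orientation: these entropies combine into the mutual information reported downstream via the standard identity MI(X; Y) = H(X) + H(Y) - H(X, Y). This is general information theory, not a statement about this exact implementation; the cardinality_correction parameter applies an implementation-specific adjustment not captured by the identity.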

    \n", "signature": "(X, Y, all_events, f_values, f_value_counts, cardinality_correction):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "stratified_subsampling", "kind": "function", "doc": "

    \n", "signature": "(Y, X, approximation_factor, _f_values_X):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "mutual_info_estimator_numba", "kind": "function", "doc": "

    Core estimator logic: compute unique elements, then subsample if required

    \n", "signature": "(Y, X, approximation_factor=1.0, cardinality_correction=False):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator", "modulename": "outrank.algorithms.importance_estimator", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.importance_estimator.logger", "modulename": "outrank.algorithms.importance_estimator", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.algorithms.importance_estimator.num_folds", "modulename": "outrank.algorithms.importance_estimator", "qualname": "num_folds", "kind": "variable", "doc": "

    \n", "default_value": "4"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_MI", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_MI", "kind": "function", "doc": "

    \n", "signature": "(vector_first: Any, vector_second: Any) -> float:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_surrogate", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_surrogate", "kind": "function", "doc": "

    \n", "signature": "(\tvector_first: Any,\tvector_second: Any,\tX: Any,\tsurrogate_model: str) -> float:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.numba_mi", "modulename": "outrank.algorithms.importance_estimator", "qualname": "numba_mi", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_mi_adj", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_mi_adj", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_pairwise", "kind": "function", "doc": "

    A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel.

    \n", "signature": "(combination, reference_model_features, args, tmp_df):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.rank_features_3MR", "modulename": "outrank.algorithms.importance_estimator", "qualname": "rank_features_3MR", "kind": "function", "doc": "

    \n", "signature": "(\trelevance_dict: dict[str, float],\tredundancy_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\trelational_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\tstrategy: str = 'median',\talpha: float = 1,\tbeta: float = 1) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_nonmyopic", "kind": "function", "doc": "

    \n", "signature": "(args: Any, tmp_df: pandas.core.frame.DataFrame):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.initialize_classifier", "modulename": "outrank.algorithms.importance_estimator", "qualname": "initialize_classifier", "kind": "function", "doc": "

    \n", "signature": "(surrogate_model: str):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches", "modulename": "outrank.algorithms.sketches", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms", "modulename": "outrank.algorithms.sketches.counting_cms", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.cms_hash", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "cms_hash", "kind": "function", "doc": "

    \n", "signature": "(x, seed, width):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch", "kind": "class", "doc": "

    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.__init__", "kind": "function", "doc": "

    \n", "signature": "(depth=6, width=32768, M=None)"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.depth", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.width", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.width", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.hash_seeds", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.M", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.M", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.add", "kind": "function", "doc": "

    \n", "signature": "(self, x, delta=1):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst, delta=1):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.query", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.query", "kind": "function", "doc": "

    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.get_matrix", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter", "kind": "class", "doc": "

A primitive bounded counter built around a collections.Counter, capped at a configurable number of tracked items.

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.__init__", "kind": "function", "doc": "

    \n", "signature": "(bound: int = 30000)"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.max_bound_thr", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.default_counter", "kind": "variable", "doc": "

    \n", "annotation": ": collections.Counter"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.add", "kind": "function", "doc": "

    \n", "signature": "(self, val):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "kind": "module", "doc": "

This module implements a probabilistic data structure able to estimate the cardinality of large multisets in a single pass, using little auxiliary memory

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.__init__", "kind": "function", "doc": "

    \n", "signature": "(error_rate=0.005)"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.p", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.m", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_set", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_size", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.width", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.hll_flag", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.add", "kind": "function", "doc": "

    \n", "signature": "(self, value):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators", "modulename": "outrank.algorithms.synthetic_data_generators", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.dataset_info", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.dataset_info", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_data", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_data", "kind": "function", "doc": "

Generates a dataset based on the given parameters

    \n\n
    Parameters
    \n\n
      \n
    • n_features: number of generated features
    • \n
    • n_samples: number of generated samples
    • \n
    • cardinality: default cardinality of the dataset
    • \n
    • structure: structure of the dataset
    • \n
• ensure_rep: flag; ensures all given values are represented
    • \n
• random_values: flag; enables random (integer) feature values from the interval [low, high]
    • \n
    • low: sets lower bound of random feature values
    • \n
• high: sets upper bound of random feature values
    • \n
    • seed: sets seed of numpy random
    • \n
    \n\n
    Returns
    \n\n
    \n

    X, 2D dataset

    \n
    \n", "signature": "(\tself,\tn_features: int,\tn_samples: int,\tcardinality: int = 5,\tstructure: Union[list, numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]], NoneType] = None,\tensure_rep: bool = False,\trandom_values: bool | None = False,\tlow: int | None = 0,\thigh: int | None = 1000,\tseed: int = 42) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_combinations", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_combinations", "kind": "function", "doc": "

Generates linear, nonlinear, or custom combinations within feature vectors in the given dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indexes of features to be in combination
    • \n
    • combination_function: optional custom function for combining feature vectors
    • \n
• combination_type: string flag, either 'linear' or 'nonlinear', defining the combination type
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with added resultant feature

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tcombination_function: Optional = None,\tcombination_type: Literal = 'linear') -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_correlated", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_correlated", "kind": "function", "doc": "

Generates correlated features using the given feature indices. Correlation is based on the cosine of the angle between mean-centred vectors.

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
• feature_indices: indices of features to generate correlated features for
    • \n
    • r: (Pearson) correlation factor
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with generated correlated features

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tr: float = 0.8) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_duplicates", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_duplicates", "kind": "function", "doc": "

    Generates duplicate features

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indices of features to duplicate
    • \n
    \n\n
    Returns
    \n\n
    \n

    dataset with duplicated features

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_labels", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_labels", "kind": "function", "doc": "

    Generates labels for dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • n: number of class labels
    • \n
    • p: class distribution
    • \n
    • k: constant
    • \n
    • decision_function: optional user-defined decision function
    • \n
    • class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    • \n
    • balance: boolean, whether to balance clustering class labels
    • \n
    \n\n
    Returns
    \n\n
    \n

    array of labels, corresponding to dataset X

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tn: int = 2,\tp: Union[float, list[float], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]] = 0.5,\tk: int | float = 2,\tdecision_function: Optional = None,\tclass_relation: str = 'linear',\tbalance: bool = False):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_noise", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_noise", "kind": "function", "doc": "

Simulates noise on the given dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset to apply noise to
    • \n
    • y: required target labels for categorical noise generation
    • \n
    • p: amount of noise to apply. Defaults to 0.2
    • \n
    • type: type of noise to apply, either categorical or missing
    • \n
    • missing_val: value to simulate missing values. Defaults to float('-inf')
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with noise applied

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tp: float = 0.2,\ttype: Literal = 'categorical',\tmissing_val: str | int | float = -inf) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.downsample_dataset", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.downsample_dataset", "kind": "function", "doc": "

Downsamples dataset X according to N or to the number of samples in the minority class, resulting in a balanced dataset.

    \n\n
    Parameters
    \n\n
      \n
    • X: Dataset to downsample
    • \n
    • y: Labels corresponding to X
    • \n
    • N: Optional number of samples per class to downsample to
    • \n
    • seed: Seed for random state of resample function
    • \n
    • reshuffle: Reshuffle the dataset after downsampling
    • \n
    \n\n
    Returns
    \n\n
    \n

    Balanced X and y after downsampling

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tN: int | None = None,\tseed: int = 42,\treshuffle: bool = False) -> tuple[numpy.ndarray, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.print_dataset", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.print_dataset", "kind": "function", "doc": "

Prints the given dataset

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • y: labels
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]]):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.summarize", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.summarize", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "qualname": "generate_random_matrix", "kind": "function", "doc": "

    \n", "signature": "(num_features=100, size=20000):", "funcdef": "def"}, {"fullname": "outrank.core_ranking", "modulename": "outrank.core_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_ranking.logger", "modulename": "outrank.core_ranking", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_CARDINALITY_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, {"fullname": "outrank.core_ranking.GLOBAL_COUNTS_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_COUNTS_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, {"fullname": "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_RARE_VALUE_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]", "default_value": "Counter()"}, {"fullname": "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_PRIOR_COMB_COUNTS", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, int]", "default_value": "Counter()"}, {"fullname": "outrank.core_ranking.IGNORED_VALUES", "modulename": "outrank.core_ranking", "qualname": "IGNORED_VALUES", "kind": "variable", "doc": "

    \n", "default_value": "set()"}, {"fullname": "outrank.core_ranking.HYPERLL_ERROR_BOUND", "modulename": "outrank.core_ranking", "qualname": "HYPERLL_ERROR_BOUND", "kind": "variable", "doc": "

    \n", "default_value": "0.02"}, {"fullname": "outrank.core_ranking.MAX_FEATURES_3MR", "modulename": "outrank.core_ranking", "qualname": "MAX_FEATURES_3MR", "kind": "variable", "doc": "

    \n", "default_value": "10000"}, {"fullname": "outrank.core_ranking.prior_combinations_sample", "modulename": "outrank.core_ranking", "qualname": "prior_combinations_sample", "kind": "function", "doc": "

Ensure that only the relevant subspace of combinations is selected, based on prior counts

    \n", "signature": "(\tcombinations: list[tuple[typing.Any, ...]],\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.get_combinations_from_columns", "modulename": "outrank.core_ranking", "qualname": "get_combinations_from_columns", "kind": "function", "doc": "

    Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope

    \n", "signature": "(\tall_columns: pandas.core.indexes.base.Index,\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.mixed_rank_graph", "modulename": "outrank.core_ranking", "qualname": "mixed_rank_graph", "kind": "function", "doc": "

    Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tcpu_pool: Any,\tpbar: Any) -> outrank.core_utils.BatchRankingSummary:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.enrich_with_transformations", "modulename": "outrank.core_ranking", "qualname": "enrich_with_transformations", "kind": "function", "doc": "

    Construct a collection of new features based on pre-defined transformations/rules

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnum_col_types: set[str],\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_combined_features", "modulename": "outrank.core_ranking", "qualname": "compute_combined_features", "kind": "function", "doc": "

Compute higher-order features via an xxhash-based trick.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tpbar: Any,\tis_3mr: bool = False) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_expanded_multivalue_features", "modulename": "outrank.core_ranking", "qualname": "compute_expanded_multivalue_features", "kind": "function", "doc": "

Compute a one-hot encoded feature space based on each designated multivalue feature. E.g., a feature with value \"a,b,c\" becomes three features, whose values indicate the presence of each listed value in the chosen multivalue feature.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_subfeatures", "modulename": "outrank.core_ranking", "qualname": "compute_subfeatures", "kind": "function", "doc": "

Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.\n->: One-sided construction - every value from the left-side feature is kept as-is, while separate values from the right-side feature are considered.\n<->: Two-sided construction - values from both sides are present. Each value from A is combined with each value from B, forming |A|*|B| new features (one-hot encoded)

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.include_noisy_features", "modulename": "outrank.core_ranking", "qualname": "include_noisy_features", "kind": "function", "doc": "

    Add randomized features that serve as a sanity check

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_coverage", "modulename": "outrank.core_ranking", "qualname": "compute_coverage", "kind": "function", "doc": "

    Compute coverage of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_feature_memory_consumption", "modulename": "outrank.core_ranking", "qualname": "compute_feature_memory_consumption", "kind": "function", "doc": "

An approximation of how much memory features take up

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_value_counts", "modulename": "outrank.core_ranking", "qualname": "compute_value_counts", "kind": "function", "doc": "

    Update the count structure

    \n", "signature": "(input_dataframe: pandas.core.frame.DataFrame, args: Any):", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_cardinalities", "modulename": "outrank.core_ranking", "qualname": "compute_cardinalities", "kind": "function", "doc": "

    Compute cardinalities of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tpbar: Any,\tmax_unique_hist_constraint: int) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_bounds_increment", "modulename": "outrank.core_ranking", "qualname": "compute_bounds_increment", "kind": "function", "doc": "

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnumeric_column_types: set[str]) -> dict[str, typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_batch_ranking", "modulename": "outrank.core_ranking", "qualname": "compute_batch_ranking", "kind": "function", "doc": "

    Enrich the feature space and compute the batch importances

    \n", "signature": "(\tline_tmp_storage: list[list[typing.Any]],\tnumeric_column_types: set[str],\targs: Any,\tcpu_pool: Any,\tcolumn_descriptions: list[str],\tlogger: Any,\tpbar: Any) -> tuple[outrank.core_utils.BatchRankingSummary, dict[str, typing.Any], dict[str, set[str]], dict[str, set[str]]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.get_grouped_df", "modulename": "outrank.core_ranking", "qualname": "get_grouped_df", "kind": "function", "doc": "

    A helper method that enables median-based aggregation after processing

    \n", "signature": "(\timportances_df_list: list[tuple[str, str, float]]) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.checkpoint_importances_df", "modulename": "outrank.core_ranking", "qualname": "checkpoint_importances_df", "kind": "function", "doc": "

    A helper which stores intermediary state - useful for longer runs

    \n", "signature": "(importances_batch: list[tuple[str, str, float]]) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.estimate_importances_minibatches", "modulename": "outrank.core_ranking", "qualname": "estimate_importances_minibatches", "kind": "function", "doc": "

Interaction score estimator - suitable, for example, for CSV-like input data types.\nThis type of data is normally a single large CSV, meaning that minibatch processing needs to\nhappen during incremental handling of the file (that's not the case for pre-separated ob data)

    \n", "signature": "(\tinput_file: str,\tcolumn_descriptions: list,\tfw_col_mapping: dict[str, str],\tnumeric_column_types: set,\tbatch_size: int = 100000,\targs: Any = None,\tdata_encoding: str = 'utf-8',\tcpu_pool: Any = None,\tdelimiter: str = '\\t',\tfeature_construction_mode: bool = False,\tlogger: Any = None) -> tuple[list[dict[str, typing.Any]], typing.Any, dict[typing.Any, typing.Any], list[dict[str, typing.Any]], list[dict[str, set[str]]], collections.defaultdict[str, list[set[str]]], dict[str, typing.Any], dict[str, typing.Any], dict[str, typing.Any]]:", "funcdef": "def"}, {"fullname": "outrank.core_selftest", "modulename": "outrank.core_selftest", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_utils", "modulename": "outrank.core_utils", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_utils.pro_tips", "modulename": "outrank.core_utils", "qualname": "pro_tips", "kind": "variable", "doc": "

    \n", "default_value": "['OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"', 'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).', 'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!', 'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).', 'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)', 'Visualization part also includes clustering - this might be very insightful!', 'By default OutRank includes feature cardinality and coverage in feature names (card; cov)', 'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.', 'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).', 'Give it as many threads as physically possible (--num_threads).', 'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.', 'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).', 'Your target can be any feature! (explaining one feature with others)', 'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).', 'Each feature is named as featureName(cardinality, coverage in percents) in the final files.', 'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.']"}, {"fullname": "outrank.core_utils.write_json_dump_to_file", "modulename": "outrank.core_utils", "qualname": "write_json_dump_to_file", "kind": "function", "doc": "

    \n", "signature": "(args: Any, config_name: str) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.internal_hash", "modulename": "outrank.core_utils", "qualname": "internal_hash", "kind": "function", "doc": "

A generic internal hash used throughout the ranking procedure - the seed is deliberately hardcoded here

    \n", "signature": "(input_obj: str) -> str:", "funcdef": "def"}, {"fullname": "outrank.core_utils.DatasetInformationStorage", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage", "kind": "class", "doc": "

    A generic class for holding properties of a given type of dataset

    \n"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.__init__", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdata_path: str,\tcolumn_names: list[str],\tcolumn_types: set[str],\tcol_delimiter: str | None,\tencoding: str,\tfw_map: dict[str, str] | None)"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.data_path", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.data_path", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.column_names", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_names", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.column_types", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_types", "kind": "variable", "doc": "

    \n", "annotation": ": set[str]"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.col_delimiter", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.col_delimiter", "kind": "variable", "doc": "

    \n", "annotation": ": str | None"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.encoding", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.encoding", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.fw_map", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.fw_map", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, str] | None"}, {"fullname": "outrank.core_utils.NumericFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary", "kind": "class", "doc": "

    A generic class storing numeric feature statistics

    \n"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tfeature_name: str,\tminimum: float,\tmaximum: float,\tmedian: float,\tnum_unique: int)"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.minimum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.minimum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.maximum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.maximum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.median", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.median", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, {"fullname": "outrank.core_utils.NominalFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary", "kind": "class", "doc": "

A generic class storing nominal feature statistics

    \n"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(feature_name: str, num_unique: int)"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, {"fullname": "outrank.core_utils.BatchRankingSummary", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary", "kind": "class", "doc": "

    A generic class representing batched ranking results

    \n"}, {"fullname": "outrank.core_utils.BatchRankingSummary.__init__", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttriplet_scores: list[tuple[str, str, float]],\tstep_times: dict[str, typing.Any])"}, {"fullname": "outrank.core_utils.BatchRankingSummary.triplet_scores", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.triplet_scores", "kind": "variable", "doc": "

    \n", "annotation": ": list[tuple[str, str, float]]"}, {"fullname": "outrank.core_utils.BatchRankingSummary.step_times", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.step_times", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]"}, {"fullname": "outrank.core_utils.display_random_tip", "modulename": "outrank.core_utils", "qualname": "display_random_tip", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.get_dataset_info", "modulename": "outrank.core_utils", "qualname": "get_dataset_info", "kind": "function", "doc": "

    \n", "signature": "(args: Any):", "funcdef": "def"}, {"fullname": "outrank.core_utils.display_tool_name", "modulename": "outrank.core_utils", "qualname": "display_tool_name", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_line", "kind": "function", "doc": "

    Outbrain line parsing - generic TSVs

    \n", "signature": "(line_string: str, delimiter: str = '\\t', args: Any = None) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_line_vw", "modulename": "outrank.core_utils", "qualname": "parse_ob_line_vw", "kind": "function", "doc": "

    Parse a sparse vw line into a pandas df with pre-defined namespace

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping=None,\ttable_header=None,\tinclude_namespace_info=False) -> list[str | None]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_csv_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_csv_line", "kind": "function", "doc": "

    Data can have commas within JSON field dumps

    \n", "signature": "(line_string: str, delimiter: str = ',', args: Any = None) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.generic_line_parser", "modulename": "outrank.core_utils", "qualname": "generic_line_parser", "kind": "function", "doc": "

A generic method aimed at parsing data from different sources.

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping: Any = None,\ttable_header: Any = None) -> list[typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.read_reference_json", "modulename": "outrank.core_utils", "qualname": "read_reference_json", "kind": "function", "doc": "

    A helper method for reading a JSON

    \n", "signature": "(json_path) -> dict[str, dict]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_namespace", "modulename": "outrank.core_utils", "qualname": "parse_namespace", "kind": "function", "doc": "

    Parse the feature namespace for type awareness

    \n", "signature": "(namespace_path: str) -> tuple[set[str], dict[str, str]]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.read_column_names", "modulename": "outrank.core_utils", "qualname": "read_column_names", "kind": "function", "doc": "

Read the column header

    \n", "signature": "(mapping_file: str) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_vw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_vw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_raw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_raw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_csv_with_description_information", "modulename": "outrank.core_utils", "qualname": "parse_csv_with_description_information", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_csv_raw", "modulename": "outrank.core_utils", "qualname": "parse_csv_raw", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.extract_features_from_reference_JSON", "modulename": "outrank.core_utils", "qualname": "extract_features_from_reference_JSON", "kind": "function", "doc": "

    Given a model's JSON, extract unique features

    \n", "signature": "(\tjson_path: str,\tcombined_features_only=False,\tall_features=False) -> set[typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.summarize_feature_bounds_for_transformers", "modulename": "outrank.core_utils", "qualname": "summarize_feature_bounds_for_transformers", "kind": "function", "doc": "

Auxiliary summarization method for generating JSON-based specs

    \n", "signature": "(\tbounds_object_storage: Any,\tfeature_types: list[str],\ttask_name: str,\tlabel_name: str,\tgranularity: int = 15,\toutput_summary_table_only: bool = False):", "funcdef": "def"}, {"fullname": "outrank.core_utils.summarize_rare_counts", "modulename": "outrank.core_utils", "qualname": "summarize_rare_counts", "kind": "function", "doc": "

    Write rare values

    \n", "signature": "(\tterm_counter: Any,\targs: Any,\tcardinality_object: Any,\tobject_info: outrank.core_utils.DatasetInformationStorage) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.is_prior_heuristic", "modulename": "outrank.core_utils", "qualname": "is_prior_heuristic", "kind": "function", "doc": "

    \n", "signature": "(args: Any) -> bool:", "funcdef": "def"}, {"fullname": "outrank.core_utils.get_num_of_instances", "modulename": "outrank.core_utils", "qualname": "get_num_of_instances", "kind": "function", "doc": "

    Count the number of lines in a file, fast - useful for progress logging

    \n", "signature": "(fname: str) -> int:", "funcdef": "def"}, {"fullname": "outrank.feature_transformations", "modulename": "outrank.feature_transformations", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault", "modulename": "outrank.feature_transformations.feature_transformer_vault", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "MINIMAL_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "DEFAULT_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "FW_TRANSFORMERS", "kind": "variable", "doc": "

[... docs/search.js, continued: auto-generated pdoc search index. The remainder of the
fw_transformers default_value enumerates string-encoded numpy expressions named
'_tr_fw_{sqrt,log}_res_{R}_gt_{G}' (plus '_tr_fw_prob_{sqrt,log}_res_{R}_gt_{G}' variants
for probability-scaled features), expanded over resolution_range = [1, 10, 50, 100] and
greater_than_range = [1, 2, 4, 8, 16, 32, 64, 96] (thresholds 0.01, 0.02, 0.04, 0.08,
0.16, 0.32, 0.64, 0.96 for the prob variants), e.g. '_tr_fw_sqrt_res_10_gt_4':
'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*10,0), 0))'. These are followed
by index entries (names, signatures, docstrings) for
outrank.feature_transformations.ranking_transformers (FeatureTransformerNoise,
FeatureTransformerGeneric), outrank.task_generators, outrank.task_instance_ranking,
outrank.task_ranking, outrank.task_selftest, outrank.task_summary,
outrank.task_visualization and outrank.visualizations.ranking_visualization ...]
}]; // mirrored in build-search-index.js (part 1)
// Also split on html tags. this is a cheap heuristic, but good enough.
@@ -43,4 +43,4 @@ window.pdocSearch = (function(){
     },
     expand: true
   });
-})();
+})();
\ No newline at end of file
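The search.js hunk above indexes the fw_transformers vault together with two module
variables, resolution_range and greater_than_range, which suggests the expression family
is generated programmatically. The sketch below is a minimal, non-authoritative
reconstruction under that assumption: the two ranges and the expression shapes are taken
from the index, while the loop construction and the eval-based application (binding a
feature column to X) are inferences, and generated whitespace is normalized.

import numpy as np

resolution_range = [1, 10, 50, 100]                 # from the index above
greater_than_range = [1, 2, 4, 8, 16, 32, 64, 96]   # from the index above

# Rebuild the '_tr_fw_{sqrt,log}_res_{R}_gt_{G}' family: values below the threshold gt
# pass through unchanged; values above it are shifted, sqrt/log-compressed, scaled by
# the resolution res, and rounded to integers.
fw_transformers = {}
for res in resolution_range:
    for gt in greater_than_range:
        fw_transformers[f'_tr_fw_sqrt_res_{res}_gt_{gt}'] = (
            f'np.where(X < {gt}, X, np.where(X > {gt}, np.round(np.sqrt(X - {gt}) * {res}, 0), 0))'
        )
        fw_transformers[f'_tr_fw_log_res_{res}_gt_{gt}'] = (
            f'np.where(X < {gt}, X, np.where(X > {gt}, np.round(np.log(X - {gt}) * {res}, 0), 0))'
        )

# Hypothetical application: bind a feature column to X, then evaluate the expression.
X = np.array([0.5, 3.0, 8.0, 20.0])
print(eval(fw_transformers['_tr_fw_sqrt_res_10_gt_4']))  # -> [ 0.5  3. 20. 40.]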
From 18c6cb8ed38ccca31021b1b430bb5b075951a4c5 Mon Sep 17 00:00:00 2001
From: 98MM <47939788+98MM@users.noreply.github.com>
Date: Mon, 15 Jul 2024 11:55:42 +0200
Subject: [PATCH 8/9] updated documentation with changes after merging PR

---
 docs/outrank.html     |    5 +-
 .../cc_generator.html | 4238 ++++++++---------
 docs/search.js        |    2 +-
 3 files changed, 2079 insertions(+), 2166 deletions(-)

diff --git a/docs/outrank.html b/docs/outrank.html
index 6af2d42..b920250 100644
--- a/docs/outrank.html
+++ b/docs/outrank.html
@@ -164,9 +164,8 @@

    Creating a simple dataset

    1"""
     2.. include:: ../docs/DOCSMAIN.md
    -3.. include:: ../docs/generator_docs.md
    -4"""
    -5from __future__ import annotations
    +3"""
    +4from __future__ import annotations
     
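Before the large cc_generator.html diff below, here is what the "Creating a simple
dataset" section referenced in the hunk above amounts to in code. This is a minimal,
non-authoritative sketch: the import path, constructor, and method signatures come from
the cc_generator source rendered in the next diff, while the concrete argument values
(the structure, class split p, and noise level) are illustrative only.

import numpy as np
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification(seed=42)

# 100 samples x 10 features, default cardinality 5; feature 3 gets cardinality 8,
# features 5 and 7 draw from the value domain [0, 1, 2] with frequencies [0.7, 0.2, 0.1].
X = cc.generate_data(
    n_features=10,
    n_samples=100,
    cardinality=5,
    structure=[(3, 8), ([5, 7], [[0, 1, 2], [0.7, 0.2, 0.1]])],
    ensure_rep=True,
)

# Binary labels via the default linear decision function, then 20% categorical noise.
y = cc.generate_labels(X, n=2, p=0.5, class_relation='linear')
X_noisy = cc.generate_noise(X, y, p=0.2, type='categorical')
print(X_noisy.shape, np.unique(y))  # (100, 10) [0 1]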
diff --git a/docs/outrank/algorithms/synthetic_data_generators/cc_generator.html b/docs/outrank/algorithms/synthetic_data_generators/cc_generator.html
index 7d97838..53423c6 100644
--- a/docs/outrank/algorithms/synthetic_data_generators/cc_generator.html
+++ b/docs/outrank/algorithms/synthetic_data_generators/cc_generator.html
@@ -33,6 +33,9 @@
[... sidebar "API Documentation" navigation entries for CategoricalClassification elided ...]
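The hunk that follows re-renders the full cc_generator source, where the docstrings for
generate_combinations, generate_correlated and generate_duplicates are easy to lose in
the diff noise. As a quick orientation, here is a sketch of how those methods compose;
the signatures are taken from the hunk below, while the feature indices and the r value
are illustrative assumptions.

import numpy as np
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification(seed=42)
X = cc.generate_data(n_features=5, n_samples=50, cardinality=4)

X = cc.generate_combinations(X, [0, 1], combination_type='nonlinear')  # adds column 5: sin(sum of features 0, 1)
X = cc.generate_correlated(X, [2], r=0.8)                              # adds column 6: r ~ 0.8 with feature 2
X = cc.generate_duplicates(X, [3])                                     # adds column 7: copy of feature 3

print(X.shape)          # (50, 8)
print(cc.dataset_info)  # bookkeeping of combinations, correlations and duplicates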
@@ -105,833 +105,804 @@

    16 17class CategoricalClassification: 18 - 19 def __init__(self): - 20 self.dataset_info = { - 21 'general': {}, - 22 'combinations': [], - 23 'correlations': [], - 24 'duplicates': [], - 25 'labels': [], - 26 'noise': [], - 27 } - 28 - 29 def __repr__(self): - 30 return f"CategoricalClassification(dataset_info={self.dataset_info})" - 31 - 32 def generate_data( - 33 self, - 34 n_features: int, - 35 n_samples: int, - 36 cardinality: int = 5, - 37 structure: list | ArrayLike | None = None, - 38 ensure_rep: bool = False, - 39 random_values: bool | None = False, - 40 low: int | None = 0, - 41 high: int | None = 1000, - 42 seed: int = 42, - 43 ) -> np.ndarray: - 44 - 45 """ - 46 Generates dataset based on given parameters - 47 :param n_features: number of generated features - 48 :param n_samples: number of generated samples - 49 :param cardinality: default cardinality of the dataset - 50 :param structure: structure of the dataset - 51 :param ensure_rep: flag, ensures all given values represented - 52 :param random_values: flag, enables random (integer) feature values from set [low, high] - 53 :param low: sets lower bound of random feature values - 54 :param high: sets high bound of random feature values - 55 :param seed: sets seed of numpy random - 56 :return: X, 2D dataset - 57 """ - 58 - 59 self.dataset_info.update({ - 60 'general': { - 61 'n_features': n_features, - 62 'n_samples': n_samples, - 63 'cardinality': cardinality, - 64 'structure': structure, - 65 'ensure_rep': ensure_rep, - 66 'seed': seed, - 67 }, - 68 }) - 69 - 70 np.random.seed(seed) - 71 X = np.empty([n_features, n_samples]) - 72 - 73 if structure is None: - 74 # No specific structure parameter passed - 75 for i in range(n_features): - 76 x = self._generate_feature( - 77 n_samples, - 78 cardinality=cardinality, - 79 ensure_rep=ensure_rep, - 80 random_values=random_values, - 81 low=low, - 82 high=high, - 83 ) - 84 X[i] = x - 85 else: - 86 # Structure parameter passed, building based on structure - 87 ix = 0 - 88 for data in structure: - 89 if not isinstance(data[0], (list, np.ndarray)): - 90 # Data in structure is a tuple of (feature index (integer), feature attributes) - 91 feature_ix, feature_attributes = data - 92 - 93 if ix < feature_ix: - 94 # Filling out the dataset up to column index feature_ix - 95 for i in range(ix, feature_ix): - 96 x = self._generate_feature( - 97 n_samples, - 98 cardinality=cardinality, - 99 ensure_rep=ensure_rep, -100 random_values=random_values, -101 low=low, -102 high=high, -103 ) -104 X[ix] = x -105 ix += 1 -106 -107 x = self._configure_generate_feature( -108 feature_attributes, -109 n_samples, -110 ensure_rep=ensure_rep, -111 random_values=random_values, -112 low=low, -113 high=high, -114 ) -115 X[ix] = x -116 ix += 1 -117 -118 else: -119 # Data in structure is a tuple of (list of feature indexes, feature attributes) -120 feature_ixs, feature_attributes = data -121 -122 for feature_ix in feature_ixs: -123 # Filling out the dataset up to feature_ix -124 if ix < feature_ix: -125 for i in range(ix, feature_ix): -126 x = self._generate_feature( -127 n_samples, -128 cardinality=cardinality, -129 ensure_rep=ensure_rep, -130 random_values=random_values, -131 low=low, -132 high=high, -133 ) -134 X[ix] = x -135 ix += 1 -136 -137 x = self._configure_generate_feature( -138 feature_attributes, -139 n_samples, -140 ensure_rep=ensure_rep, -141 random_values=random_values, -142 low=low, -143 high=high, -144 ) -145 -146 X[ix] = x -147 ix += 1 -148 -149 if ix < n_features: -150 # Fill out the rest of the 
dataset -151 for i in range(ix, n_features): -152 x = self._generate_feature( -153 n_samples, -154 cardinality=cardinality, -155 ensure_rep=ensure_rep, -156 random_values=random_values, -157 low=low, -158 high=high, -159 ) -160 X[i] = x -161 -162 return X.T + 19 def __init__(self, seed: int = 42): + 20 np.random.seed(seed) + 21 self.dataset_info = { + 22 'general': {}, + 23 'combinations': [], + 24 'correlations': [], + 25 'duplicates': [], + 26 'labels': {}, + 27 'noise': [], + 28 } + 29 + 30 def __repr__(self): + 31 return f"CategoricalClassification(dataset_info={self.dataset_info})" + 32 + 33 def generate_data( + 34 self, + 35 n_features: int, + 36 n_samples: int, + 37 cardinality: int = 5, + 38 structure: list | ArrayLike | None = None, + 39 ensure_rep: bool = False, + 40 random_values: bool | None = False, + 41 low: int | None = 0, + 42 high: int | None = 1000, + 43 seed: int = 42, + 44 ) -> np.ndarray: + 45 + 46 """ + 47 Generates dataset based on given parameters + 48 :param n_features: number of generated features + 49 :param n_samples: number of generated samples + 50 :param cardinality: default cardinality of the dataset + 51 :param structure: structure of the dataset + 52 :param ensure_rep: flag, ensures all given values represented + 53 :param random_values: flag, enables random (integer) feature values from set [low, high] + 54 :param low: sets lower bound of random feature values + 55 :param high: sets high bound of random feature values + 56 :param seed: sets seed of numpy random + 57 :return: X, 2D dataset + 58 """ + 59 + 60 self.dataset_info.update({ + 61 'general': { + 62 'n_features': n_features, + 63 'n_samples': n_samples, + 64 'cardinality': cardinality, + 65 'structure': structure, + 66 'ensure_rep': ensure_rep, + 67 'seed': seed, + 68 }, + 69 }) + 70 + 71 np.random.seed(seed) + 72 X = np.empty([n_features, n_samples]) + 73 + 74 # No specific structure parameter passed + 75 if structure is None: + 76 for i in range(n_features): + 77 x = self._generate_feature( + 78 n_samples, + 79 cardinality=cardinality, + 80 ensure_rep=ensure_rep, + 81 random_values=random_values, + 82 low=low, + 83 high=high, + 84 ) + 85 X[i] = x + 86 # Structure parameter passed, building based on structure + 87 else: + 88 ix = 0 + 89 for data in structure: + 90 + 91 # Data in structure is a tuple of (feature index (integer), feature attributes) + 92 if not isinstance(data[0], (list, np.ndarray)): + 93 feature_ix, feature_attributes = data + 94 + 95 # Filling out the dataset up to column index feature_ix + 96 if ix < feature_ix: + 97 for i in range(ix, feature_ix): + 98 x = self._generate_feature( + 99 n_samples, +100 cardinality=cardinality, +101 ensure_rep=ensure_rep, +102 random_values=random_values, +103 low=low, +104 high=high, +105 ) +106 X[ix] = x +107 ix += 1 +108 +109 x = self._configure_generate_feature( +110 feature_attributes, +111 n_samples, +112 ensure_rep=ensure_rep, +113 random_values=random_values, +114 low=low, +115 high=high, +116 ) +117 X[ix] = x +118 ix += 1 +119 +120 # Data in structure is a tuple of (list of feature indexes, feature attributes) +121 else: +122 feature_ixs, feature_attributes = data +123 +124 # Filling out the dataset up to feature_ix +125 for feature_ix in feature_ixs: +126 if ix < feature_ix: +127 for i in range(ix, feature_ix): +128 x = self._generate_feature( +129 n_samples, +130 cardinality=cardinality, +131 ensure_rep=ensure_rep, +132 random_values=random_values, +133 low=low, +134 high=high, +135 ) +136 X[ix] = x +137 ix += 1 +138 +139 x = 
self._configure_generate_feature( +140 feature_attributes, +141 n_samples, +142 ensure_rep=ensure_rep, +143 random_values=random_values, +144 low=low, +145 high=high, +146 ) +147 +148 X[ix] = x +149 ix += 1 +150 +151 # Fill out the rest of the dataset +152 if ix < n_features: +153 for i in range(ix, n_features): +154 x = self._generate_feature( +155 n_samples, +156 cardinality=cardinality, +157 ensure_rep=ensure_rep, +158 random_values=random_values, +159 low=low, +160 high=high, +161 ) +162 X[i] = x 163 -164 def _configure_generate_feature( -165 self, -166 feature_attributes: int | list | ArrayLike, -167 n_samples: int, -168 ensure_rep: bool = False, -169 random_values: bool | None = False, -170 low: int | None = 0, -171 high: int | None = 1000, -172 ) -> np.ndarray: -173 -174 """ -175 Helper function, calls _generate_feature with appropriate parameters based on feature_attributes -176 :param feature_attributes: either integer (cardinality) or list of feature attributes -177 :param n_samples: number of samples in dataset -178 :param ensure_rep: ensures all values are represented at least once in the feature vector -179 :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1 -180 :param low: lower bound of random feature vector values -181 :param high: upper bound of random feature vector values -182 :return: feature vector -183 """ -184 -185 if not isinstance(feature_attributes, (list, np.ndarray)): -186 # feature_cardinality is just an integer, generate feature either with random values or -187 # [low, low+cardinality] -188 x = self._generate_feature( -189 n_samples, -190 cardinality=feature_attributes, -191 ensure_rep=ensure_rep, -192 random_values=random_values, -193 low=low, -194 high=high, -195 ) -196 else: -197 # feature_cardinality is a list of [value_domain, value_frequencies] -198 if isinstance(feature_attributes[0], (list, np.ndarray)): -199 value_domain, value_frequencies = feature_attributes -200 x = self._generate_feature( -201 n_samples, -202 vec=value_domain, -203 ensure_rep=ensure_rep, -204 p=value_frequencies, -205 ) -206 else: -207 # feature_cardinality is value_domain (list of values for feature) -208 value_domain = feature_attributes -209 x = self._generate_feature( -210 n_samples, -211 vec=value_domain, -212 ensure_rep=ensure_rep, -213 ) -214 -215 return x +164 return X.T +165 +166 def _configure_generate_feature( +167 self, +168 feature_attributes: int | list | ArrayLike, +169 n_samples: int, +170 ensure_rep: bool = False, +171 random_values: bool | None = False, +172 low: int | None = 0, +173 high: int | None = 1000, +174 ) -> np.ndarray: +175 +176 """ +177 Helper function, calls _generate_feature with appropriate parameters based on feature_attributes +178 :param feature_attributes: either integer (cardinality) or list of feature attributes +179 :param n_samples: number of samples in dataset +180 :param ensure_rep: ensures all values are represented at least once in the feature vector +181 :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1 +182 :param low: lower bound of random feature vector values +183 :param high: upper bound of random feature vector values +184 :return: feature vector +185 """ +186 +187 # feature_cardinality is just an integer, generate feature either with random values or +188 # [low, low+cardinality] +189 if not isinstance(feature_attributes, (list, np.ndarray)): +190 x = self._generate_feature( +191 n_samples, 
+192 cardinality=feature_attributes, +193 ensure_rep=ensure_rep, +194 random_values=random_values, +195 low=low, +196 high=high, +197 ) +198 # feature_cardinality is a list of [value_domain, value_frequencies] +199 else: +200 if isinstance(feature_attributes[0], (list, np.ndarray)): +201 value_domain, value_frequencies = feature_attributes +202 x = self._generate_feature( +203 n_samples, +204 vec=value_domain, +205 ensure_rep=ensure_rep, +206 p=value_frequencies, +207 ) +208 # feature_cardinality is value_domain (list of values for feature) +209 else: +210 value_domain = feature_attributes +211 x = self._generate_feature( +212 n_samples, +213 vec=value_domain, +214 ensure_rep=ensure_rep, +215 ) 216 -217 def _generate_feature( -218 self, -219 size: int, -220 vec: list[int] | ArrayLike | None = None, -221 cardinality: int = 5, -222 ensure_rep: bool = False, -223 random_values: bool | None = False, -224 low: int | None = 0, -225 high: int | None = 1000, -226 p: list[float] | np.ndarray | None = None, -227 ) -> np.ndarray: -228 """ -229 Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value. -230 :param vec: list of feature values -231 :param cardinality: single value cardinality -232 :param size: length of feature vector -233 :param ensure_rep: ensures all values are represented at least once in the feature vector -234 :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1 -235 :param low: lower bound of random feature vector values -236 :param high: upper bound of random feature vector values -237 :param p: list of probabilities of each value -238 :return: feature vector x -239 """ -240 -241 if vec is None: -242 if random_values: -243 vec = np.random.choice(range(low, high + 1), cardinality, replace=False) -244 else: -245 vec = np.arange(low, low + cardinality, 1) -246 else: -247 vec = np.array(vec) -248 -249 if p is None: -250 v_shift = vec - vec[np.random.randint(len(vec))] -251 p = norm.pdf(v_shift, scale=3) -252 else: -253 p = np.array(p) -254 -255 p = p / p.sum() +217 return x +218 +219 def _generate_feature( +220 self, +221 size: int, +222 vec: list[int] | ArrayLike | None = None, +223 cardinality: int = 5, +224 ensure_rep: bool = False, +225 random_values: bool | None = False, +226 low: int | None = 0, +227 high: int | None = 1000, +228 p: list[float] | np.ndarray | None = None, +229 ) -> np.ndarray: +230 """ +231 Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value. 
+232 :param vec: list of feature values +233 :param cardinality: single value cardinality +234 :param size: length of feature vector +235 :param ensure_rep: ensures all values are represented at least once in the feature vector +236 :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1 +237 :param low: lower bound of random feature vector values +238 :param high: upper bound of random feature vector values +239 :param p: list of probabilities of each value +240 :return: feature vector x +241 """ +242 +243 if vec is None: +244 if random_values: +245 vec = np.random.choice(range(low, high + 1), cardinality, replace=False) +246 else: +247 vec = np.arange(low, low + cardinality, 1) +248 else: +249 vec = np.array(vec) +250 +251 if p is None: +252 v_shift = vec - vec[np.random.randint(len(vec))] +253 p = norm.pdf(v_shift, scale=3) +254 else: +255 p = np.array(p) 256 -257 if ensure_rep and len(vec) < size: -258 sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p) -259 sampled_values = np.append(sampled_values, vec) -260 else: -261 sampled_values = np.random.choice(vec, size=size, p=p) -262 -263 np.random.shuffle(sampled_values) -264 return sampled_values -265 -266 def generate_combinations( -267 self, -268 X: ArrayLike, -269 feature_indices: list[int] | ArrayLike, -270 combination_function: Optional = None, -271 combination_type: Literal = 'linear', -272 ) -> np.ndarray: -273 """ -274 Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X -275 :param X: dataset -276 :param feature_indices: indexes of features to be in combination -277 :param combination_function: optional custom function for combining feature vectors -278 :param combination_type: string flag, either liner or nonlinear, defining combination type -279 :return: X with added resultant feature -280 """ -281 -282 selected_features = X[:, feature_indices] +257 p = p / p.sum() +258 +259 if ensure_rep and len(vec) < size: +260 sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p) +261 sampled_values = np.append(sampled_values, vec) +262 else: +263 sampled_values = np.random.choice(vec, size=size, p=p) +264 +265 np.random.shuffle(sampled_values) +266 return sampled_values +267 +268 def generate_combinations( +269 self, +270 X: ArrayLike, +271 feature_indices: list[int] | ArrayLike, +272 combination_function: Optional = None, +273 combination_type: Literal['linear', 'nonlinear'] = 'linear', +274 ) -> np.ndarray: +275 """ +276 Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X +277 :param X: dataset +278 :param feature_indices: indexes of features to be in combination +279 :param combination_function: optional custom function for combining feature vectors +280 :param combination_type: string flag, either liner or nonlinear, defining combination type +281 :return: X with added resultant feature +282 """ 283 -284 if combination_function is None: -285 if combination_type == 'linear': -286 combination_function = lambda x: np.sum(x, axis=1) -287 elif combination_type == 'nonlinear': -288 combination_function = lambda x: np.sin(np.sum(x, axis=1)) -289 else: -290 combination_type = str(combination_function.__name__) -291 -292 combination_result = combination_function(selected_features) +284 selected_features = X[:, feature_indices] +285 +286 if combination_function is None: +287 if combination_type == 'linear': +288 combination_function = lambda x: np.sum(x, axis=1) +289 elif 
combination_type == 'nonlinear': +290 combination_function = lambda x: np.sin(np.sum(x, axis=1)) +291 else: +292 combination_type = str(combination_function.__name__) 293 -294 combination_ix = len(X[0]) +294 combination_result = combination_function(selected_features) 295 -296 self.dataset_info['combinations'].append({ -297 'feature_indices': feature_indices, -298 'combination_type': combination_type, -299 'combination_ix': combination_ix, -300 }) -301 -302 return np.column_stack((X, combination_result)) +296 combination_ix = len(X[0]) +297 +298 self.dataset_info['combinations'].append({ +299 'feature_indices': feature_indices, +300 'combination_type': combination_type, +301 'combination_ix': combination_ix, +302 }) 303 -304 def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray: -305 """ -306 Performs bitwise XOR operation on two integer arrays -307 :param arr: features to perform XOR operation on -308 :return: bitwise XOR result -309 """ -310 arrT = arr.T -311 arrT = arrT.astype(int) -312 out = np.bitwise_xor(arrT[0], arrT[1]) -313 if len(arrT) > 2: -314 for i in range(2, len(arrT)): -315 out = np.bitwise_xor(out, arrT[i]) -316 -317 return out.T +304 return np.column_stack((X, combination_result)) +305 +306 def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray: +307 """ +308 Performs bitwise XOR operation on two integer arrays +309 :param arr: features to perform XOR operation on +310 :return: bitwise XOR result +311 """ +312 arrT = arr.T +313 arrT = arrT.astype(int) +314 out = np.bitwise_xor(arrT[0], arrT[1]) +315 if len(arrT) > 2: +316 for i in range(2, len(arrT)): +317 out = np.bitwise_xor(out, arrT[i]) 318 -319 def _and(self, arr: list[int] | ArrayLike) -> np.ndarray: -320 """ -321 Performs bitwise AND operation on two integer arrays -322 :param arr: features to perform AND operation on -323 :return: bitwise AND result -324 """ -325 arrT = arr.T -326 arrT = arrT.astype(int) -327 out = np.bitwise_xor(arrT[0], arrT[1]) -328 if len(arrT) > 2: -329 for i in range(2, len(arrT)): -330 out = np.bitwise_and(out, arrT[i]) -331 -332 return out.T +319 return out.T +320 +321 def _and(self, arr: list[int] | ArrayLike) -> np.ndarray: +322 """ +323 Performs bitwise AND operation on two integer arrays +324 :param arr: features to perform AND operation on +325 :return: bitwise AND result +326 """ +327 arrT = arr.T +328 arrT = arrT.astype(int) +329 out = np.bitwise_xor(arrT[0], arrT[1]) +330 if len(arrT) > 2: +331 for i in range(2, len(arrT)): +332 out = np.bitwise_and(out, arrT[i]) 333 -334 def _or(self, arr: list[int] | ArrayLike) -> np.ndarray: -335 """ -336 Performs bitwise OR operation on two integer arrays -337 :param arr: features to perform OR operation on -338 :return: bitwise OR result -339 """ -340 arrT = arr.T -341 arrT = arrT.astype(int) -342 out = np.bitwise_xor(arrT[0], arrT[1]) -343 if len(arrT) > 2: -344 for i in range(2, len(arrT)): -345 out = np.bitwise_or(out, arrT[i]) -346 -347 return out.T +334 return out.T +335 +336 def _or(self, arr: list[int] | ArrayLike) -> np.ndarray: +337 """ +338 Performs bitwise OR operation on two integer arrays +339 :param arr: features to perform OR operation on +340 :return: bitwise OR result +341 """ +342 arrT = arr.T +343 arrT = arrT.astype(int) +344 out = np.bitwise_xor(arrT[0], arrT[1]) +345 if len(arrT) > 2: +346 for i in range(2, len(arrT)): +347 out = np.bitwise_or(out, arrT[i]) 348 -349 def generate_correlated( -350 self, -351 X: ArrayLike, -352 feature_indices: list[int] | ArrayLike, -353 r: float = 0.8, -354 ) -> np.ndarray: -355 -356 
""" -357 Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0. -358 :param X: dataset -359 :param feature_indices: indices of features to generate correlated feature to -360 :param r: (Pearson) correlation factor -361 :return: X with generated correlated features -362 """ -363 -364 if not isinstance(feature_indices, (list, np.ndarray)): -365 feature_indices = np.array([feature_indices]) -366 -367 if len(feature_indices) > 1: -368 correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1) -369 else: -370 correlated_ixs = len(X[0]) -371 -372 selected_features = X[:, feature_indices] -373 transposed = np.transpose(selected_features) -374 correlated_features = [] -375 -376 for t in transposed: -377 theta = np.arccos(r) -378 t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10) -379 -380 rand = np.random.normal(0, 1, len(t_standard)) -381 rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10) -382 -383 M = np.column_stack((t_standard, rand)) -384 M_centred = (M - np.mean(M, axis=0)) -385 -386 Id = np.eye(len(t)) -387 Q = qr(M_centred[:, [0]], mode='economic')[0] -388 P = np.dot(Q, Q.T) -389 orthogonal_projection = np.dot(Id - P, M_centred[:, 1]) -390 M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection)) -391 -392 Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0)))) -393 corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0] -394 -395 correlated_features.append(corr) +349 return out.T +350 +351 def generate_correlated( +352 self, +353 X: ArrayLike, +354 feature_indices: list[int] | ArrayLike, +355 r: float = 0.8, +356 ) -> np.ndarray: +357 +358 """ +359 Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0. 
+360 :param X: dataset +361 :param feature_indices: indices of features to generate correlated feature to +362 :param r: (Pearson) correlation factor +363 :return: X with generated correlated features +364 """ +365 +366 if not isinstance(feature_indices, (list, np.ndarray)): +367 feature_indices = np.array([feature_indices]) +368 +369 if len(feature_indices) > 1: +370 correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1) +371 else: +372 correlated_ixs = len(X[0]) +373 +374 selected_features = X[:, feature_indices] +375 transposed = np.transpose(selected_features) +376 correlated_features = [] +377 +378 for t in transposed: +379 theta = np.arccos(r) +380 t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10) +381 +382 rand = np.random.normal(0, 1, len(t_standard)) +383 rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10) +384 +385 M = np.column_stack((t_standard, rand)) +386 M_centred = (M - np.mean(M, axis=0)) +387 +388 Id = np.eye(len(t)) +389 Q = qr(M_centred[:, [0]], mode='economic')[0] +390 P = np.dot(Q, Q.T) +391 orthogonal_projection = np.dot(Id - P, M_centred[:, 1]) +392 M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection)) +393 +394 Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0)))) +395 corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0] 396 -397 correlated_features = np.transpose(correlated_features) +397 correlated_features.append(corr) 398 -399 self.dataset_info['correlations'].append({ -400 'feature_indices': feature_indices, -401 'correlated_indices': correlated_ixs, -402 'correlation_factor': r, -403 }) -404 -405 return np.column_stack((X, correlated_features)) +399 correlated_features = np.transpose(correlated_features) +400 +401 self.dataset_info['correlations'].append({ +402 'feature_indices': feature_indices, +403 'correlated_indices': correlated_ixs, +404 'correlation_factor': r, +405 }) 406 -407 def generate_duplicates( -408 self, -409 X: ArrayLike, -410 feature_indices: list[int] | ArrayLike, -411 ) -> np.ndarray: -412 """ -413 Generates duplicate features -414 :param X: dataset -415 :param feature_indices: indices of features to duplicate -416 :return: dataset with duplicated features -417 """ -418 if not isinstance(feature_indices, (list, np.ndarray)): -419 feature_indices = np.array([feature_indices]) -420 -421 duplicated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices) - 1), 1) +407 return np.column_stack((X, correlated_features)) +408 +409 def generate_duplicates( +410 self, +411 X: ArrayLike, +412 feature_indices: list[int] | ArrayLike, +413 ) -> np.ndarray: +414 """ +415 Generates duplicate features +416 :param X: dataset +417 :param feature_indices: indices of features to duplicate +418 :return: dataset with duplicated features +419 """ +420 if not isinstance(feature_indices, (list, np.ndarray)): +421 feature_indices = np.array([feature_indices]) 422 -423 selected_features = X[:, feature_indices] +423 duplicated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices) - 1), 1) 424 -425 self.dataset_info['duplicates'].append({ -426 'feature_indices': feature_indices, -427 'duplicate_indices': duplicated_ixs, -428 }) -429 -430 return np.column_stack((X, selected_features)) +425 selected_features = X[:, feature_indices] +426 +427 self.dataset_info['duplicates'].append({ +428 'feature_indices': feature_indices, +429 'duplicate_indices': duplicated_ixs, +430 }) 431 -432 def generate_labels( -433 self, -434 X: ArrayLike, -435 n: int = 2, -436 p: float | list[float] | ArrayLike = 0.5, 
-437 k: int | float = 2, -438 decision_function: Optional = None, -439 class_relation: str = 'linear', -440 balance: bool = False, -441 ): -442 """ -443 Generates labels for dataset X -444 :param X: dataset -445 :param n: number of class labels -446 :param p: class distribution -447 :param k: constant -448 :param decision_function: optional user-defined decision function -449 :param class_relation: string, either 'linear', 'nonlinear', or 'cluster' -450 :param balance: boolean, whether to balance clustering class labels -451 :return: array of labels, corresponding to dataset X -452 """ -453 -454 if isinstance(p, (list, np.ndarray)): -455 if sum(p) > 1: raise ValueError('sum of values in must be less than 1.0') -456 if len(p) > n: raise ValueError('length of p must equal n') +432 return np.column_stack((X, selected_features)) +433 +434 def generate_labels( +435 self, +436 X: ArrayLike, +437 n: int = 2, +438 p: float | list[float] | ArrayLike = 0.5, +439 k: int | float = 2, +440 decision_function: Optional = None, +441 class_relation: Literal['linear', 'nonlinear', 'cluster'] = 'linear', +442 balance: bool = False, +443 random_state: int = 42, +444 ): +445 """ +446 Generates labels for dataset X +447 :param X: dataset +448 :param n: number of class labels +449 :param p: class distribution +450 :param k: constant +451 :param decision_function: optional user-defined decision function +452 :param class_relation: string, either 'linear', 'nonlinear', or 'cluster' +453 :param balance: boolean, whether to balance clustering class labels +454 :param random_state: seed for KMeans clustering, defaults to 42 +455 :return: array of labels, corresponding to dataset X +456 """ 457 -458 if p > 1: raise ValueError('p must be less than 1.0') -459 -460 n_samples, n_features = X.shape +458 if isinstance(p, (list, np.ndarray)): +459 if sum(p) > 1: raise ValueError('sum of values in must be less than 1.0') +460 if len(p) > n: raise ValueError('length of p must equal n') 461 -462 if decision_function is None: -463 if class_relation == 'linear': -464 decision_function = lambda x: np.sum(2 * x + 3, axis=1) -465 elif class_relation == 'nonlinear': -466 decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1) -467 elif class_relation == 'cluster': -468 decision_function = None -469 else: -470 class_relation = str(decision_function.__name__) -471 -472 y = [] -473 if decision_function is not None: -474 if n > 2: -475 if type(p) != list: -476 p = 1 / n -477 percentiles = [p * 100] -478 for i in range(1, n - 1): -479 percentiles.append(percentiles[i - 1] + (p * 100)) -480 -481 decision_boundary = decision_function(X) -482 p_points = np.percentile(decision_boundary, percentiles) -483 -484 y = np.zeros_like(decision_boundary, dtype=int) -485 for p_point in p_points: -486 y += (decision_boundary > p_point) -487 else: -488 decision_boundary = decision_function(X) -489 percentiles = [x * 100 for x in p] -490 -491 for i in range(1, len(percentiles) - 1): -492 percentiles[i] += percentiles[i - 1] -493 -494 percentiles.insert(0, 0) -495 percentiles.pop() -496 print(percentiles) +462 if p > 1: raise ValueError('p must be less than 1.0') +463 +464 n_samples, n_features = X.shape +465 +466 if decision_function is None: +467 if class_relation == 'linear': +468 decision_function = lambda x: np.sum(2 * x + 3, axis=1) +469 elif class_relation == 'nonlinear': +470 decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1) +471 elif class_relation == 'cluster': +472 decision_function = None +473 else: 
+474 class_relation = str(decision_function.__name__) +475 +476 y = [] +477 if decision_function is not None: +478 if n > 2: +479 if type(p) != list: +480 p = 1 / n +481 percentiles = [p * 100] +482 for i in range(1, n - 1): +483 percentiles.append(percentiles[i - 1] + (p * 100)) +484 +485 decision_boundary = decision_function(X) +486 p_points = np.percentile(decision_boundary, percentiles) +487 +488 y = np.zeros_like(decision_boundary, dtype=int) +489 for p_point in p_points: +490 y += (decision_boundary > p_point) +491 else: +492 decision_boundary = decision_function(X) +493 percentiles = [x * 100 for x in p] +494 +495 for i in range(1, len(percentiles) - 1): +496 percentiles[i] += percentiles[i - 1] 497 -498 p_points = np.percentile(decision_boundary, percentiles) -499 print(p_points) -500 -501 y = np.zeros_like(decision_boundary, dtype=int) -502 for i in range(1, n): -503 p_point = p_points[i] -504 for j in range(len(decision_boundary)): -505 if decision_boundary[j] > p_point: -506 y[j] += 1 -507 else: -508 decision_boundary = decision_function(X) -509 p_point = np.percentile(decision_boundary, p * 100) -510 y = np.where(decision_boundary > p_point, 1, 0) -511 else: -512 if p == 0.5: -513 p = 1.0 -514 else: -515 p = [p, 1 - p] -516 y = self._cluster_data(X, n, p=p, balance=balance) -517 -518 self.dataset_info.update({ -519 'labels': { -520 'class_relation': class_relation, -521 'n_class': n, -522 }, -523 }) -524 -525 return y -526 -527 def _cluster_data( -528 self, -529 X: ArrayLike, -530 n: int, -531 p: float | list[float] | ArrayLike | None = 1.0, -532 balance: bool = False, -533 ) -> np.ndarray: -534 """ -535 Cluster data using kmeans -536 :param X: dataset -537 :param n: number of clusters -538 :param p: class distribution -539 :param balance: balance the clusters according to p -540 :return: array of labels, corresponding to dataset X -541 """ -542 -543 kmeans = KMeans(n_clusters=n) -544 -545 kmeans.fit(X) -546 -547 cluster_labels = kmeans.labels_ +498 percentiles.insert(0, 0) +499 percentiles.pop() +500 print(percentiles) +501 +502 p_points = np.percentile(decision_boundary, percentiles) +503 print(p_points) +504 +505 y = np.zeros_like(decision_boundary, dtype=int) +506 for i in range(1, n): +507 p_point = p_points[i] +508 for j in range(len(decision_boundary)): +509 if decision_boundary[j] > p_point: +510 y[j] += 1 +511 else: +512 decision_boundary = decision_function(X) +513 p_point = np.percentile(decision_boundary, p * 100) +514 y = np.where(decision_boundary > p_point, 1, 0) +515 else: +516 if p == 0.5: +517 p = 1.0 +518 else: +519 p = [p, 1 - p] +520 y = self._cluster_data(X, n, p=p, balance=balance, random_state=random_state) +521 +522 self.dataset_info.update({ +523 'labels': { +524 'class_relation': class_relation, +525 'n_class': n, +526 }, +527 }) +528 +529 return y +530 +531 def _cluster_data( +532 self, +533 X: ArrayLike, +534 n: int, +535 p: float | list[float] | ArrayLike | None = 1.0, +536 balance: bool = False, +537 random_state: int = 42, +538 ) -> np.ndarray: +539 """ +540 Cluster data using kmeans +541 :param X: dataset +542 :param n: number of clusters +543 :param p: class distribution +544 :param balance: balance the clusters according to p +545 :random_state: seed for KMeans clustering, defaults to 42 +546 :return: array of labels, corresponding to dataset X +547 """ 548 -549 if not isinstance(p, (list, np.ndarray)): # Fully balanced clusters -550 samples_per_cluster = [len(X) // n] * n -551 else: -552 samples = len(X) -553 samples_per_cluster = [] -554 if 
not isinstance(p, (list, np.ndarray)):
-555                samples_per_cluster.append(int(samples * p) // n)
-556                samples_per_cluster.append(int(samples * (1 - p)) // n)
-557            else:
-558                if len(p) == n:
-559                    for val in p:
-560                        samples_per_cluster.append(int(samples * val))
-561                else:
-562                    raise Exception('Length of balance parameter must equal number of clusters.')
-563
-564        # Adjust cluster sizes
-565        if balance:
-566            adjustments = []
-567            overflow_samples = []
-568            overflow_indices = []
-569            for i in range(n):
-570                cluster_size = np.sum(cluster_labels == i)
-571
-572                adjustment = samples_per_cluster[i] - cluster_size
-573                adjustments.append(adjustment)
-574
-575                if adjustment < 0:  # Cluter is too large
-576
-577                    centroid = kmeans.cluster_centers_[i]
-578                    dataset_indices = np.where(cluster_labels == i)[0]  # Indices of samples in dataset
-579                    cluster_samples = np.copy(X[dataset_indices])
-580
-581                    distances = np.linalg.norm(
-582                        cluster_samples - centroid,
-583                        axis=1,
-584                    )  # Distances of cluster samples to cluster centroid
-585                    cluster_sample_indices = np.argsort(distances)
-586                    dataset_indices_sorted = dataset_indices[
-587                        cluster_sample_indices
-588                    ]  # Indices of samples sorted by sample distance to cluster centroid
-589
-590                    overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:]  # Overflow samples
-591                    dataset_indices_sorted = dataset_indices_sorted[
-592                        samples_per_cluster[i]:
-593                    ]  # Dataset indices of overflow samples
-594
-595                    for i in range(len(overflow_sample_indices)):
-596                        overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
-597                        overflow_indices.append(dataset_indices_sorted[i])
-598
-599            overflow_samples = np.array(overflow_samples)
-600            overflow_indices = np.array(overflow_indices)
-601
-602            # Making adjustments
-603            for i in range(n):
-604
-605                if adjustments[i] > 0:
-606                    centroid = kmeans.cluster_centers_[i]
-607                    distances = np.linalg.norm(overflow_samples - centroid, axis=1)
-608
-609                    closest_sample_indices = np.argsort(distances)
-610
-611                    overflow_indices_sorted = overflow_indices[closest_sample_indices]
+549        kmeans = KMeans(n_clusters=n, random_state=random_state)
+550
+551        kmeans.fit(X)
+552
+553        cluster_labels = kmeans.labels_
+554
+555        # Fully balanced clusters
+556        if not isinstance(p, (list, np.ndarray)):
+557            samples_per_cluster = [len(X) // n] * n
+558        else:
+559            samples = len(X)
+560            samples_per_cluster = []
+561            if not isinstance(p, (list, np.ndarray)):
+562                samples_per_cluster.append(int(samples * p) // n)
+563                samples_per_cluster.append(int(samples * (1 - p)) // n)
+564            else:
+565                if len(p) == n:
+566                    for val in p:
+567                        samples_per_cluster.append(int(samples * val))
+568                else:
+569                    raise Exception('Length of balance parameter must equal number of clusters.')
+570
+571        # Adjust cluster sizes
+572        if balance:
+573            adjustments = []
+574            overflow_samples = []
+575            overflow_indices = []
+576            for i in range(n):
+577                cluster_size = np.sum(cluster_labels == i)
+578
+579                adjustment = samples_per_cluster[i] - cluster_size
+580                adjustments.append(adjustment)
+581
+582                # Cluster is too large
+583                if adjustment < 0:
+584                    centroid = kmeans.cluster_centers_[i]
+585                    # Indices of samples in dataset
+586                    dataset_indices = np.where(cluster_labels == i)[0]
+587                    cluster_samples = np.copy(X[dataset_indices])
+588
+589                    distances = np.linalg.norm(
+590                        cluster_samples - centroid,
+591                        axis=1,
+592                    )  # Distances of cluster samples to cluster centroid
+593                    cluster_sample_indices = np.argsort(distances)
+594                    dataset_indices_sorted = dataset_indices[
+595                        cluster_sample_indices
+596                    ]  # Indices of samples sorted by sample distance to cluster centroid
+597
+598                    overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:]  # Overflow samples
+599                    dataset_indices_sorted = dataset_indices_sorted[
+600                        samples_per_cluster[i]:
+601                    ]  # Dataset indices of overflow samples
+602
+603                    for i in range(len(overflow_sample_indices)):
+604                        overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
+605                        overflow_indices.append(dataset_indices_sorted[i])
+606
+607            overflow_samples = np.array(overflow_samples)
+608            overflow_indices = np.array(overflow_indices)
+609
+610            # Making adjustments
+611            for i in range(n):
 612
-613                    sample_indices_slice = closest_sample_indices[:adjustments[i]]
-614                    overflow_indices_slice = overflow_indices_sorted[:adjustments[i]]
-615
-616                    cluster_labels[overflow_indices_slice] = i
-617
-618                    overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0)
-619                    overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0)
+613                if adjustments[i] > 0:
+614                    centroid = kmeans.cluster_centers_[i]
+615                    distances = np.linalg.norm(overflow_samples - centroid, axis=1)
+616
+617                    closest_sample_indices = np.argsort(distances)
+618
+619                    overflow_indices_sorted = overflow_indices[closest_sample_indices]
 620
-621        return np.array(cluster_labels)
-622
-623    def generate_noise(
-624        self,
-625        X: ArrayLike,
-626        y: list[int] | ArrayLike,
-627        p: float = 0.2,
-628        type: Literal = 'categorical',
-629        missing_val: str | int | float = float('-inf'),
-630    ) -> np.ndarray:
-631
-632        """
-633        Simulates noise on given dataset X
-634        :param X: dataset to apply noise to
-635        :param y: required target labels for categorical noise generation
-636        :param p: amount of noise to apply. Defaults to 0.2
-637        :param type: type of noise to apply, either categorical or missing
-638        :param missing_val: value to simulate missing values. Defaults to float('-inf')
-639        :return: X with noise applied
-640        """
-641
-642        self.dataset_info['noise'].append({
-643            'type': type,
-644            'amount': p,
-645        })
-646
-647        if type == 'categorical':
-648            label_values, label_count = np.unique(y, return_counts=True)
-649            n_labels = len(label_values)
-650
-651            inds = y.argsort()
-652            y_sort = y[inds]
-653            X_sort = X[inds]
+621                    sample_indices_slice = closest_sample_indices[:adjustments[i]]
+622                    overflow_indices_slice = overflow_indices_sorted[:adjustments[i]]
+623
+624                    cluster_labels[overflow_indices_slice] = i
+625
+626                    overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0)
+627                    overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0)
+628
+629        return np.array(cluster_labels)
+630
+631    def generate_noise(
+632        self,
+633        X: ArrayLike,
+634        y: list[int] | ArrayLike,
+635        p: float = 0.2,
+636        type: Literal['categorical', 'missing'] = 'categorical',
+637        missing_val: str | int | float = float('-inf'),
+638    ) -> np.ndarray:
+639
+640        """
+641        Simulates noise on given dataset X
+642        :param X: dataset to apply noise to
+643        :param y: required target labels for categorical noise generation
+644        :param p: amount of noise to apply. Defaults to 0.2
+645        :param type: type of noise to apply, either categorical or missing
+646        :param missing_val: value to simulate missing values. Defaults to float('-inf')
+647        :return: X with noise applied
+648        """
+649
+650        self.dataset_info['noise'].append({
+651            'type': type,
+652            'amount': p,
+653        })
 654
-655            Xs_T = X_sort.T
-656            n = Xs_T.shape[1]
-657            n_flip = int(n * p)
 658
-659            for feature in Xs_T:
-660                unique_per_label = {}
-661
-662                for i in range(n_labels):
-663                    if i == 0:
-664                        unique = np.unique(feature[:label_count[i]])
-665                        unique_per_label[label_values[i]] = set(unique)
-666                    else:
-667                        unique = np.unique(feature[label_count[i - 1]:label_count[i - 1] + label_count[i] - 1])
-668                        unique_per_label[label_values[i]] = set(unique)
+655        if type == 'categorical':
+656            label_values, label_count = np.unique(y, return_counts=True)
+657            n_labels = len(label_values)
 658
+659            inds = y.argsort()
+660            y_sort = y[inds]
+661            X_sort = X[inds]
+662
+663            Xs_T = X_sort.T
+664            n = Xs_T.shape[1]
+665            n_flip = int(n * p)
+666
+667            for feature in Xs_T:
+668                unique_per_label = {}
 669
-670                ixs = np.random.choice(n, n_flip, replace=False)
-671
-672                for ix in ixs:
-673                    current_label = y_sort[ix]
-674                    possible_labels = np.where(label_values != current_label)[0]
-675
-676                    # find all unique values from labels != current label
-677                    values = set()
-678                    for key in possible_labels:
-679                        values = values.union(unique_per_label[key])
-680
-681                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
-682                    # current label
-683                    for val in unique_per_label[current_label] & values:
-684                        values.remove(val)
-685
-686                    if len(values) > 0:
-687                        val = np.random.choice(list(values))
+670                for i in range(n_labels):
+671                    if i == 0:
+672                        unique = np.unique(feature[:label_count[i]])
+673                        unique_per_label[label_values[i]] = set(unique)
+674                    else:
+675                        unique = np.unique(feature[label_count[i - 1]:label_count[i - 1] + label_count[i] - 1])
+676                        unique_per_label[label_values[i]] = set(unique)
+677
+678                ixs = np.random.choice(n, n_flip, replace=False)
+679
+680                for ix in ixs:
+681                    current_label = y_sort[ix]
+682                    possible_labels = np.where(label_values != current_label)[0]
+683
+684                    # find all unique values from labels != current label
+685                    values = set()
+686                    for key in possible_labels:
+687                        values = values.union(unique_per_label[key])
 688
-689                    else:
-690                        key = possible_labels[np.random.randint(len(possible_labels))]
-691                        values = unique_per_label[key]
-692                        val = np.random.choice(list(values))
+689                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
+690                    # current label
+691                    for val in unique_per_label[current_label] & values:
+692                        values.remove(val)
 693
-694                    feature[ix] = val
-695
-696            rev_ind = inds.argsort()
-697            X_noise = Xs_T.T
-698            X_noise = X_noise[rev_ind]
-699
-700            return X_noise
+694                    if len(values) > 0:
+695                        val = np.random.choice(list(values))
+696
+697                    else:
+698                        key = possible_labels[np.random.randint(len(possible_labels))]
+699                        values = unique_per_label[key]
+700                        val = np.random.choice(list(values))
 701
-702        elif type == 'missing':
-703            X_noise = np.copy(X)
-704            Xn_T = X_noise.T
-705            n = Xn_T.shape[1]
-706            n_missing = int(n * p)
-707            #print("n to delete:", n_missing)
-708
-709            for feature in Xn_T:
-710                ixs = np.random.choice(n, n_missing, replace=False)
-711
-712                for ix in ixs:
-713                    feature[ix] = missing_val
-714
-715            return Xn_T.T
+702                    feature[ix] = val
+703
+704            rev_ind = inds.argsort()
+705            X_noise = Xs_T.T
+706            X_noise = X_noise[rev_ind]
+707
+708            return X_noise
+709
+710        elif type == 'missing':
+711            X_noise = np.copy(X)
+712            Xn_T = X_noise.T
+713            n = Xn_T.shape[1]
+714            n_missing = int(n * p)
+715            #print("n to delete:", n_missing)
 716
-717    def downsample_dataset(
-718        self,
-719        X: ArrayLike,
-720        y: list[int] | ArrayLike,
-721        N: int | None = None,
-722        seed: int = 42,
-723        reshuffle: bool = False,
-724    ) -> tuple[np.ndarray, np.ndarray]:
-725
-726        """
-727        Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
-728        :param X: Dataset to downsample
-729        :param y: Labels corresponding to X
-730        :param N: Optional number of samples per class to downsample to
-731        :param seed: Seed for random state of resample function
-732        :param reshuffle: Reshuffle the dataset after downsampling
-733        :return: Balanced X and y after downsampling
-734        """
-735
-736        original_shape = X.shape
-737
-738        values, counts = np.unique(y, return_counts=True)
-739        if N is None:
-740            N = min(counts)
-741
-742        if N > min(counts):
-743            raise ValueError('N must be equal to or less than the number of samples in minority class')
-744
-745        X_arrays_list = []
-746        y_downsampled = []
-747        for label in values:
-748            X_label = [X[i] for i in range(len(y)) if y[i] == label]
-749            X_label_downsample = resample(
-750                X_label,
-751                replace=True,
-752                n_samples=N,
-753                random_state=seed,
-754            )
-755            X_arrays_list.append(X_label_downsample)
-756            ys = [label] * N
-757            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
-758
-759        X_downsampled = np.concatenate(X_arrays_list, axis=0)
-760
-761        if reshuffle:
-762            indices = np.arange(len(X_downsampled))
-763            np.random.shuffle(indices)
-764            X_downsampled = X_downsampled[indices]
-765            y_downsampled = y_downsampled[indices]
-766
-767        downsampled_shape = X_downsampled.shape
-768
-769        self.dataset_info.update({
-770            'downsampling': {
-771                'original_shape': original_shape,
-772                'downsampled_shape': downsampled_shape,
-773            },
-774        })
-775
-776        return X_downsampled, y_downsampled
+717            for feature in Xn_T:
+718                ixs = np.random.choice(n, n_missing, replace=False)
+719
+720                for ix in ixs:
+721                    feature[ix] = missing_val
+722
+723            return Xn_T.T
+724
+725        else:
+726            raise ValueError(f'Type {type} not supported')
+727
+728    def downsample_dataset(
+729        self,
+730        X: ArrayLike,
+731        y: list[int] | ArrayLike,
+732        N: int | None = None,
+733        seed: int = 42,
+734        reshuffle: bool = False,
+735    ) -> tuple[np.ndarray, np.ndarray]:
+736
+737        """
+738        Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
+739        :param X: Dataset to downsample
+740        :param y: Labels corresponding to X
+741        :param N: Optional number of samples per class to downsample to
+742        :param seed: Seed for random state of resample function
+743        :param reshuffle: Reshuffle the dataset after downsampling
+744        :return: Balanced X and y after downsampling
+745        """
+746
+747        original_shape = X.shape
+748
+749        values, counts = np.unique(y, return_counts=True)
+750        if N is None:
+751            N = min(counts)
+752
+753        if N > min(counts):
+754            raise ValueError('N must be equal to or less than the number of samples in minority class')
+755
+756        X_arrays_list = []
+757        y_downsampled = []
+758        for label in values:
+759            X_label = [X[i] for i in range(len(y)) if y[i] == label]
+760            X_label_downsample = resample(
+761                X_label,
+762                replace=True,
+763                n_samples=N,
+764                random_state=seed,
+765            )
+766            X_arrays_list.append(X_label_downsample)
+767            ys = [label] * N
+768            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
+769
+770        X_downsampled = np.concatenate(X_arrays_list, axis=0)
+771
+772        if reshuffle:
+773            indices = np.arange(len(X_downsampled))
+774            np.random.shuffle(indices)
+775            X_downsampled = X_downsampled[indices]
+776            y_downsampled = y_downsampled[indices]
 777
-778    def print_dataset(
-779        self,
-780        X: ArrayLike,
-781        y: ArrayLike,
-782    ):
-783        """
-784        Prints given dataset
-785        :param X: dataset
-786        :param y: labels
-787        :return:
-788        """
-789
-790        n_samples, n_features = X.shape
-791        n = 0
-792        for arr in X:
-793            print('[', end='')
-794            for i in range(n_features):
-795                if i == n_features - 1:
-796                    print(arr[i], end='')
-797                else:
-798                    print(arr[i], end=', ')
-799            print(f'], Label: {y[n]}')
-800            n += 1
-801
-802    def summarize(self):
-803
-804        print(f"Number of features: {self.dataset_info['general']['n_features']}")
-805        print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
-806        if self.dataset_info['downsampling']:
-807            print(
-808                f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
-809            )
-810        print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
-811        print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
+778        downsampled_shape = X_downsampled.shape
+779
+780        self.dataset_info.update({
+781            'downsampling': {
+782                'original_shape': original_shape,
+783                'downsampled_shape': downsampled_shape,
+784            },
+785        })
+786
+787        return X_downsampled, y_downsampled
+788
+789    def print_dataset(
+790        self,
+791        X: ArrayLike,
+792        y: ArrayLike,
+793    ):
+794        """
+795        Prints given dataset
+796        :param X: dataset
+797        :param y: labels
+798        :return:
+799        """
+800
+801        n_samples, n_features = X.shape
+802        n = 0
+803        for arr in X:
+804            print('[', end='')
+805            for i in range(n_features):
+806                if i == n_features - 1:
+807                    print(arr[i], end='')
+808                else:
+809                    print(arr[i], end=', ')
+810            print(f'], Label: {y[n]}')
+811            n += 1
 812
-813        print('-------------------------------------')
-814
-815        if len(self.dataset_info['combinations']) > 0:
-816            print('Combinations:')
-817            for comb in self.dataset_info['combinations']:
-818                print(
-819                    f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}",
-820                )
-821            print('-------------------------------------')
-822
-823        if len(self.dataset_info['correlations']) > 0:
-824            print('Correlations:')
-825            for corr in self.dataset_info['correlations']:
-826                print(
-827                    f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
-828                )
-829            print('-------------------------------------')
-830
-831        if len(self.dataset_info['duplicates']) > 0:
-832            print('Duplicates:')
-833            for dup in self.dataset_info['duplicates']:
-834                print(
-835                    f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
-836                )
-837            print('-------------------------------------')
-838
-839        if len(self.dataset_info['noise']) > 0:
-840            print('Simulated noise:')
-841            for noise in self.dataset_info['noise']:
-842                print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}")
-843            print('-------------------------------------')
-844
-845        print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
+813    """
+814    def summarize(self):
+815        # TODO: Logging function
+816    """
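
A minimal usage sketch of the paths this hunk touches, assuming the class is imported from outrank/algorithms/synthetic_data_generators/cc_generator.py as elsewhere in the series; the shapes, seed, and missing_val below are illustrative, not part of the patch:

    import numpy as np
    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()
    X = cc.generate_data(n_features=5, n_samples=200, cardinality=3)
    y = cc.generate_labels(X, n=2)

    # 'categorical' noise flips a fraction p of each feature to values seen
    # under other labels; 'missing' overwrites a fraction p with missing_val
    X_noisy = cc.generate_noise(X, y, p=0.2, type='categorical')
    X_missing = cc.generate_noise(X, y, p=0.2, type='missing', missing_val=-1)

    # balance classes down to the minority-class count, then reshuffle
    X_bal, y_bal = cc.downsample_dataset(X, y, reshuffle=True)
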

    @@ -949,838 +920,835 @@

      18    class CategoricalClassification:
      19
    - 20    def __init__(self):
    - 21        self.dataset_info = {
    - 22            'general': {},
    - 23            'combinations': [],
    - 24            'correlations': [],
    - 25            'duplicates': [],
    - 26            'labels': [],
    - 27            'noise': [],
    - 28        }
    - 29
    - 30    def __repr__(self):
    - 31        return f"CategoricalClassification(dataset_info={self.dataset_info})"
    - 32
    - 33    def generate_data(
    - 34        self,
    - 35        n_features: int,
    - 36        n_samples: int,
    - 37        cardinality: int = 5,
    - 38        structure: list | ArrayLike | None = None,
    - 39        ensure_rep: bool = False,
    - 40        random_values: bool | None = False,
    - 41        low: int | None = 0,
    - 42        high: int | None = 1000,
    - 43        seed: int = 42,
    - 44    ) -> np.ndarray:
    - 45
    - 46        """
    - 47        Generates dataset based on given parameters
    - 48        :param n_features: number of generated features
    - 49        :param n_samples: number of generated samples
    - 50        :param cardinality: default cardinality of the dataset
    - 51        :param structure: structure of the dataset
    - 52        :param ensure_rep: flag, ensures all given values represented
    - 53        :param random_values: flag, enables random (integer) feature values from set [low, high]
    - 54        :param low: sets lower bound of random feature values
    - 55        :param high: sets high bound of random feature values
    - 56        :param seed: sets seed of numpy random
    - 57        :return: X, 2D dataset
    - 58        """
    - 59
    - 60        self.dataset_info.update({
    - 61            'general': {
    - 62                'n_features': n_features,
    - 63                'n_samples': n_samples,
    - 64                'cardinality': cardinality,
    - 65                'structure': structure,
    - 66                'ensure_rep': ensure_rep,
    - 67                'seed': seed,
    - 68            },
    - 69        })
    - 70
    - 71        np.random.seed(seed)
    - 72        X = np.empty([n_features, n_samples])
    - 73
    - 74        if structure is None:
    - 75            # No specific structure parameter passed
    - 76            for i in range(n_features):
    - 77                x = self._generate_feature(
    - 78                    n_samples,
    - 79                    cardinality=cardinality,
    - 80                    ensure_rep=ensure_rep,
    - 81                    random_values=random_values,
    - 82                    low=low,
    - 83                    high=high,
    - 84                )
    - 85                X[i] = x
    - 86        else:
    - 87            # Structure parameter passed, building based on structure
    - 88            ix = 0
    - 89            for data in structure:
    - 90                if not isinstance(data[0], (list, np.ndarray)):
    - 91                    # Data in structure is a tuple of (feature index (integer), feature attributes)
    - 92                    feature_ix, feature_attributes = data
    - 93
    - 94                    if ix < feature_ix:
    - 95                        # Filling out the dataset up to column index feature_ix
    - 96                        for i in range(ix, feature_ix):
    - 97                            x = self._generate_feature(
    - 98                                n_samples,
    - 99                                cardinality=cardinality,
    -100                                ensure_rep=ensure_rep,
    -101                                random_values=random_values,
    -102                                low=low,
    -103                                high=high,
    -104                            )
    -105                            X[ix] = x
    -106                            ix += 1
    -107
    -108                    x = self._configure_generate_feature(
    -109                        feature_attributes,
    -110                        n_samples,
    -111                        ensure_rep=ensure_rep,
    -112                        random_values=random_values,
    -113                        low=low,
    -114                        high=high,
    -115                    )
    -116                    X[ix] = x
    -117                    ix += 1
    -118
    -119                else:
    -120                    # Data in structure is a tuple of (list of feature indexes, feature attributes)
    -121                    feature_ixs, feature_attributes = data
    -122
    -123                    for feature_ix in feature_ixs:
    -124                        # Filling out the dataset up to feature_ix
    -125                        if ix < feature_ix:
    -126                            for i in range(ix, feature_ix):
    -127                                x = self._generate_feature(
    -128                                    n_samples,
    -129                                    cardinality=cardinality,
    -130                                    ensure_rep=ensure_rep,
    -131                                    random_values=random_values,
    -132                                    low=low,
    -133                                    high=high,
    -134                                )
    -135                                X[ix] = x
    -136                                ix += 1
    -137
    -138                        x = self._configure_generate_feature(
    -139                            feature_attributes,
    -140                            n_samples,
    -141                            ensure_rep=ensure_rep,
    -142                            random_values=random_values,
    -143                            low=low,
    -144                            high=high,
    -145                        )
    -146
    -147                        X[ix] = x
    -148                        ix += 1
    -149
    -150            if ix < n_features:
    -151                # Fill out the rest of the dataset
    -152                for i in range(ix, n_features):
    -153                    x = self._generate_feature(
    -154                        n_samples,
    -155                        cardinality=cardinality,
    -156                        ensure_rep=ensure_rep,
    -157                        random_values=random_values,
    -158                        low=low,
    -159                        high=high,
    -160                    )
    -161                    X[i] = x
    -162
    -163        return X.T
    + 20    def __init__(self, seed: int = 42):
    + 21        np.random.seed(seed)
    + 22        self.dataset_info = {
    + 23            'general': {},
    + 24            'combinations': [],
    + 25            'correlations': [],
    + 26            'duplicates': [],
    + 27            'labels': {},
    + 28            'noise': [],
    + 29        }
    + 30
    + 31    def __repr__(self):
    + 32        return f"CategoricalClassification(dataset_info={self.dataset_info})"
    + 33
    + 34    def generate_data(
    + 35        self,
    + 36        n_features: int,
    + 37        n_samples: int,
    + 38        cardinality: int = 5,
    + 39        structure: list | ArrayLike | None = None,
    + 40        ensure_rep: bool = False,
    + 41        random_values: bool | None = False,
    + 42        low: int | None = 0,
    + 43        high: int | None = 1000,
    + 44        seed: int = 42,
    + 45    ) -> np.ndarray:
    + 46
    + 47        """
    + 48        Generates dataset based on given parameters
    + 49        :param n_features: number of generated features
    + 50        :param n_samples: number of generated samples
    + 51        :param cardinality: default cardinality of the dataset
    + 52        :param structure: structure of the dataset
    + 53        :param ensure_rep: flag, ensures all given values represented
     + 54        :param random_values: flag, enables random (integer) feature values from the range [low, high]
    + 55        :param low: sets lower bound of random feature values
     + 56        :param high: sets upper bound of random feature values
    + 57        :param seed: sets seed of numpy random
    + 58        :return: X, 2D dataset
    + 59        """
    + 60
    + 61        self.dataset_info.update({
    + 62            'general': {
    + 63                'n_features': n_features,
    + 64                'n_samples': n_samples,
    + 65                'cardinality': cardinality,
    + 66                'structure': structure,
    + 67                'ensure_rep': ensure_rep,
    + 68                'seed': seed,
    + 69            },
    + 70        })
    + 71
    + 72        np.random.seed(seed)
    + 73        X = np.empty([n_features, n_samples])
    + 74
    + 75        # No specific structure parameter passed
    + 76        if structure is None:
    + 77            for i in range(n_features):
    + 78                x = self._generate_feature(
    + 79                    n_samples,
    + 80                    cardinality=cardinality,
    + 81                    ensure_rep=ensure_rep,
    + 82                    random_values=random_values,
    + 83                    low=low,
    + 84                    high=high,
    + 85                )
    + 86                X[i] = x
    + 87        # Structure parameter passed, building based on structure
    + 88        else:
    + 89            ix = 0
    + 90            for data in structure:
    + 91
    + 92                # Data in structure is a tuple of (feature index (integer), feature attributes)
    + 93                if not isinstance(data[0], (list, np.ndarray)):
    + 94                    feature_ix, feature_attributes = data
    + 95
    + 96                    # Filling out the dataset up to column index feature_ix
    + 97                    if ix < feature_ix:
    + 98                        for i in range(ix, feature_ix):
    + 99                            x = self._generate_feature(
    +100                                n_samples,
    +101                                cardinality=cardinality,
    +102                                ensure_rep=ensure_rep,
    +103                                random_values=random_values,
    +104                                low=low,
    +105                                high=high,
    +106                            )
    +107                            X[ix] = x
    +108                            ix += 1
    +109
    +110                    x = self._configure_generate_feature(
    +111                        feature_attributes,
    +112                        n_samples,
    +113                        ensure_rep=ensure_rep,
    +114                        random_values=random_values,
    +115                        low=low,
    +116                        high=high,
    +117                    )
    +118                    X[ix] = x
    +119                    ix += 1
    +120
    +121                # Data in structure is a tuple of (list of feature indexes, feature attributes)
    +122                else:
    +123                    feature_ixs, feature_attributes = data
    +124
    +125                    # Filling out the dataset up to feature_ix
    +126                    for feature_ix in feature_ixs:
    +127                        if ix < feature_ix:
    +128                            for i in range(ix, feature_ix):
    +129                                x = self._generate_feature(
    +130                                    n_samples,
    +131                                    cardinality=cardinality,
    +132                                    ensure_rep=ensure_rep,
    +133                                    random_values=random_values,
    +134                                    low=low,
    +135                                    high=high,
    +136                                )
    +137                                X[ix] = x
    +138                                ix += 1
    +139
    +140                        x = self._configure_generate_feature(
    +141                            feature_attributes,
    +142                            n_samples,
    +143                            ensure_rep=ensure_rep,
    +144                            random_values=random_values,
    +145                            low=low,
    +146                            high=high,
    +147                        )
    +148
    +149                        X[ix] = x
    +150                        ix += 1
    +151
    +152            # Fill out the rest of the dataset
    +153            if ix < n_features:
    +154                for i in range(ix, n_features):
    +155                    x = self._generate_feature(
    +156                        n_samples,
    +157                        cardinality=cardinality,
    +158                        ensure_rep=ensure_rep,
    +159                        random_values=random_values,
    +160                        low=low,
    +161                        high=high,
    +162                    )
    +163                    X[i] = x
     164
    -165    def _configure_generate_feature(
    -166        self,
    -167        feature_attributes: int | list | ArrayLike,
    -168        n_samples: int,
    -169        ensure_rep: bool = False,
    -170        random_values: bool | None = False,
    -171        low: int | None = 0,
    -172        high: int | None = 1000,
    -173    ) -> np.ndarray:
    -174
    -175        """
    -176        Helper function, calls _generate_feature with appropriate parameters based on feature_attributes
    -177        :param feature_attributes: either integer (cardinality) or list of feature attributes
    -178        :param n_samples: number of samples in dataset
    -179        :param ensure_rep: ensures all values are represented at least once in the feature vector
    -180        :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1
    -181        :param low: lower bound of random feature vector values
    -182        :param high: upper bound of random feature vector values
    -183        :return: feature vector
    -184        """
    -185
    -186        if not isinstance(feature_attributes, (list, np.ndarray)):
    -187            # feature_cardinality is just an integer, generate feature either with random values or
    -188            # [low, low+cardinality]
    -189            x = self._generate_feature(
    -190                n_samples,
    -191                cardinality=feature_attributes,
    -192                ensure_rep=ensure_rep,
    -193                random_values=random_values,
    -194                low=low,
    -195                high=high,
    -196            )
    -197        else:
    -198            # feature_cardinality is a list of [value_domain, value_frequencies]
    -199            if isinstance(feature_attributes[0], (list, np.ndarray)):
    -200                value_domain, value_frequencies = feature_attributes
    -201                x = self._generate_feature(
    -202                    n_samples,
    -203                    vec=value_domain,
    -204                    ensure_rep=ensure_rep,
    -205                    p=value_frequencies,
    -206                )
    -207            else:
    -208                # feature_cardinality is value_domain (list of values for feature)
    -209                value_domain = feature_attributes
    -210                x = self._generate_feature(
    -211                    n_samples,
    -212                    vec=value_domain,
    -213                    ensure_rep=ensure_rep,
    -214                )
    -215
    -216        return x
    +165        return X.T
    +166
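
For orientation, a short sketch of the structure forms the rewritten generate_data accepts; the indices and cardinalities are made-up values, and the three tuple variants mirror the branches dispatched above:

    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()
    X = cc.generate_data(
        n_features=6,
        n_samples=1000,
        cardinality=4,                       # default for features not listed below
        structure=[
            (0, 8),                          # feature 0: integer cardinality
            (2, [1, 3, 5, 7]),               # feature 2: explicit value domain
            ([3, 4], [[0, 1], [0.9, 0.1]]),  # features 3 and 4: domain plus frequencies
        ],
    )                                        # X has shape (1000, 6)
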
    +167    def _configure_generate_feature(
    +168        self,
    +169        feature_attributes: int | list | ArrayLike,
    +170        n_samples: int,
    +171        ensure_rep: bool = False,
    +172        random_values: bool | None = False,
    +173        low: int | None = 0,
    +174        high: int | None = 1000,
    +175    ) -> np.ndarray:
    +176
    +177        """
    +178        Helper function, calls _generate_feature with appropriate parameters based on feature_attributes
    +179        :param feature_attributes: either integer (cardinality) or list of feature attributes
    +180        :param n_samples: number of samples in dataset
    +181        :param ensure_rep: ensures all values are represented at least once in the feature vector
     +182        :param random_values: randomly picked values for vec if true, otherwise values range from [low, low + cardinality) in steps of 1
    +183        :param low: lower bound of random feature vector values
    +184        :param high: upper bound of random feature vector values
    +185        :return: feature vector
    +186        """
    +187
    +188        # feature_cardinality is just an integer, generate feature either with random values or
    +189        # [low, low+cardinality]
    +190        if not isinstance(feature_attributes, (list, np.ndarray)):
    +191            x = self._generate_feature(
    +192                n_samples,
    +193                cardinality=feature_attributes,
    +194                ensure_rep=ensure_rep,
    +195                random_values=random_values,
    +196                low=low,
    +197                high=high,
    +198            )
    +199        # feature_cardinality is a list of [value_domain, value_frequencies]
    +200        else:
    +201            if isinstance(feature_attributes[0], (list, np.ndarray)):
    +202                value_domain, value_frequencies = feature_attributes
    +203                x = self._generate_feature(
    +204                    n_samples,
    +205                    vec=value_domain,
    +206                    ensure_rep=ensure_rep,
    +207                    p=value_frequencies,
    +208                )
    +209            # feature_cardinality is value_domain (list of values for feature)
    +210            else:
    +211                value_domain = feature_attributes
    +212                x = self._generate_feature(
    +213                    n_samples,
    +214                    vec=value_domain,
    +215                    ensure_rep=ensure_rep,
    +216                )
     217
    -218    def _generate_feature(
    -219        self,
    -220        size: int,
    -221        vec: list[int] | ArrayLike | None = None,
    -222        cardinality: int = 5,
    -223        ensure_rep: bool = False,
    -224        random_values: bool | None = False,
    -225        low: int | None = 0,
    -226        high: int | None = 1000,
    -227        p: list[float] | np.ndarray | None = None,
    -228    ) -> np.ndarray:
    -229        """
    -230        Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value.
    -231        :param vec: list of feature values
    -232        :param cardinality: single value cardinality
    -233        :param size: length of feature vector
    -234        :param ensure_rep: ensures all values are represented at least once in the feature vector
    -235        :param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1
    -236        :param low: lower bound of random feature vector values
    -237        :param high: upper bound of random feature vector values
    -238        :param p: list of probabilities of each value
    -239        :return: feature vector x
    -240        """
    -241
    -242        if vec is None:
    -243            if random_values:
    -244                vec = np.random.choice(range(low, high + 1), cardinality, replace=False)
    -245            else:
    -246                vec = np.arange(low, low + cardinality, 1)
    -247        else:
    -248            vec = np.array(vec)
    -249
    -250        if p is None:
    -251            v_shift = vec - vec[np.random.randint(len(vec))]
    -252            p = norm.pdf(v_shift, scale=3)
    -253        else:
    -254            p = np.array(p)
    -255
    -256        p = p / p.sum()
    +218        return x
    +219
    +220    def _generate_feature(
    +221        self,
    +222        size: int,
    +223        vec: list[int] | ArrayLike | None = None,
    +224        cardinality: int = 5,
    +225        ensure_rep: bool = False,
    +226        random_values: bool | None = False,
    +227        low: int | None = 0,
    +228        high: int | None = 1000,
    +229        p: list[float] | np.ndarray | None = None,
    +230    ) -> np.ndarray:
    +231        """
    +232        Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value.
    +233        :param vec: list of feature values
    +234        :param cardinality: single value cardinality
    +235        :param size: length of feature vector
    +236        :param ensure_rep: ensures all values are represented at least once in the feature vector
     +237        :param random_values: randomly picked values for vec if true, otherwise values range from [low, low + cardinality) in steps of 1
    +238        :param low: lower bound of random feature vector values
    +239        :param high: upper bound of random feature vector values
    +240        :param p: list of probabilities of each value
    +241        :return: feature vector x
    +242        """
    +243
    +244        if vec is None:
    +245            if random_values:
    +246                vec = np.random.choice(range(low, high + 1), cardinality, replace=False)
    +247            else:
    +248                vec = np.arange(low, low + cardinality, 1)
    +249        else:
    +250            vec = np.array(vec)
    +251
    +252        if p is None:
    +253            v_shift = vec - vec[np.random.randint(len(vec))]
    +254            p = norm.pdf(v_shift, scale=3)
    +255        else:
    +256            p = np.array(p)
     257
    -258        if ensure_rep and len(vec) < size:
    -259            sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p)
    -260            sampled_values = np.append(sampled_values, vec)
    -261        else:
    -262            sampled_values = np.random.choice(vec, size=size, p=p)
    -263
    -264        np.random.shuffle(sampled_values)
    -265        return sampled_values
    -266
    -267    def generate_combinations(
    -268        self,
    -269        X: ArrayLike,
    -270        feature_indices: list[int] | ArrayLike,
    -271        combination_function: Optional = None,
    -272        combination_type: Literal = 'linear',
    -273    ) -> np.ndarray:
    -274        """
    -275        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
    -276        :param X: dataset
    -277        :param feature_indices: indexes of features to be in combination
    -278        :param combination_function: optional custom function for combining feature vectors
    -279        :param combination_type: string flag, either liner or nonlinear, defining combination type
    -280        :return: X with added resultant feature
    -281        """
    -282
    -283        selected_features = X[:, feature_indices]
    +258        p = p / p.sum()
    +259
    +260        if ensure_rep and len(vec) < size:
    +261            sampled_values = np.random.choice(vec, size=(size - len(vec)), p=p)
    +262            sampled_values = np.append(sampled_values, vec)
    +263        else:
    +264            sampled_values = np.random.choice(vec, size=size, p=p)
    +265
    +266        np.random.shuffle(sampled_values)
    +267        return sampled_values
    +268
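
The default distribution used when p is omitted can be previewed on its own; this is a standalone sketch of the same norm.pdf construction, not a call into the class:

    import numpy as np
    from scipy.stats import norm

    vec = np.arange(0, 5)                              # value domain, default branch
    v_shift = vec - vec[np.random.randint(len(vec))]   # centre on a random value
    p = norm.pdf(v_shift, scale=3)                     # approximately normal density
    p = p / p.sum()                                    # normalise to probabilities
    x = np.random.choice(vec, size=100, p=p)           # feature vector of length 100
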
    +269    def generate_combinations(
    +270        self,
    +271        X: ArrayLike,
    +272        feature_indices: list[int] | ArrayLike,
    +273        combination_function: Optional = None,
    +274        combination_type: Literal['linear', 'nonlinear'] = 'linear',
    +275    ) -> np.ndarray:
    +276        """
    +277        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
    +278        :param X: dataset
    +279        :param feature_indices: indexes of features to be in combination
    +280        :param combination_function: optional custom function for combining feature vectors
     +281        :param combination_type: string flag, either linear or nonlinear, defining combination type
    +282        :return: X with added resultant feature
    +283        """
     284
    -285        if combination_function is None:
    -286            if combination_type == 'linear':
    -287                combination_function = lambda x: np.sum(x, axis=1)
    -288            elif combination_type == 'nonlinear':
    -289                combination_function = lambda x: np.sin(np.sum(x, axis=1))
    -290        else:
    -291            combination_type = str(combination_function.__name__)
    -292
    -293        combination_result = combination_function(selected_features)
    +285        selected_features = X[:, feature_indices]
    +286
    +287        if combination_function is None:
    +288            if combination_type == 'linear':
    +289                combination_function = lambda x: np.sum(x, axis=1)
    +290            elif combination_type == 'nonlinear':
    +291                combination_function = lambda x: np.sin(np.sum(x, axis=1))
    +292        else:
    +293            combination_type = str(combination_function.__name__)
     294
    -295        combination_ix = len(X[0])
    +295        combination_result = combination_function(selected_features)
     296
    -297        self.dataset_info['combinations'].append({
    -298            'feature_indices': feature_indices,
    -299            'combination_type': combination_type,
    -300            'combination_ix': combination_ix,
    -301        })
    -302
    -303        return np.column_stack((X, combination_result))
    +297        combination_ix = len(X[0])
    +298
    +299        self.dataset_info['combinations'].append({
    +300            'feature_indices': feature_indices,
    +301            'combination_type': combination_type,
    +302            'combination_ix': combination_ix,
    +303        })
     304
    -305    def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray:
    -306        """
    -307        Performs bitwise XOR operation on two integer arrays
    -308        :param arr: features to perform XOR operation on
    -309        :return: bitwise XOR result
    -310        """
    -311        arrT = arr.T
    -312        arrT = arrT.astype(int)
    -313        out = np.bitwise_xor(arrT[0], arrT[1])
    -314        if len(arrT) > 2:
    -315            for i in range(2, len(arrT)):
    -316                out = np.bitwise_xor(out, arrT[i])
    -317
    -318        return out.T
    +305        return np.column_stack((X, combination_result))
    +306
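
A brief sketch of the three combination modes; passing the class's private _xor helper as the custom callable is purely illustrative:

    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()
    X = cc.generate_data(n_features=3, n_samples=100, cardinality=2)
    X = cc.generate_combinations(X, [0, 1])                                   # linear: row-wise sum
    X = cc.generate_combinations(X, [0, 1, 2], combination_type='nonlinear')  # sin of the sum
    X = cc.generate_combinations(X, [0, 1], combination_function=cc._xor)     # custom callable
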
    +307    def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +308        """
    +309        Performs bitwise XOR operation on two integer arrays
    +310        :param arr: features to perform XOR operation on
    +311        :return: bitwise XOR result
    +312        """
    +313        arrT = arr.T
    +314        arrT = arrT.astype(int)
    +315        out = np.bitwise_xor(arrT[0], arrT[1])
    +316        if len(arrT) > 2:
    +317            for i in range(2, len(arrT)):
    +318                out = np.bitwise_xor(out, arrT[i])
     319
    -320    def _and(self, arr: list[int] | ArrayLike) -> np.ndarray:
    -321        """
    -322        Performs bitwise AND operation on two integer arrays
    -323        :param arr: features to perform AND operation on
    -324        :return: bitwise AND result
    -325        """
    -326        arrT = arr.T
    -327        arrT = arrT.astype(int)
    -328        out = np.bitwise_xor(arrT[0], arrT[1])
    -329        if len(arrT) > 2:
    -330            for i in range(2, len(arrT)):
    -331                out = np.bitwise_and(out, arrT[i])
    -332
    -333        return out.T
    +320        return out.T
    +321
    +322    def _and(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +323        """
    +324        Performs bitwise AND operation on two integer arrays
    +325        :param arr: features to perform AND operation on
    +326        :return: bitwise AND result
    +327        """
    +328        arrT = arr.T
    +329        arrT = arrT.astype(int)
     +330        out = np.bitwise_and(arrT[0], arrT[1])
    +331        if len(arrT) > 2:
    +332            for i in range(2, len(arrT)):
    +333                out = np.bitwise_and(out, arrT[i])
     334
    -335    def _or(self, arr: list[int] | ArrayLike) -> np.ndarray:
    -336        """
    -337        Performs bitwise OR operation on two integer arrays
    -338        :param arr: features to perform OR operation on
    -339        :return: bitwise OR result
    -340        """
    -341        arrT = arr.T
    -342        arrT = arrT.astype(int)
    -343        out = np.bitwise_xor(arrT[0], arrT[1])
    -344        if len(arrT) > 2:
    -345            for i in range(2, len(arrT)):
    -346                out = np.bitwise_or(out, arrT[i])
    -347
    -348        return out.T
    +335        return out.T
    +336
    +337    def _or(self, arr: list[int] | ArrayLike) -> np.ndarray:
    +338        """
    +339        Performs bitwise OR operation on two integer arrays
    +340        :param arr: features to perform OR operation on
    +341        :return: bitwise OR result
    +342        """
    +343        arrT = arr.T
    +344        arrT = arrT.astype(int)
     +345        out = np.bitwise_or(arrT[0], arrT[1])
    +346        if len(arrT) > 2:
    +347            for i in range(2, len(arrT)):
    +348                out = np.bitwise_or(out, arrT[i])
     349
    -350    def generate_correlated(
    -351        self,
    -352        X: ArrayLike,
    -353        feature_indices: list[int] | ArrayLike,
    -354        r: float = 0.8,
    -355    ) -> np.ndarray:
    -356
    -357        """
    -358        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
    -359        :param X: dataset
    -360        :param feature_indices: indices of features to generate correlated feature to
    -361        :param r: (Pearson) correlation factor
    -362        :return: X with generated correlated  features
    -363        """
    -364
    -365        if not isinstance(feature_indices, (list, np.ndarray)):
    -366            feature_indices = np.array([feature_indices])
    -367
    -368        if len(feature_indices) > 1:
    -369            correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
    -370        else:
    -371            correlated_ixs = len(X[0])
    -372
    -373        selected_features = X[:, feature_indices]
    -374        transposed = np.transpose(selected_features)
    -375        correlated_features = []
    -376
    -377        for t in transposed:
    -378            theta = np.arccos(r)
    -379            t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
    -380
    -381            rand = np.random.normal(0, 1, len(t_standard))
    -382            rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
    -383
    -384            M = np.column_stack((t_standard, rand))
    -385            M_centred = (M - np.mean(M, axis=0))
    -386
    -387            Id = np.eye(len(t))
    -388            Q = qr(M_centred[:, [0]], mode='economic')[0]
    -389            P = np.dot(Q, Q.T)
    -390            orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
    -391            M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
    -392
    -393            Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
    -394            corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
    -395
    -396            correlated_features.append(corr)
    +350        return out.T
    +351
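
All three helpers reduce column-wise across the selected features; a tiny standalone check of the XOR variant on a made-up integer array:

    import numpy as np

    arr = np.array([[0, 1], [1, 1], [0, 0]])    # rows are samples, columns are features
    arrT = arr.T.astype(int)
    out = np.bitwise_xor(arrT[0], arrT[1])      # -> array([1, 0, 0])
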
    +352    def generate_correlated(
    +353        self,
    +354        X: ArrayLike,
    +355        feature_indices: list[int] | ArrayLike,
    +356        r: float = 0.8,
    +357    ) -> np.ndarray:
    +358
    +359        """
    +360        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
    +361        :param X: dataset
    +362        :param feature_indices: indices of features to generate correlated feature to
    +363        :param r: (Pearson) correlation factor
     +364        :return: X with generated correlated features
    +365        """
    +366
    +367        if not isinstance(feature_indices, (list, np.ndarray)):
    +368            feature_indices = np.array([feature_indices])
    +369
    +370        if len(feature_indices) > 1:
    +371            correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
    +372        else:
    +373            correlated_ixs = len(X[0])
    +374
    +375        selected_features = X[:, feature_indices]
    +376        transposed = np.transpose(selected_features)
    +377        correlated_features = []
    +378
    +379        for t in transposed:
    +380            theta = np.arccos(r)
    +381            t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
    +382
    +383            rand = np.random.normal(0, 1, len(t_standard))
    +384            rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
    +385
    +386            M = np.column_stack((t_standard, rand))
    +387            M_centred = (M - np.mean(M, axis=0))
    +388
    +389            Id = np.eye(len(t))
    +390            Q = qr(M_centred[:, [0]], mode='economic')[0]
    +391            P = np.dot(Q, Q.T)
    +392            orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
    +393            M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
    +394
    +395            Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
    +396            corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
     397
    -398        correlated_features = np.transpose(correlated_features)
    +398            correlated_features.append(corr)
     399
    -400        self.dataset_info['correlations'].append({
    -401            'feature_indices': feature_indices,
    -402            'correlated_indices': correlated_ixs,
    -403            'correlation_factor': r,
    -404        })
    -405
    -406        return np.column_stack((X, correlated_features))
    +400        correlated_features = np.transpose(correlated_features)
    +401
    +402        self.dataset_info['correlations'].append({
    +403            'feature_indices': feature_indices,
    +404            'correlated_indices': correlated_ixs,
    +405            'correlation_factor': r,
    +406        })
     407
    -408    def generate_duplicates(
    -409        self,
    -410        X: ArrayLike,
    -411        feature_indices: list[int] | ArrayLike,
    -412    ) -> np.ndarray:
    -413        """
    -414        Generates duplicate features
    -415        :param X: dataset
    -416        :param feature_indices: indices of features to duplicate
    -417        :return: dataset with duplicated features
    -418        """
    -419        if not isinstance(feature_indices, (list, np.ndarray)):
    -420            feature_indices = np.array([feature_indices])
    -421
    -422        duplicated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices) - 1), 1)
    +408        return np.column_stack((X, correlated_features))
    +409
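
A quick empirical check of the induced correlation; on a reasonably large sample, corrcoef should land near the requested r (the correlated feature is appended after the existing columns, here at index 3):

    import numpy as np
    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()
    X = cc.generate_data(n_features=3, n_samples=1000, cardinality=5)
    X = cc.generate_correlated(X, [0], r=0.8)
    print(np.corrcoef(X[:, 0], X[:, 3])[0, 1])  # expected to be close to 0.8
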
    +410    def generate_duplicates(
    +411        self,
    +412        X: ArrayLike,
    +413        feature_indices: list[int] | ArrayLike,
    +414    ) -> np.ndarray:
    +415        """
    +416        Generates duplicate features
    +417        :param X: dataset
    +418        :param feature_indices: indices of features to duplicate
    +419        :return: dataset with duplicated features
    +420        """
    +421        if not isinstance(feature_indices, (list, np.ndarray)):
    +422            feature_indices = np.array([feature_indices])
     423
    -424        selected_features = X[:, feature_indices]
    +424        duplicated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices) - 1), 1)
     425
    -426        self.dataset_info['duplicates'].append({
    -427            'feature_indices': feature_indices,
    -428            'duplicate_indices': duplicated_ixs,
    -429        })
    -430
    -431        return np.column_stack((X, selected_features))
    +426        selected_features = X[:, feature_indices]
    +427
    +428        self.dataset_info['duplicates'].append({
    +429            'feature_indices': feature_indices,
    +430            'duplicate_indices': duplicated_ixs,
    +431        })
     432
    -433    def generate_labels(
    -434        self,
    -435        X: ArrayLike,
    -436        n: int = 2,
    -437        p: float | list[float] | ArrayLike = 0.5,
    -438        k: int | float = 2,
    -439        decision_function: Optional = None,
    -440        class_relation: str = 'linear',
    -441        balance: bool = False,
    -442    ):
    -443        """
    -444        Generates labels for dataset X
    -445        :param X: dataset
    -446        :param n: number of class labels
    -447        :param p: class distribution
    -448        :param k: constant
    -449        :param decision_function: optional user-defined decision function
    -450        :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    -451        :param balance: boolean, whether to balance clustering class labels
    -452        :return: array of labels, corresponding to dataset X
    -453        """
    -454
    -455        if isinstance(p, (list, np.ndarray)):
    -456            if sum(p) > 1: raise ValueError('sum of values in must be less than 1.0')
    -457            if len(p) > n: raise ValueError('length of p must equal n')
    +433        return np.column_stack((X, selected_features))
    +434
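
Duplication simply re-appends the selected columns, so the appended column must match its source exactly; a minimal check:

    import numpy as np
    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

    cc = CategoricalClassification()
    X = cc.generate_data(n_features=3, n_samples=100)
    X = cc.generate_duplicates(X, [0])
    assert np.array_equal(X[:, 0], X[:, -1])    # duplicate appended as last column
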
    +435    def generate_labels(
    +436        self,
    +437        X: ArrayLike,
    +438        n: int = 2,
    +439        p: float | list[float] | ArrayLike = 0.5,
    +440        k: int | float = 2,
    +441        decision_function: Optional = None,
    +442        class_relation: Literal['linear', 'nonlinear', 'cluster'] = 'linear',
    +443        balance: bool = False,
    +444        random_state: int = 42,
    +445    ):
    +446        """
    +447        Generates labels for dataset X
    +448        :param X: dataset
    +449        :param n: number of class labels
    +450        :param p: class distribution
     +451        :param k: constant used by the nonlinear decision function
    +452        :param decision_function: optional user-defined decision function
    +453        :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    +454        :param balance: boolean, whether to balance clustering class labels
    +455        :param random_state: seed for KMeans clustering, defaults to 42
    +456        :return: array of labels, corresponding to dataset X
    +457        """
     458
    -459        if p > 1: raise ValueError('p must be less than 1.0')
    -460
    -461        n_samples, n_features = X.shape
    +459        if isinstance(p, (list, np.ndarray)):
     +460            if sum(p) > 1: raise ValueError('sum of values in p must be less than 1.0')
     +461            if len(p) != n: raise ValueError('length of p must equal n')
     462
    -463        if decision_function is None:
    -464            if class_relation == 'linear':
    -465                decision_function = lambda x: np.sum(2 * x + 3, axis=1)
    -466            elif class_relation == 'nonlinear':
    -467                decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
    -468            elif class_relation == 'cluster':
    -469                decision_function = None
    -470        else:
    -471            class_relation = str(decision_function.__name__)
    -472
    -473        y = []
    -474        if decision_function is not None:
    -475            if n > 2:
    -476                if type(p) != list:
    -477                    p = 1 / n
    -478                    percentiles = [p * 100]
    -479                    for i in range(1, n - 1):
    -480                        percentiles.append(percentiles[i - 1] + (p * 100))
    -481
    -482                    decision_boundary = decision_function(X)
    -483                    p_points = np.percentile(decision_boundary, percentiles)
    -484
    -485                    y = np.zeros_like(decision_boundary, dtype=int)
    -486                    for p_point in p_points:
    -487                        y += (decision_boundary > p_point)
    -488                else:
    -489                    decision_boundary = decision_function(X)
    -490                    percentiles = [x * 100 for x in p]
    -491
    -492                    for i in range(1, len(percentiles) - 1):
    -493                        percentiles[i] += percentiles[i - 1]
    -494
    -495                    percentiles.insert(0, 0)
    -496                    percentiles.pop()
    -497                    print(percentiles)
     +463        elif p > 1: raise ValueError('p must be less than 1.0')
    +464
    +465        n_samples, n_features = X.shape
    +466
    +467        if decision_function is None:
    +468            if class_relation == 'linear':
    +469                decision_function = lambda x: np.sum(2 * x + 3, axis=1)
    +470            elif class_relation == 'nonlinear':
    +471                decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
    +472            elif class_relation == 'cluster':
    +473                decision_function = None
    +474        else:
    +475            class_relation = str(decision_function.__name__)
    +476
    +477        y = []
    +478        if decision_function is not None:
    +479            if n > 2:
    +480                if type(p) != list:
    +481                    p = 1 / n
    +482                    percentiles = [p * 100]
    +483                    for i in range(1, n - 1):
    +484                        percentiles.append(percentiles[i - 1] + (p * 100))
    +485
    +486                    decision_boundary = decision_function(X)
    +487                    p_points = np.percentile(decision_boundary, percentiles)
    +488
    +489                    y = np.zeros_like(decision_boundary, dtype=int)
    +490                    for p_point in p_points:
    +491                        y += (decision_boundary > p_point)
    +492                else:
    +493                    decision_boundary = decision_function(X)
    +494                    percentiles = [x * 100 for x in p]
    +495
    +496                    for i in range(1, len(percentiles) - 1):
    +497                        percentiles[i] += percentiles[i - 1]
     498
    -499                    p_points = np.percentile(decision_boundary, percentiles)
    -500                    print(p_points)
    -501
    -502                    y = np.zeros_like(decision_boundary, dtype=int)
    -503                    for i in range(1, n):
    -504                        p_point = p_points[i]
    -505                        for j in range(len(decision_boundary)):
    -506                            if decision_boundary[j] > p_point:
    -507                                y[j] += 1
    -508            else:
    -509                decision_boundary = decision_function(X)
    -510                p_point = np.percentile(decision_boundary, p * 100)
    -511                y = np.where(decision_boundary > p_point, 1, 0)
    -512        else:
    -513            if p == 0.5:
    -514                p = 1.0
    -515            else:
    -516                p = [p, 1 - p]
    -517            y = self._cluster_data(X, n, p=p, balance=balance)
    -518
    -519        self.dataset_info.update({
    -520            'labels': {
    -521                'class_relation': class_relation,
    -522                'n_class': n,
    -523            },
    -524        })
    -525
    -526        return y
    -527
    -528    def _cluster_data(
    -529        self,
    -530        X: ArrayLike,
    -531        n: int,
    -532        p: float | list[float] | ArrayLike | None = 1.0,
    -533        balance: bool = False,
    -534    ) -> np.ndarray:
    -535        """
    -536        Cluster data using kmeans
    -537        :param X: dataset
    -538        :param n: number of clusters
    -539        :param p: class distribution
    -540        :param balance: balance the clusters according to p
    -541        :return: array of labels, corresponding to dataset X
    -542        """
    -543
    -544        kmeans = KMeans(n_clusters=n)
    -545
    -546        kmeans.fit(X)
    -547
    -548        cluster_labels = kmeans.labels_
    +499                    percentiles.insert(0, 0)
    +500                    percentiles.pop()
    +501                    print(percentiles)
    +502
    +503                    p_points = np.percentile(decision_boundary, percentiles)
    +504                    print(p_points)
    +505
    +506                    y = np.zeros_like(decision_boundary, dtype=int)
    +507                    for i in range(1, n):
    +508                        p_point = p_points[i]
    +509                        for j in range(len(decision_boundary)):
    +510                            if decision_boundary[j] > p_point:
    +511                                y[j] += 1
    +512            else:
    +513                decision_boundary = decision_function(X)
    +514                p_point = np.percentile(decision_boundary, p * 100)
    +515                y = np.where(decision_boundary > p_point, 1, 0)
    +516        else:
    +517            if p == 0.5:
    +518                p = 1.0
    +519            else:
    +520                p = [p, 1 - p]
    +521            y = self._cluster_data(X, n, p=p, balance=balance, random_state=random_state)
    +522
    +523        self.dataset_info.update({
    +524            'labels': {
    +525                'class_relation': class_relation,
    +526                'n_class': n,
    +527            },
    +528        })
    +529
    +530        return y
    +531
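For reference, the percentile thresholding above reads as: every percentile cut of the decision boundary increments the label of each sample lying above that cut. A standalone sketch of the uniform case (p = 1/n), outside the class:

import numpy as np

decision_boundary = np.random.normal(size=100)  # stand-in decision values
n = 3
percentiles = [100 * (i + 1) / n for i in range(n - 1)]  # [33.33..., 66.66...]
p_points = np.percentile(decision_boundary, percentiles)

y = np.zeros_like(decision_boundary, dtype=int)
for p_point in p_points:
    y += decision_boundary > p_point  # each crossed cut bumps the class index
# y now holds labels 0..n-1 in roughly equal proportions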
    +532    def _cluster_data(
    +533        self,
    +534        X: ArrayLike,
    +535        n: int,
    +536        p: float | list[float] | ArrayLike | None = 1.0,
    +537        balance: bool = False,
    +538        random_state: int = 42,
    +539    ) -> np.ndarray:
    +540        """
    +541        Cluster data using kmeans
    +542        :param X: dataset
    +543        :param n: number of clusters
    +544        :param p: class distribution
    +545        :param balance: balance the clusters according to p
    +546        :param random_state: seed for KMeans clustering, defaults to 42
    +547        :return: array of labels, corresponding to dataset X
    +548        """
     549
    -550        if not isinstance(p, (list, np.ndarray)):  # Fully balanced clusters
    -551            samples_per_cluster = [len(X) // n] * n
    -552        else:
    -553            samples = len(X)
    -554            samples_per_cluster = []
    -555            if not isinstance(p, (list, np.ndarray)):
    -556                samples_per_cluster.append(int(samples * p) // n)
    -557                samples_per_cluster.append(int(samples * (1 - p)) // n)
    -558            else:
    -559                if len(p) == n:
    -560                    for val in p:
    -561                        samples_per_cluster.append(int(samples * val))
    -562                else:
    -563                    raise Exception('Length of balance parameter must equal number of clusters.')
    -564
    -565        # Adjust cluster sizes
    -566        if balance:
    -567            adjustments = []
    -568            overflow_samples = []
    -569            overflow_indices = []
    -570            for i in range(n):
    -571                cluster_size = np.sum(cluster_labels == i)
    -572
    -573                adjustment = samples_per_cluster[i] - cluster_size
    -574                adjustments.append(adjustment)
    -575
    -576                if adjustment < 0:  # Cluter is too large
    -577
    -578                    centroid = kmeans.cluster_centers_[i]
    -579                    dataset_indices = np.where(cluster_labels == i)[0]  # Indices of samples in dataset
    -580                    cluster_samples = np.copy(X[dataset_indices])
    -581
    -582                    distances = np.linalg.norm(
    -583                        cluster_samples - centroid,
    -584                        axis=1,
    -585                    )  # Distances of cluster samples to cluster centroid
    -586                    cluster_sample_indices = np.argsort(distances)
    -587                    dataset_indices_sorted = dataset_indices[
    -588                        cluster_sample_indices
    -589                    ]  # Indices of samples sorted by sample distance to cluster centroid
    -590
    -591                    overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:]  # Overflow samples
    -592                    dataset_indices_sorted = dataset_indices_sorted[
    -593                                             samples_per_cluster[i]:
    -594                    ]  # Dataset indices of overflow samples
    -595
    -596                    for i in range(len(overflow_sample_indices)):
    -597                        overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
    -598                        overflow_indices.append(dataset_indices_sorted[i])
    -599
    -600            overflow_samples = np.array(overflow_samples)
    -601            overflow_indices = np.array(overflow_indices)
    -602
    -603            # Making adjustments
    -604            for i in range(n):
    -605
    -606                if adjustments[i] > 0:
    -607                    centroid = kmeans.cluster_centers_[i]
    -608                    distances = np.linalg.norm(overflow_samples - centroid, axis=1)
    -609
    -610                    closest_sample_indices = np.argsort(distances)
    -611
    -612                    overflow_indices_sorted = overflow_indices[closest_sample_indices]
    +550        kmeans = KMeans(n_clusters=n, random_state=random_state)
    +551
    +552        kmeans.fit(X)
    +553
    +554        cluster_labels = kmeans.labels_
    +555
    +556        # Fully balanced clusters
    +557        if not isinstance(p, (list, np.ndarray)):
    +558            samples_per_cluster = [len(X) // n] * n
    +559        else:
    +560            # p is a list/ndarray of per-cluster proportions
    +561            samples = len(X)
    +562            samples_per_cluster = []
    +563            if len(p) == n:
    +564                for val in p:
    +565                    samples_per_cluster.append(int(samples * val))
    +566            else:
    +567                raise ValueError(
    +568                    'Length of balance parameter must equal number of clusters.',
    +569                )
    +570
    +571
    +572        # Adjust cluster sizes
    +573        if balance:
    +574            adjustments = []
    +575            overflow_samples = []
    +576            overflow_indices = []
    +577            for i in range(n):
    +578                cluster_size = np.sum(cluster_labels == i)
    +579
    +580                adjustment = samples_per_cluster[i] - cluster_size
    +581                adjustments.append(adjustment)
    +582
    +583                # Cluster is too large
    +584                if adjustment < 0:
    +585                    centroid = kmeans.cluster_centers_[i]
    +586                    # Indices of samples in dataset
    +587                    dataset_indices = np.where(cluster_labels == i)[0]
    +588                    cluster_samples = np.copy(X[dataset_indices])
    +589
    +590                    distances = np.linalg.norm(
    +591                        cluster_samples - centroid,
    +592                        axis=1,
    +593                    )  # Distances of cluster samples to cluster centroid
    +594                    cluster_sample_indices = np.argsort(distances)
    +595                    dataset_indices_sorted = dataset_indices[
    +596                        cluster_sample_indices
    +597                    ]  # Indices of samples sorted by sample distance to cluster centroid
    +598
    +599                    overflow_sample_indices = cluster_sample_indices[samples_per_cluster[i]:]  # Overflow samples
    +600                    dataset_indices_sorted = dataset_indices_sorted[
    +601                                             samples_per_cluster[i]:
    +602                    ]  # Dataset indices of overflow samples
    +603
    +604                    for i in range(len(overflow_sample_indices)):
    +605                        overflow_samples.append(cluster_samples[overflow_sample_indices[i]])
    +606                        overflow_indices.append(dataset_indices_sorted[i])
    +607
    +608            overflow_samples = np.array(overflow_samples)
    +609            overflow_indices = np.array(overflow_indices)
    +610
    +611            # Making adjustments
    +612            for i in range(n):
     613
    -614                    sample_indices_slice = closest_sample_indices[:adjustments[i]]
    -615                    overflow_indices_slice = overflow_indices_sorted[:adjustments[i]]
    -616
    -617                    cluster_labels[overflow_indices_slice] = i
    -618
    -619                    overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0)
    -620                    overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0)
    +614                if adjustments[i] > 0:
    +615                    centroid = kmeans.cluster_centers_[i]
    +616                    distances = np.linalg.norm(overflow_samples - centroid, axis=1)
    +617
    +618                    closest_sample_indices = np.argsort(distances)
    +619
    +620                    overflow_indices_sorted = overflow_indices[closest_sample_indices]
     621
    -622        return np.array(cluster_labels)
    -623
    -624    def generate_noise(
    -625        self,
    -626        X: ArrayLike,
    -627        y: list[int] | ArrayLike,
    -628        p: float = 0.2,
    -629        type: Literal = 'categorical',
    -630        missing_val: str | int | float = float('-inf'),
    -631    ) -> np.ndarray:
    -632
    -633        """
    -634        Simulates noise on given dataset X
    -635        :param X: dataset to apply noise to
    -636        :param y: required target labels for categorical noise generation
    -637        :param p: amount of noise to apply. Defaults to 0.2
    -638        :param type: type of noise to apply, either categorical or missing
    -639        :param missing_val: value to simulate missing values. Defaults to float('-inf')
    -640        :return: X with noise applied
    -641        """
    -642
    -643        self.dataset_info['noise'].append({
    -644            'type': type,
    -645            'amount': p,
    -646        })
    -647
    -648        if type == 'categorical':
    -649            label_values, label_count = np.unique(y, return_counts=True)
    -650            n_labels = len(label_values)
    -651
    -652            inds = y.argsort()
    -653            y_sort = y[inds]
    -654            X_sort = X[inds]
    +622                    sample_indices_slice = closest_sample_indices[:adjustments[i]]
    +623                    overflow_indices_slice = overflow_indices_sorted[:adjustments[i]]
    +624
    +625                    cluster_labels[overflow_indices_slice] = i
    +626
    +627                    overflow_samples = np.delete(overflow_samples, sample_indices_slice, axis=0)
    +628                    overflow_indices = np.delete(overflow_indices, sample_indices_slice, axis=0)
    +629
    +630        return np.array(cluster_labels)
    +631
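A minimal usage sketch of the cluster-based labelling path; the public entry point is generate_labels with class_relation='cluster' (module path as introduced by this patch):

from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=5, n_samples=300, cardinality=4)
# balance=True trims oversized clusters and reassigns overflow samples to the
# nearest undersized centroids; random_state is forwarded to KMeans.
y = cc.generate_labels(X, n=3, class_relation='cluster', balance=True, random_state=42)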
    +632    def generate_noise(
    +633        self,
    +634        X: ArrayLike,
    +635        y: list[int] | ArrayLike,
    +636        p: float = 0.2,
    +637        type: Literal['categorical', 'missing'] = 'categorical',
    +638        missing_val: str | int | float = float('-inf'),
    +639    ) -> np.ndarray:
    +640
    +641        """
    +642        Simulates noise on given dataset X
    +643        :param X: dataset to apply noise to
    +644        :param y: required target labels for categorical noise generation
    +645        :param p: amount of noise to apply. Defaults to 0.2
    +646        :param type: type of noise to apply, either categorical or missing
    +647        :param missing_val: value to simulate missing values. Defaults to float('-inf')
    +648        :return: X with noise applied
    +649        """
    +650
    +651        self.dataset_info['noise'].append({
    +652            'type': type,
    +653            'amount': p,
    +654        })
     655
    -656            Xs_T = X_sort.T
    -657            n = Xs_T.shape[1]
    -658            n_flip = int(n * p)
    +656        if type == 'categorical':
    +657            label_values, label_count = np.unique(y, return_counts=True)
    +658            n_labels = len(label_values)
     659
    -660            for feature in Xs_T:
    -661                unique_per_label = {}
    -662
    -663                for i in range(n_labels):
    -664                    if i == 0:
    -665                        unique = np.unique(feature[:label_count[i]])
    -666                        unique_per_label[label_values[i]] = set(unique)
    -667                    else:
    -668                        unique = np.unique(feature[label_count[i - 1]:label_count[i - 1] + label_count[i] - 1])
    -669                        unique_per_label[label_values[i]] = set(unique)
    +660            inds = y.argsort()
    +661            y_sort = y[inds]
    +662            X_sort = X[inds]
    +663
    +664            Xs_T = X_sort.T
    +665            n = Xs_T.shape[1]
    +666            n_flip = int(n * p)
    +667
    +668            for feature in Xs_T:
    +669                unique_per_label = {}
     670
    -671                ixs = np.random.choice(n, n_flip, replace=False)
    -672
    -673                for ix in ixs:
    -674                    current_label = y_sort[ix]
    -675                    possible_labels = np.where(label_values != current_label)[0]
    -676
    -677                    # find all unique values from labels != current label
    -678                    values = set()
    -679                    for key in possible_labels:
    -680                        values = values.union(unique_per_label[key])
    -681
    -682                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
    -683                    # current label
    -684                    for val in unique_per_label[current_label] & values:
    -685                        values.remove(val)
    -686
    -687                    if len(values) > 0:
    -688                        val = np.random.choice(list(values))
    +671                start = 0
    +672                for i in range(n_labels):
    +673                    # rows [start, start + count) belong to label_values[i] in the sorted arrays
    +674                    unique = np.unique(feature[start:start + label_count[i]])
    +675                    unique_per_label[label_values[i]] = set(unique)
    +676                    start += label_count[i]
    +677
    +678
    +679                ixs = np.random.choice(n, n_flip, replace=False)
    +680
    +681                for ix in ixs:
    +682                    current_label = y_sort[ix]
    +683                    possible_labels = np.where(label_values != current_label)[0]
    +684
    +685                    # find all unique values from labels != current label
    +686                    values = set()
    +687                    for key in possible_labels:
    +688                        values = values.union(unique_per_label[key])
     689
    -690                    else:
    -691                        key = possible_labels[np.random.randint(len(possible_labels))]
    -692                        values = unique_per_label[key]
    -693                        val = np.random.choice(list(values))
    +690                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
    +691                    # current label
    +692                    for val in unique_per_label[current_label] & values:
    +693                        values.remove(val)
     694
    -695                    feature[ix] = val
    -696
    -697            rev_ind = inds.argsort()
    -698            X_noise = Xs_T.T
    -699            X_noise = X_noise[rev_ind]
    -700
    -701            return X_noise
    +695                    if len(values) > 0:
    +696                        val = np.random.choice(list(values))
    +697
    +698                    else:
    +699                        key = possible_labels[np.random.randint(len(possible_labels))]
    +700                        values = unique_per_label[key]
    +701                        val = np.random.choice(list(values))
     702
    -703        elif type == 'missing':
    -704            X_noise = np.copy(X)
    -705            Xn_T = X_noise.T
    -706            n = Xn_T.shape[1]
    -707            n_missing = int(n * p)
    -708            #print("n to delete:", n_missing)
    -709
    -710            for feature in Xn_T:
    -711                ixs = np.random.choice(n, n_missing, replace=False)
    -712
    -713                for ix in ixs:
    -714                    feature[ix] = missing_val
    -715
    -716            return Xn_T.T
    +703                    feature[ix] = val
    +704
    +705            rev_ind = inds.argsort()
    +706            X_noise = Xs_T.T
    +707            X_noise = X_noise[rev_ind]
    +708
    +709            return X_noise
    +710
    +711        elif type == 'missing':
    +712            X_noise = np.copy(X)
    +713            Xn_T = X_noise.T
    +714            n = Xn_T.shape[1]
    +715            n_missing = int(n * p)
    +716
     717
    -718    def downsample_dataset(
    -719        self,
    -720        X: ArrayLike,
    -721        y: list[int] | ArrayLike,
    -722        N: int | None = None,
    -723        seed: int = 42,
    -724        reshuffle: bool = False,
    -725    ) -> tuple[np.ndarray, np.ndarray]:
    -726
    -727        """
    -728        Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
    -729        :param X: Dataset to downsample
    -730        :param y: Labels corresponding to X
    -731        :param N: Optional number of samples per class to downsample to
    -732        :param seed: Seed for random state of resample function
    -733        :param reshuffle: Reshuffle the dataset after downsampling
    -734        :return: Balanced X and y after downsampling
    -735        """
    -736
    -737        original_shape = X.shape
    -738
    -739        values, counts = np.unique(y, return_counts=True)
    -740        if N is None:
    -741            N = min(counts)
    -742
    -743        if N > min(counts):
    -744            raise ValueError('N must be equal to or less than the number of samples in minority class')
    -745
    -746        X_arrays_list = []
    -747        y_downsampled = []
    -748        for label in values:
    -749            X_label = [X[i] for i in range(len(y)) if y[i] == label]
    -750            X_label_downsample = resample(
    -751                X_label,
    -752                replace=True,
    -753                n_samples=N,
    -754                random_state=seed,
    -755            )
    -756            X_arrays_list.append(X_label_downsample)
    -757            ys = [label] * N
    -758            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
    -759
    -760        X_downsampled = np.concatenate(X_arrays_list, axis=0)
    -761
    -762        if reshuffle:
    -763            indices = np.arange(len(X_downsampled))
    -764            np.random.shuffle(indices)
    -765            X_downsampled = X_downsampled[indices]
    -766            y_downsampled = y_downsampled[indices]
    -767
    -768        downsampled_shape = X_downsampled.shape
    -769
    -770        self.dataset_info.update({
    -771            'downsampling': {
    -772                'original_shape': original_shape,
    -773                'downsampled_shape': downsampled_shape,
    -774            },
    -775        })
    -776
    -777        return X_downsampled, y_downsampled
    +718            for feature in Xn_T:
    +719                ixs = np.random.choice(n, n_missing, replace=False)
    +720
    +721                for ix in ixs:
    +722                    feature[ix] = missing_val
    +723
    +724            return Xn_T.T
    +725
    +726        else:
    +727            raise ValueError(f'Type {type} not supported')
    +728
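A usage sketch covering both noise types (values illustrative):

from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=200, cardinality=3)
y = cc.generate_labels(X, n=2)
# Flip ~20% of values per feature to values observed only under other labels:
X_flipped = cc.generate_noise(X, y, p=0.2, type='categorical')
# Or blank out ~10% of values per feature with a sentinel:
X_missing = cc.generate_noise(X, y, p=0.1, type='missing', missing_val=float('-inf'))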
    +729    def downsample_dataset(
    +730        self,
    +731        X: ArrayLike,
    +732        y: list[int] | ArrayLike,
    +733        N: int | None = None,
    +734        seed: int = 42,
    +735        reshuffle: bool = False,
    +736    ) -> tuple[np.ndarray, np.ndarray]:
    +737
    +738        """
    +739        Downsamples dataset X according to N or the number of samples in the minority class, resulting in a balanced dataset.
    +740        :param X: Dataset to downsample
    +741        :param y: Labels corresponding to X
    +742        :param N: Optional number of samples per class to downsample to
    +743        :param seed: Seed for random state of resample function
    +744        :param reshuffle: Reshuffle the dataset after downsampling
    +745        :return: Balanced X and y after downsampling
    +746        """
    +747
    +748        original_shape = X.shape
    +749
    +750        values, counts = np.unique(y, return_counts=True)
    +751        if N is None:
    +752            N = min(counts)
    +753
    +754        if N > min(counts):
    +755            raise ValueError('N must be equal to or less than the number of samples in the minority class')
    +756
    +757        X_arrays_list = []
    +758        y_downsampled = []
    +759        for label in values:
    +760            X_label = [X[i] for i in range(len(y)) if y[i] == label]
    +761            X_label_downsample = resample(
    +762                X_label,
    +763                replace=True,
    +764                n_samples=N,
    +765                random_state=seed,
    +766            )
    +767            X_arrays_list.append(X_label_downsample)
    +768            ys = [label] * N
    +769            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
    +770
    +771        X_downsampled = np.concatenate(X_arrays_list, axis=0)
    +772
    +773        if reshuffle:
    +774            indices = np.arange(len(X_downsampled))
    +775            np.random.shuffle(indices)
    +776            X_downsampled = X_downsampled[indices]
    +777            y_downsampled = y_downsampled[indices]
     778
    -779    def print_dataset(
    -780        self,
    -781        X: ArrayLike,
    -782        y: ArrayLike,
    -783    ):
    -784        """
    -785        Prints given dataset
    -786        :param X: dataset
    -787        :param y: labels
    -788        :return:
    -789        """
    -790
    -791        n_samples, n_features = X.shape
    -792        n = 0
    -793        for arr in X:
    -794            print('[', end='')
    -795            for i in range(n_features):
    -796                if i == n_features - 1:
    -797                    print(arr[i], end='')
    -798                else:
    -799                    print(arr[i], end=', ')
    -800            print(f'], Label: {y[n]}')
    -801            n += 1
    -802
    -803    def summarize(self):
    -804
    -805        print(f"Number of features: {self.dataset_info['general']['n_features']}")
    -806        print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
    -807        if self.dataset_info['downsampling']:
    -808            print(
    -809                f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
    -810            )
    -811        print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
    -812        print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
    +779        downsampled_shape = X_downsampled.shape
    +780
    +781        self.dataset_info.update({
    +782            'downsampling': {
    +783                'original_shape': original_shape,
    +784                'downsampled_shape': downsampled_shape,
    +785            },
    +786        })
    +787
    +788        return X_downsampled, y_downsampled
    +789
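A sketch of rebalancing an imbalanced labelling; N=None downsamples every class to the minority-class count:

import numpy as np
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=500, cardinality=3)
y = cc.generate_labels(X, n=2, p=0.3)  # roughly a 30/70 split
X_bal, y_bal = cc.downsample_dataset(X, y, reshuffle=True)
print(np.unique(y_bal, return_counts=True))  # equal counts per class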
    +790    def print_dataset(
    +791        self,
    +792        X: ArrayLike,
    +793        y: ArrayLike,
    +794    ):
    +795        """
    +796        Prints given dataset
    +797        :param X: dataset
    +798        :param y: labels
    +799        :return:
    +800        """
    +801
    +802        n_samples, n_features = X.shape
    +803        n = 0
    +804        for arr in X:
    +805            print('[', end='')
    +806            for i in range(n_features):
    +807                if i == n_features - 1:
    +808                    print(arr[i], end='')
    +809                else:
    +810                    print(arr[i], end=', ')
    +811            print(f'], Label: {y[n]}')
    +812            n += 1
     813
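For a quick visual check of a small generated dataset (sketch):

from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=3, n_samples=5, cardinality=2)
y = cc.generate_labels(X)
cc.print_dataset(X, y)  # one "[v0, v1, v2], Label: l" line per sample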
    -814        print('-------------------------------------')
    -815
    -816        if len(self.dataset_info['combinations']) > 0:
    -817            print('Combinations:')
    -818            for comb in self.dataset_info['combinations']:
    -819                print(
    -820                    f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}",
    -821                )
    -822            print('-------------------------------------')
    -823
    -824        if len(self.dataset_info['correlations']) > 0:
    -825            print('Correlations:')
    -826            for corr in self.dataset_info['correlations']:
    -827                print(
    -828                    f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
    -829                )
    -830            print('-------------------------------------')
    -831
    -832        if len(self.dataset_info['duplicates']) > 0:
    -833            print('Duplicates:')
    -834            for dup in self.dataset_info['duplicates']:
    -835                print(
    -836                    f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
    -837                )
    -838            print('-------------------------------------')
    -839
    -840        if len(self.dataset_info['noise']) > 0:
    -841            print('Simulated noise:')
    -842            for noise in self.dataset_info['noise']:
    -843                print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}")
    -844            print('-------------------------------------')
    -845
    -846        print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
    +814    """
    +815    def summarize(self):
    +816        # TODO: Logging function
    +817    """
class CategoricalClassification(seed: int = 42)
    20    def __init__(self, seed: int = 42):
    +21        np.random.seed(seed)
    +22        self.dataset_info = {
    +23            'general': {},
    +24            'combinations': [],
    +25            'correlations': [],
    +26            'duplicates': [],
    +27            'labels': {},
    +28            'noise': [],
    +29        }
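The constructor now seeds NumPy's global RNG once, so helpers that draw from np.random without reseeding (e.g. generate_correlated, generate_noise) become reproducible across runs; note that generate_data still reseeds with its own seed parameter. A sketch:

from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification(seed=7)  # np.random.seed(7) happens here
X = cc.generate_data(n_features=3, n_samples=50, seed=42)  # reseeds internally
X = cc.generate_correlated(X, [0], r=0.9)  # draws depend on the global RNG state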
     
@@ -1803,137 +1771,138 @@
     33    def generate_data(
    - 34        self,
    - 35        n_features: int,
    - 36        n_samples: int,
    - 37        cardinality: int = 5,
    - 38        structure: list | ArrayLike | None = None,
    - 39        ensure_rep: bool = False,
    - 40        random_values: bool | None = False,
    - 41        low: int | None = 0,
    - 42        high: int | None = 1000,
    - 43        seed: int = 42,
    - 44    ) -> np.ndarray:
    - 45
    - 46        """
    - 47        Generates dataset based on given parameters
    - 48        :param n_features: number of generated features
    - 49        :param n_samples: number of generated samples
    - 50        :param cardinality: default cardinality of the dataset
    - 51        :param structure: structure of the dataset
    - 52        :param ensure_rep: flag, ensures all given values represented
    - 53        :param random_values: flag, enables random (integer) feature values from set [low, high]
    - 54        :param low: sets lower bound of random feature values
    - 55        :param high: sets high bound of random feature values
    - 56        :param seed: sets seed of numpy random
    - 57        :return: X, 2D dataset
    - 58        """
    - 59
    - 60        self.dataset_info.update({
    - 61            'general': {
    - 62                'n_features': n_features,
    - 63                'n_samples': n_samples,
    - 64                'cardinality': cardinality,
    - 65                'structure': structure,
    - 66                'ensure_rep': ensure_rep,
    - 67                'seed': seed,
    - 68            },
    - 69        })
    - 70
    - 71        np.random.seed(seed)
    - 72        X = np.empty([n_features, n_samples])
    - 73
    - 74        if structure is None:
    - 75            # No specific structure parameter passed
    - 76            for i in range(n_features):
    - 77                x = self._generate_feature(
    - 78                    n_samples,
    - 79                    cardinality=cardinality,
    - 80                    ensure_rep=ensure_rep,
    - 81                    random_values=random_values,
    - 82                    low=low,
    - 83                    high=high,
    - 84                )
    - 85                X[i] = x
    - 86        else:
    - 87            # Structure parameter passed, building based on structure
    - 88            ix = 0
    - 89            for data in structure:
    - 90                if not isinstance(data[0], (list, np.ndarray)):
    - 91                    # Data in structure is a tuple of (feature index (integer), feature attributes)
    - 92                    feature_ix, feature_attributes = data
    - 93
    - 94                    if ix < feature_ix:
    - 95                        # Filling out the dataset up to column index feature_ix
    - 96                        for i in range(ix, feature_ix):
    - 97                            x = self._generate_feature(
    - 98                                n_samples,
    - 99                                cardinality=cardinality,
    -100                                ensure_rep=ensure_rep,
    -101                                random_values=random_values,
    -102                                low=low,
    -103                                high=high,
    -104                            )
    -105                            X[ix] = x
    -106                            ix += 1
    -107
    -108                    x = self._configure_generate_feature(
    -109                        feature_attributes,
    -110                        n_samples,
    -111                        ensure_rep=ensure_rep,
    -112                        random_values=random_values,
    -113                        low=low,
    -114                        high=high,
    -115                    )
    -116                    X[ix] = x
    -117                    ix += 1
    -118
    -119                else:
    -120                    # Data in structure is a tuple of (list of feature indexes, feature attributes)
    -121                    feature_ixs, feature_attributes = data
    -122
    -123                    for feature_ix in feature_ixs:
    -124                        # Filling out the dataset up to feature_ix
    -125                        if ix < feature_ix:
    -126                            for i in range(ix, feature_ix):
    -127                                x = self._generate_feature(
    -128                                    n_samples,
    -129                                    cardinality=cardinality,
    -130                                    ensure_rep=ensure_rep,
    -131                                    random_values=random_values,
    -132                                    low=low,
    -133                                    high=high,
    -134                                )
    -135                                X[ix] = x
    -136                                ix += 1
    -137
    -138                        x = self._configure_generate_feature(
    -139                            feature_attributes,
    -140                            n_samples,
    -141                            ensure_rep=ensure_rep,
    -142                            random_values=random_values,
    -143                            low=low,
    -144                            high=high,
    -145                        )
    -146
    -147                        X[ix] = x
    -148                        ix += 1
    -149
    -150            if ix < n_features:
    -151                # Fill out the rest of the dataset
    -152                for i in range(ix, n_features):
    -153                    x = self._generate_feature(
    -154                        n_samples,
    -155                        cardinality=cardinality,
    -156                        ensure_rep=ensure_rep,
    -157                        random_values=random_values,
    -158                        low=low,
    -159                        high=high,
    -160                    )
    -161                    X[i] = x
    -162
    -163        return X.T
     34    def generate_data(
    + 35        self,
    + 36        n_features: int,
    + 37        n_samples: int,
    + 38        cardinality: int = 5,
    + 39        structure: list | ArrayLike | None = None,
    + 40        ensure_rep: bool = False,
    + 41        random_values: bool | None = False,
    + 42        low: int | None = 0,
    + 43        high: int | None = 1000,
    + 44        seed: int = 42,
    + 45    ) -> np.ndarray:
    + 46
    + 47        """
    + 48        Generates dataset based on given parameters
    + 49        :param n_features: number of generated features
    + 50        :param n_samples: number of generated samples
    + 51        :param cardinality: default cardinality of the dataset
    + 52        :param structure: structure of the dataset
    + 53        :param ensure_rep: flag, ensures all given values are represented
    + 54        :param random_values: flag, enables random (integer) feature values from the range [low, high]
    + 55        :param low: sets lower bound of random feature values
    + 56        :param high: sets upper bound of random feature values
    + 57        :param seed: sets seed of numpy random
    + 58        :return: X, 2D dataset
    + 59        """
    + 60
    + 61        self.dataset_info.update({
    + 62            'general': {
    + 63                'n_features': n_features,
    + 64                'n_samples': n_samples,
    + 65                'cardinality': cardinality,
    + 66                'structure': structure,
    + 67                'ensure_rep': ensure_rep,
    + 68                'seed': seed,
    + 69            },
    + 70        })
    + 71
    + 72        np.random.seed(seed)
    + 73        X = np.empty([n_features, n_samples])
    + 74
    + 75        # No specific structure parameter passed
    + 76        if structure is None:
    + 77            for i in range(n_features):
    + 78                x = self._generate_feature(
    + 79                    n_samples,
    + 80                    cardinality=cardinality,
    + 81                    ensure_rep=ensure_rep,
    + 82                    random_values=random_values,
    + 83                    low=low,
    + 84                    high=high,
    + 85                )
    + 86                X[i] = x
    + 87        # Structure parameter passed, building based on structure
    + 88        else:
    + 89            ix = 0
    + 90            for data in structure:
    + 91
    + 92                # Data in structure is a tuple of (feature index (integer), feature attributes)
    + 93                if not isinstance(data[0], (list, np.ndarray)):
    + 94                    feature_ix, feature_attributes = data
    + 95
    + 96                    # Filling out the dataset up to column index feature_ix
    + 97                    if ix < feature_ix:
    + 98                        for i in range(ix, feature_ix):
    + 99                            x = self._generate_feature(
    +100                                n_samples,
    +101                                cardinality=cardinality,
    +102                                ensure_rep=ensure_rep,
    +103                                random_values=random_values,
    +104                                low=low,
    +105                                high=high,
    +106                            )
    +107                            X[ix] = x
    +108                            ix += 1
    +109
    +110                    x = self._configure_generate_feature(
    +111                        feature_attributes,
    +112                        n_samples,
    +113                        ensure_rep=ensure_rep,
    +114                        random_values=random_values,
    +115                        low=low,
    +116                        high=high,
    +117                    )
    +118                    X[ix] = x
    +119                    ix += 1
    +120
    +121                # Data in structure is a tuple of (list of feature indexes, feature attributes)
    +122                else:
    +123                    feature_ixs, feature_attributes = data
    +124
    +125                    # Filling out the dataset up to feature_ix
    +126                    for feature_ix in feature_ixs:
    +127                        if ix < feature_ix:
    +128                            for i in range(ix, feature_ix):
    +129                                x = self._generate_feature(
    +130                                    n_samples,
    +131                                    cardinality=cardinality,
    +132                                    ensure_rep=ensure_rep,
    +133                                    random_values=random_values,
    +134                                    low=low,
    +135                                    high=high,
    +136                                )
    +137                                X[ix] = x
    +138                                ix += 1
    +139
    +140                        x = self._configure_generate_feature(
    +141                            feature_attributes,
    +142                            n_samples,
    +143                            ensure_rep=ensure_rep,
    +144                            random_values=random_values,
    +145                            low=low,
    +146                            high=high,
    +147                        )
    +148
    +149                        X[ix] = x
    +150                        ix += 1
    +151
    +152            # Fill out the rest of the dataset
    +153            if ix < n_features:
    +154                for i in range(ix, n_features):
    +155                    x = self._generate_feature(
    +156                        n_samples,
    +157                        cardinality=cardinality,
    +158                        ensure_rep=ensure_rep,
    +159                        random_values=random_values,
    +160                        low=low,
    +161                        high=high,
    +162                    )
    +163                    X[i] = x
    +164
    +165        return X.T
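A usage sketch of the structure parameter as handled above. Each entry pairs a feature index (or a list of indices) with feature attributes: an integer cardinality, an explicit value domain, or a (value domain, frequencies) pair; unlisted columns fall back to the default cardinality (attribute handling is delegated to _configure_generate_feature):

from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(
    n_features=6,
    n_samples=1000,
    cardinality=3,  # default for columns not named in structure
    structure=[
        (0, 8),                                  # column 0: cardinality 8
        (2, [1, 3, 5, 7]),                       # column 2: explicit value domain
        ([3, 4], ([0, 1, 2], [0.7, 0.2, 0.1])),  # columns 3-4: values with frequencies
    ],
)
print(X.shape)  # (1000, 6)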
     
@@ -1967,49 +1936,49 @@
generate_combinations: signature updated, combination_type: Literal = 'linear' is now combination_type: Literal['linear', 'nonlinear'] = 'linear'
    267    def generate_combinations(
    -268        self,
    -269        X: ArrayLike,
    -270        feature_indices: list[int] | ArrayLike,
    -271        combination_function: Optional = None,
    -272        combination_type: Literal = 'linear',
    -273    ) -> np.ndarray:
    -274        """
    -275        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
    -276        :param X: dataset
    -277        :param feature_indices: indexes of features to be in combination
    -278        :param combination_function: optional custom function for combining feature vectors
    -279        :param combination_type: string flag, either liner or nonlinear, defining combination type
    -280        :return: X with added resultant feature
    -281        """
    -282
    -283        selected_features = X[:, feature_indices]
    269    def generate_combinations(
    +270        self,
    +271        X: ArrayLike,
    +272        feature_indices: list[int] | ArrayLike,
    +273        combination_function: Optional = None,
    +274        combination_type: Literal['linear', 'nonlinear'] = 'linear',
    +275    ) -> np.ndarray:
    +276        """
    +277        Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
    +278        :param X: dataset
    +279        :param feature_indices: indexes of features to be in combination
    +280        :param combination_function: optional custom function for combining feature vectors
    +281        :param combination_type: string flag, either linear or nonlinear, defining combination type
    +282        :return: X with added resultant feature
    +283        """
     284
    -285        if combination_function is None:
    -286            if combination_type == 'linear':
    -287                combination_function = lambda x: np.sum(x, axis=1)
    -288            elif combination_type == 'nonlinear':
    -289                combination_function = lambda x: np.sin(np.sum(x, axis=1))
    -290        else:
    -291            combination_type = str(combination_function.__name__)
    -292
    -293        combination_result = combination_function(selected_features)
    +285        selected_features = X[:, feature_indices]
    +286
    +287        if combination_function is None:
    +288            if combination_type == 'linear':
    +289                combination_function = lambda x: np.sum(x, axis=1)
    +290            elif combination_type == 'nonlinear':
    +291                combination_function = lambda x: np.sin(np.sum(x, axis=1))
    +292        else:
    +293            combination_type = str(combination_function.__name__)
     294
    -295        combination_ix = len(X[0])
    +295        combination_result = combination_function(selected_features)
     296
    -297        self.dataset_info['combinations'].append({
    -298            'feature_indices': feature_indices,
    -299            'combination_type': combination_type,
    -300            'combination_ix': combination_ix,
    -301        })
    -302
    -303        return np.column_stack((X, combination_result))
    +297        combination_ix = len(X[0])
    +298
    +299        self.dataset_info['combinations'].append({
    +300            'feature_indices': feature_indices,
    +301            'combination_type': combination_type,
    +302            'combination_ix': combination_ix,
    +303        })
    +304
    +305        return np.column_stack((X, combination_result))
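Besides the built-in 'linear' and 'nonlinear' combinations, a custom function over the selected columns can be supplied; its __name__ is then recorded as the combination type. A sketch with a hypothetical xor_like combiner:

import numpy as np
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=100)

def xor_like(cols: np.ndarray) -> np.ndarray:
    # cols has shape (n_samples, len(feature_indices))
    return np.bitwise_xor.reduce(cols.astype(int), axis=1)

X = cc.generate_combinations(X, [0, 1], combination_function=xor_like)
print(X.shape)  # one resultant column appended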
     
@@ -2044,63 +2013,63 @@
    350    def generate_correlated(
    -351        self,
    -352        X: ArrayLike,
    -353        feature_indices: list[int] | ArrayLike,
    -354        r: float = 0.8,
    -355    ) -> np.ndarray:
    -356
    -357        """
    -358        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
    -359        :param X: dataset
    -360        :param feature_indices: indices of features to generate correlated feature to
    -361        :param r: (Pearson) correlation factor
    -362        :return: X with generated correlated  features
    -363        """
    -364
    -365        if not isinstance(feature_indices, (list, np.ndarray)):
    -366            feature_indices = np.array([feature_indices])
    -367
    -368        if len(feature_indices) > 1:
    -369            correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
    -370        else:
    -371            correlated_ixs = len(X[0])
    -372
    -373        selected_features = X[:, feature_indices]
    -374        transposed = np.transpose(selected_features)
    -375        correlated_features = []
    -376
    -377        for t in transposed:
    -378            theta = np.arccos(r)
    -379            t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
    -380
    -381            rand = np.random.normal(0, 1, len(t_standard))
    -382            rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
    -383
    -384            M = np.column_stack((t_standard, rand))
    -385            M_centred = (M - np.mean(M, axis=0))
    -386
    -387            Id = np.eye(len(t))
    -388            Q = qr(M_centred[:, [0]], mode='economic')[0]
    -389            P = np.dot(Q, Q.T)
    -390            orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
    -391            M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
    -392
    -393            Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
    -394            corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
    -395
    -396            correlated_features.append(corr)
    352    def generate_correlated(
    +353        self,
    +354        X: ArrayLike,
    +355        feature_indices: list[int] | ArrayLike,
    +356        r: float = 0.8,
    +357    ) -> np.ndarray:
    +358
    +359        """
    +360        Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.
    +361        :param X: dataset
    +362        :param feature_indices: indices of features to generate correlated feature to
    +363        :param r: (Pearson) correlation factor
    +364        :return: X with generated correlated  features
    +365        """
    +366
    +367        if not isinstance(feature_indices, (list, np.ndarray)):
    +368            feature_indices = np.array([feature_indices])
    +369
    +370        if len(feature_indices) > 1:
    +371            correlated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices)), 1)
    +372        else:
    +373            correlated_ixs = len(X[0])
    +374
    +375        selected_features = X[:, feature_indices]
    +376        transposed = np.transpose(selected_features)
    +377        correlated_features = []
    +378
    +379        for t in transposed:
    +380            theta = np.arccos(r)
    +381            t_standard = (t - np.mean(t)) / (np.std(t) + 1e-10)
    +382
    +383            rand = np.random.normal(0, 1, len(t_standard))
    +384            rand = (rand - np.mean(rand)) / (np.std(rand) + 1e-10)
    +385
    +386            M = np.column_stack((t_standard, rand))
    +387            M_centred = (M - np.mean(M, axis=0))
    +388
    +389            Id = np.eye(len(t))
    +390            Q = qr(M_centred[:, [0]], mode='economic')[0]
    +391            P = np.dot(Q, Q.T)
    +392            orthogonal_projection = np.dot(Id - P, M_centred[:, 1])
    +393            M_orthogonal = np.column_stack((M_centred[:, 0], orthogonal_projection))
    +394
    +395            Y = np.dot(M_orthogonal, np.diag(1 / np.sqrt(np.sum(M_orthogonal ** 2, axis=0))))
    +396            corr = Y[:, 1] + (1 / np.tan(theta)) * Y[:, 0]
     397
    -398        correlated_features = np.transpose(correlated_features)
    +398            correlated_features.append(corr)
     399
    -400        self.dataset_info['correlations'].append({
    -401            'feature_indices': feature_indices,
    -402            'correlated_indices': correlated_ixs,
    -403            'correlation_factor': r,
    -404        })
    -405
    -406        return np.column_stack((X, correlated_features))
    +400        correlated_features = np.transpose(correlated_features)
    +401
    +402        self.dataset_info['correlations'].append({
    +403            'feature_indices': feature_indices,
    +404            'correlated_indices': correlated_ixs,
    +405            'correlation_factor': r,
    +406        })
    +407
    +408        return np.column_stack((X, correlated_features))
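Since the construction targets the Pearson correlation through theta = arccos(r), the achieved value can be sanity-checked with np.corrcoef (sketch):

import numpy as np
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=3, n_samples=1000)
X = cc.generate_correlated(X, [0], r=0.8)
print(np.corrcoef(X[:, 0], X[:, 3])[0, 1])  # close to 0.8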
     
@@ -2134,30 +2103,30 @@
    408    def generate_duplicates(
    -409        self,
    -410        X: ArrayLike,
    -411        feature_indices: list[int] | ArrayLike,
    -412    ) -> np.ndarray:
    -413        """
    -414        Generates duplicate features
    -415        :param X: dataset
    -416        :param feature_indices: indices of features to duplicate
    -417        :return: dataset with duplicated features
    -418        """
    -419        if not isinstance(feature_indices, (list, np.ndarray)):
    -420            feature_indices = np.array([feature_indices])
    -421
    -422        duplicated_ixs = np.arange(len(X[0]), (len(X[0]) + len(feature_indices) - 1), 1)
    410    def generate_duplicates(
    +411        self,
    +412        X: ArrayLike,
    +413        feature_indices: list[int] | ArrayLike,
    +414    ) -> np.ndarray:
    +415        """
    +416        Generates duplicate features
    +417        :param X: dataset
    +418        :param feature_indices: indices of features to duplicate
    +419        :return: dataset with duplicated features
    +420        """
    +421        if not isinstance(feature_indices, (list, np.ndarray)):
    +422            feature_indices = np.array([feature_indices])
     423
    -424        selected_features = X[:, feature_indices]
    +424        duplicated_ixs = np.arange(len(X[0]), len(X[0]) + len(feature_indices), 1)
     425
    -426        self.dataset_info['duplicates'].append({
    -427            'feature_indices': feature_indices,
    -428            'duplicate_indices': duplicated_ixs,
    -429        })
    -430
    -431        return np.column_stack((X, selected_features))
    +426        selected_features = X[:, feature_indices]
    +427
    +428        self.dataset_info['duplicates'].append({
    +429            'feature_indices': feature_indices,
    +430            'duplicate_indices': duplicated_ixs,
    +431        })
    +432
    +433        return np.column_stack((X, selected_features))
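A quick check that duplicated columns are exact copies appended at the end (sketch):

import numpy as np
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=100)
X = cc.generate_duplicates(X, [1])
assert np.array_equal(X[:, 1], X[:, 4])  # column 1 duplicated as column 4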
     
@@ -2184,106 +2153,108 @@
generate_labels: signature updated, class_relation: str = 'linear' is now class_relation: Literal['linear', 'nonlinear', 'cluster'] = 'linear'; adds random_state: int = 42
    433    def generate_labels(
    -434        self,
    -435        X: ArrayLike,
    -436        n: int = 2,
    -437        p: float | list[float] | ArrayLike = 0.5,
    -438        k: int | float = 2,
    -439        decision_function: Optional = None,
    -440        class_relation: str = 'linear',
    -441        balance: bool = False,
    -442    ):
    -443        """
    -444        Generates labels for dataset X
    -445        :param X: dataset
    -446        :param n: number of class labels
    -447        :param p: class distribution
    -448        :param k: constant
    -449        :param decision_function: optional user-defined decision function
    -450        :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    -451        :param balance: boolean, whether to balance clustering class labels
    -452        :return: array of labels, corresponding to dataset X
    -453        """
    -454
    -455        if isinstance(p, (list, np.ndarray)):
    -456            if sum(p) > 1: raise ValueError('sum of values in must be less than 1.0')
    -457            if len(p) > n: raise ValueError('length of p must equal n')
    435    def generate_labels(
    +436        self,
    +437        X: ArrayLike,
    +438        n: int = 2,
    +439        p: float | list[float] | ArrayLike = 0.5,
    +440        k: int | float = 2,
    +441        decision_function: Optional = None,
    +442        class_relation: Literal['linear', 'nonlinear', 'cluster'] = 'linear',
    +443        balance: bool = False,
    +444        random_state: int = 42,
    +445    ):
    +446        """
    +447        Generates labels for dataset X
    +448        :param X: dataset
    +449        :param n: number of class labels
    +450        :param p: class distribution
    +451        :param k: constant
    +452        :param decision_function: optional user-defined decision function
    +453        :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    +454        :param balance: boolean, whether to balance clustering class labels
    +455        :param random_state: seed for KMeans clustering, defaults to 42
    +456        :return: array of labels, corresponding to dataset X
    +457        """
     458
    -459        if p > 1: raise ValueError('p must be less than 1.0')
    -460
    -461        n_samples, n_features = X.shape
    +459        if isinstance(p, (list, np.ndarray)):
    +460            if sum(p) > 1: raise ValueError('sum of values in p must not exceed 1.0')
    +461            if len(p) != n: raise ValueError('length of p must equal n')
     462
    -463        if decision_function is None:
    -464            if class_relation == 'linear':
    -465                decision_function = lambda x: np.sum(2 * x + 3, axis=1)
    -466            elif class_relation == 'nonlinear':
    -467                decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
    -468            elif class_relation == 'cluster':
    -469                decision_function = None
    -470        else:
    -471            class_relation = str(decision_function.__name__)
    -472
    -473        y = []
    -474        if decision_function is not None:
    -475            if n > 2:
    -476                if type(p) != list:
    -477                    p = 1 / n
    -478                    percentiles = [p * 100]
    -479                    for i in range(1, n - 1):
    -480                        percentiles.append(percentiles[i - 1] + (p * 100))
    -481
    -482                    decision_boundary = decision_function(X)
    -483                    p_points = np.percentile(decision_boundary, percentiles)
    -484
    -485                    y = np.zeros_like(decision_boundary, dtype=int)
    -486                    for p_point in p_points:
    -487                        y += (decision_boundary > p_point)
    -488                else:
    -489                    decision_boundary = decision_function(X)
    -490                    percentiles = [x * 100 for x in p]
    -491
    -492                    for i in range(1, len(percentiles) - 1):
    -493                        percentiles[i] += percentiles[i - 1]
    -494
    -495                    percentiles.insert(0, 0)
    -496                    percentiles.pop()
    -497                    print(percentiles)
+463        elif p > 1: raise ValueError('p must not exceed 1.0')
    +464
    +465        n_samples, n_features = X.shape
    +466
    +467        if decision_function is None:
    +468            if class_relation == 'linear':
    +469                decision_function = lambda x: np.sum(2 * x + 3, axis=1)
    +470            elif class_relation == 'nonlinear':
    +471                decision_function = lambda x: np.sum(k * np.sin(x) + k * np.cos(x), axis=1)
    +472            elif class_relation == 'cluster':
    +473                decision_function = None
    +474        else:
    +475            class_relation = str(decision_function.__name__)
    +476
    +477        y = []
    +478        if decision_function is not None:
    +479            if n > 2:
    +480                if type(p) != list:
    +481                    p = 1 / n
    +482                    percentiles = [p * 100]
    +483                    for i in range(1, n - 1):
    +484                        percentiles.append(percentiles[i - 1] + (p * 100))
    +485
    +486                    decision_boundary = decision_function(X)
    +487                    p_points = np.percentile(decision_boundary, percentiles)
    +488
    +489                    y = np.zeros_like(decision_boundary, dtype=int)
    +490                    for p_point in p_points:
    +491                        y += (decision_boundary > p_point)
    +492                else:
    +493                    decision_boundary = decision_function(X)
    +494                    percentiles = [x * 100 for x in p]
    +495
    +496                    for i in range(1, len(percentiles) - 1):
    +497                        percentiles[i] += percentiles[i - 1]
     498
    -499                    p_points = np.percentile(decision_boundary, percentiles)
    -500                    print(p_points)
    -501
    -502                    y = np.zeros_like(decision_boundary, dtype=int)
    -503                    for i in range(1, n):
    -504                        p_point = p_points[i]
    -505                        for j in range(len(decision_boundary)):
    -506                            if decision_boundary[j] > p_point:
    -507                                y[j] += 1
    -508            else:
    -509                decision_boundary = decision_function(X)
    -510                p_point = np.percentile(decision_boundary, p * 100)
    -511                y = np.where(decision_boundary > p_point, 1, 0)
    -512        else:
    -513            if p == 0.5:
    -514                p = 1.0
    -515            else:
    -516                p = [p, 1 - p]
    -517            y = self._cluster_data(X, n, p=p, balance=balance)
    -518
    -519        self.dataset_info.update({
    -520            'labels': {
    -521                'class_relation': class_relation,
    -522                'n_class': n,
    -523            },
    -524        })
    -525
    -526        return y
    +499                    percentiles.insert(0, 0)
    +500                    percentiles.pop()
    +501                    print(percentiles)
    +502
    +503                    p_points = np.percentile(decision_boundary, percentiles)
    +504                    print(p_points)
    +505
    +506                    y = np.zeros_like(decision_boundary, dtype=int)
    +507                    for i in range(1, n):
    +508                        p_point = p_points[i]
    +509                        for j in range(len(decision_boundary)):
    +510                            if decision_boundary[j] > p_point:
    +511                                y[j] += 1
    +512            else:
    +513                decision_boundary = decision_function(X)
    +514                p_point = np.percentile(decision_boundary, p * 100)
    +515                y = np.where(decision_boundary > p_point, 1, 0)
    +516        else:
    +517            if p == 0.5:
    +518                p = 1.0
    +519            else:
    +520                p = [p, 1 - p]
    +521            y = self._cluster_data(X, n, p=p, balance=balance, random_state=random_state)
    +522
    +523        self.dataset_info.update({
    +524            'labels': {
    +525                'class_relation': class_relation,
    +526                'n_class': n,
    +527            },
    +528        })
    +529
    +530        return y
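For context, a minimal sketch of exercising the updated labeling API end to end; the feature counts and cardinalities below are illustrative, not taken from the patch:

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=5, n_samples=1000, cardinality=10)

# Binary labels: samples above the 50th percentile of the linear
# decision function get class 1, the rest class 0.
y_linear = cc.generate_labels(X, n=2, p=0.5, class_relation='linear')

# Three classes from the nonlinear boundary; with a scalar p the class
# thresholds fall at equal 1/n percentile steps.
y_nonlinear = cc.generate_labels(X, n=3, class_relation='nonlinear')

# Cluster-derived labels; random_state is forwarded to KMeans so the
# labeling is reproducible across runs.
y_cluster = cc.generate_labels(X, n=2, class_relation='cluster', random_state=42)
```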
     
    @@ -2299,6 +2270,7 @@
    Parameters
  • decision_function: optional user-defined decision function
  • class_relation: string, either 'linear', 'nonlinear', or 'cluster'
  • balance: boolean, whether to balance clustering class labels
• + random_state: seed for KMeans clustering, defaults to 42
  • Returns
    @@ -2315,105 +2287,108 @@
    Returns
-def generate_noise(self, X: ArrayLike, y: list[int] | ArrayLike, p: float = 0.2, type: Literal = 'categorical', missing_val: str | int | float = float('-inf')) -> np.ndarray:
+def generate_noise(self, X: ArrayLike, y: list[int] | ArrayLike, p: float = 0.2, type: Literal['categorical', 'missing'] = 'categorical', missing_val: str | int | float = float('-inf')) -> np.ndarray:
    624    def generate_noise(
    -625        self,
    -626        X: ArrayLike,
    -627        y: list[int] | ArrayLike,
    -628        p: float = 0.2,
    -629        type: Literal = 'categorical',
    -630        missing_val: str | int | float = float('-inf'),
    -631    ) -> np.ndarray:
    -632
    -633        """
    -634        Simulates noise on given dataset X
    -635        :param X: dataset to apply noise to
    -636        :param y: required target labels for categorical noise generation
    -637        :param p: amount of noise to apply. Defaults to 0.2
    -638        :param type: type of noise to apply, either categorical or missing
    -639        :param missing_val: value to simulate missing values. Defaults to float('-inf')
    -640        :return: X with noise applied
    -641        """
    -642
    -643        self.dataset_info['noise'].append({
    -644            'type': type,
    -645            'amount': p,
    -646        })
    -647
    -648        if type == 'categorical':
    -649            label_values, label_count = np.unique(y, return_counts=True)
    -650            n_labels = len(label_values)
    -651
    -652            inds = y.argsort()
    -653            y_sort = y[inds]
    -654            X_sort = X[inds]
    +            
    632    def generate_noise(
    +633        self,
    +634        X: ArrayLike,
    +635        y: list[int] | ArrayLike,
    +636        p: float = 0.2,
    +637        type: Literal['categorical', 'missing'] = 'categorical',
    +638        missing_val: str | int | float = float('-inf'),
    +639    ) -> np.ndarray:
    +640
    +641        """
    +642        Simulates noise on given dataset X
    +643        :param X: dataset to apply noise to
    +644        :param y: required target labels for categorical noise generation
    +645        :param p: amount of noise to apply. Defaults to 0.2
    +646        :param type: type of noise to apply, either categorical or missing
    +647        :param missing_val: value to simulate missing values. Defaults to float('-inf')
    +648        :return: X with noise applied
    +649        """
    +650
    +651        self.dataset_info['noise'].append({
    +652            'type': type,
    +653            'amount': p,
    +654        })
     655
    -656            Xs_T = X_sort.T
    -657            n = Xs_T.shape[1]
    -658            n_flip = int(n * p)
    +656        if type == 'categorical':
    +657            label_values, label_count = np.unique(y, return_counts=True)
    +658            n_labels = len(label_values)
     659
    -660            for feature in Xs_T:
    -661                unique_per_label = {}
    -662
    -663                for i in range(n_labels):
    -664                    if i == 0:
    -665                        unique = np.unique(feature[:label_count[i]])
    -666                        unique_per_label[label_values[i]] = set(unique)
    -667                    else:
    -668                        unique = np.unique(feature[label_count[i - 1]:label_count[i - 1] + label_count[i] - 1])
    -669                        unique_per_label[label_values[i]] = set(unique)
    +660            inds = y.argsort()
    +661            y_sort = y[inds]
    +662            X_sort = X[inds]
    +663
    +664            Xs_T = X_sort.T
    +665            n = Xs_T.shape[1]
    +666            n_flip = int(n * p)
    +667
    +668            for feature in Xs_T:
    +669                unique_per_label = {}
     670
    -671                ixs = np.random.choice(n, n_flip, replace=False)
    -672
    -673                for ix in ixs:
    -674                    current_label = y_sort[ix]
    -675                    possible_labels = np.where(label_values != current_label)[0]
    -676
    -677                    # find all unique values from labels != current label
    -678                    values = set()
    -679                    for key in possible_labels:
    -680                        values = values.union(unique_per_label[key])
    -681
    -682                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
    -683                    # current label
    -684                    for val in unique_per_label[current_label] & values:
    -685                        values.remove(val)
    -686
    -687                    if len(values) > 0:
    -688                        val = np.random.choice(list(values))
    +671                for i in range(n_labels):
    +672                    if i == 0:
    +673                        unique = np.unique(feature[:label_count[i]])
    +674                        unique_per_label[label_values[i]] = set(unique)
    +675                    else:
    +676                        unique = np.unique(feature[label_count[i - 1]:label_count[i - 1] + label_count[i] - 1])
    +677                        unique_per_label[label_values[i]] = set(unique)
    +678
    +679                ixs = np.random.choice(n, n_flip, replace=False)
    +680
    +681                for ix in ixs:
    +682                    current_label = y_sort[ix]
    +683                    possible_labels = np.where(label_values != current_label)[0]
    +684
    +685                    # find all unique values from labels != current label
    +686                    values = set()
    +687                    for key in possible_labels:
    +688                        values = values.union(unique_per_label[key])
     689
    -690                    else:
    -691                        key = possible_labels[np.random.randint(len(possible_labels))]
    -692                        values = unique_per_label[key]
    -693                        val = np.random.choice(list(values))
    +690                    # remove any overlapping values, ensuring replacement values are unique & from a target label !=
    +691                    # current label
    +692                    for val in unique_per_label[current_label] & values:
    +693                        values.remove(val)
     694
    -695                    feature[ix] = val
    -696
    -697            rev_ind = inds.argsort()
    -698            X_noise = Xs_T.T
    -699            X_noise = X_noise[rev_ind]
    -700
    -701            return X_noise
    +695                    if len(values) > 0:
    +696                        val = np.random.choice(list(values))
    +697
    +698                    else:
    +699                        key = possible_labels[np.random.randint(len(possible_labels))]
    +700                        values = unique_per_label[key]
    +701                        val = np.random.choice(list(values))
     702
    -703        elif type == 'missing':
    -704            X_noise = np.copy(X)
    -705            Xn_T = X_noise.T
    -706            n = Xn_T.shape[1]
    -707            n_missing = int(n * p)
    -708            #print("n to delete:", n_missing)
    -709
    -710            for feature in Xn_T:
    -711                ixs = np.random.choice(n, n_missing, replace=False)
    -712
    -713                for ix in ixs:
    -714                    feature[ix] = missing_val
    -715
    -716            return Xn_T.T
    +703                    feature[ix] = val
    +704
    +705            rev_ind = inds.argsort()
    +706            X_noise = Xs_T.T
    +707            X_noise = X_noise[rev_ind]
    +708
    +709            return X_noise
    +710
    +711        elif type == 'missing':
    +712            X_noise = np.copy(X)
    +713            Xn_T = X_noise.T
    +714            n = Xn_T.shape[1]
    +715            n_missing = int(n * p)
    +716            #print("n to delete:", n_missing)
    +717
    +718            for feature in Xn_T:
    +719                ixs = np.random.choice(n, n_missing, replace=False)
    +720
    +721                for ix in ixs:
    +722                    feature[ix] = missing_val
    +723
    +724            return Xn_T.T
    +725
    +726        else:
    +727            raise ValueError(f'Type {type} not supported')
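A short sketch of both noise modes on synthetic data (sizes are illustrative):

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=500, cardinality=8)
y = cc.generate_labels(X, n=2)

# Categorical noise: flip 20% of each feature's values to values
# observed under a different class label.
X_flipped = cc.generate_noise(X, y, p=0.2, type='categorical')

# Missing-value noise: replace 10% of each feature's values with a
# chosen marker instead of the default float('-inf').
X_missing = cc.generate_noise(X, y, p=0.1, type='missing', missing_val=-1)
```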
     
    @@ -2449,66 +2424,66 @@
    Returns
    -
    718    def downsample_dataset(
    -719        self,
    -720        X: ArrayLike,
    -721        y: list[int] | ArrayLike,
    -722        N: int | None = None,
    -723        seed: int = 42,
    -724        reshuffle: bool = False,
    -725    ) -> tuple[np.ndarray, np.ndarray]:
    -726
    -727        """
    -728        Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
    -729        :param X: Dataset to downsample
    -730        :param y: Labels corresponding to X
    -731        :param N: Optional number of samples per class to downsample to
    -732        :param seed: Seed for random state of resample function
    -733        :param reshuffle: Reshuffle the dataset after downsampling
    -734        :return: Balanced X and y after downsampling
    -735        """
    -736
    -737        original_shape = X.shape
    -738
    -739        values, counts = np.unique(y, return_counts=True)
    -740        if N is None:
    -741            N = min(counts)
    -742
    -743        if N > min(counts):
    -744            raise ValueError('N must be equal to or less than the number of samples in minority class')
    -745
    -746        X_arrays_list = []
    -747        y_downsampled = []
    -748        for label in values:
    -749            X_label = [X[i] for i in range(len(y)) if y[i] == label]
    -750            X_label_downsample = resample(
    -751                X_label,
    -752                replace=True,
    -753                n_samples=N,
    -754                random_state=seed,
    -755            )
    -756            X_arrays_list.append(X_label_downsample)
    -757            ys = [label] * N
    -758            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
    -759
    -760        X_downsampled = np.concatenate(X_arrays_list, axis=0)
    -761
    -762        if reshuffle:
    -763            indices = np.arange(len(X_downsampled))
    -764            np.random.shuffle(indices)
    -765            X_downsampled = X_downsampled[indices]
    -766            y_downsampled = y_downsampled[indices]
    -767
    -768        downsampled_shape = X_downsampled.shape
    -769
    -770        self.dataset_info.update({
    -771            'downsampling': {
    -772                'original_shape': original_shape,
    -773                'downsampled_shape': downsampled_shape,
    -774            },
    -775        })
    -776
    -777        return X_downsampled, y_downsampled
    +            
    729    def downsample_dataset(
    +730        self,
    +731        X: ArrayLike,
    +732        y: list[int] | ArrayLike,
    +733        N: int | None = None,
    +734        seed: int = 42,
    +735        reshuffle: bool = False,
    +736    ) -> tuple[np.ndarray, np.ndarray]:
    +737
    +738        """
    +739        Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
    +740        :param X: Dataset to downsample
    +741        :param y: Labels corresponding to X
    +742        :param N: Optional number of samples per class to downsample to
    +743        :param seed: Seed for random state of resample function
    +744        :param reshuffle: Reshuffle the dataset after downsampling
    +745        :return: Balanced X and y after downsampling
    +746        """
    +747
    +748        original_shape = X.shape
    +749
    +750        values, counts = np.unique(y, return_counts=True)
    +751        if N is None:
    +752            N = min(counts)
    +753
    +754        if N > min(counts):
    +755            raise ValueError('N must be equal to or less than the number of samples in minority class')
    +756
    +757        X_arrays_list = []
    +758        y_downsampled = []
    +759        for label in values:
    +760            X_label = [X[i] for i in range(len(y)) if y[i] == label]
    +761            X_label_downsample = resample(
    +762                X_label,
    +763                replace=True,
    +764                n_samples=N,
    +765                random_state=seed,
    +766            )
    +767            X_arrays_list.append(X_label_downsample)
    +768            ys = [label] * N
    +769            y_downsampled = np.concatenate((y_downsampled, ys), axis=0)
    +770
    +771        X_downsampled = np.concatenate(X_arrays_list, axis=0)
    +772
    +773        if reshuffle:
    +774            indices = np.arange(len(X_downsampled))
    +775            np.random.shuffle(indices)
    +776            X_downsampled = X_downsampled[indices]
    +777            y_downsampled = y_downsampled[indices]
    +778
    +779        downsampled_shape = X_downsampled.shape
    +780
    +781        self.dataset_info.update({
    +782            'downsampling': {
    +783                'original_shape': original_shape,
    +784                'downsampled_shape': downsampled_shape,
    +785            },
    +786        })
    +787
    +788        return X_downsampled, y_downsampled
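A sketch of both downsampling modes, assuming an imbalanced labeling like the one below:

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=1000, cardinality=6)
y = cc.generate_labels(X, n=2, p=0.3)  # roughly 70/30 class split

# Balance to the minority-class count and reshuffle the result.
X_bal, y_bal = cc.downsample_dataset(X, y, reshuffle=True)

# Or cap every class at an explicit per-class count (must not exceed
# the minority-class size).
X_cap, y_cap = cc.downsample_dataset(X, y, N=100, seed=42)
```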
     
    @@ -2544,29 +2519,29 @@
    Returns
    -
    779    def print_dataset(
    -780        self,
    -781        X: ArrayLike,
    -782        y: ArrayLike,
    -783    ):
    -784        """
    -785        Prints given dataset
    -786        :param X: dataset
    -787        :param y: labels
    -788        :return:
    -789        """
    -790
    -791        n_samples, n_features = X.shape
    -792        n = 0
    -793        for arr in X:
    -794            print('[', end='')
    -795            for i in range(n_features):
    -796                if i == n_features - 1:
    -797                    print(arr[i], end='')
    -798                else:
    -799                    print(arr[i], end=', ')
    -800            print(f'], Label: {y[n]}')
    -801            n += 1
    +            
    790    def print_dataset(
    +791        self,
    +792        X: ArrayLike,
    +793        y: ArrayLike,
    +794    ):
    +795        """
    +796        Prints given dataset
    +797        :param X: dataset
    +798        :param y: labels
    +799        :return:
    +800        """
    +801
    +802        n_samples, n_features = X.shape
    +803        n = 0
    +804        for arr in X:
    +805            print('[', end='')
    +806            for i in range(n_features):
    +807                if i == n_features - 1:
    +808                    print(arr[i], end='')
    +809                else:
    +810                    print(arr[i], end=', ')
    +811            print(f'], Label: {y[n]}')
    +812            n += 1
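Usage is a one-liner; a sketch on a tiny dataset so the output stays readable (the printed row in the comment is illustrative):

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=5, cardinality=3)
y = cc.generate_labels(X, n=2)

cc.print_dataset(X, y)  # e.g. "[2.0, 0.0, 1.0, 1.0], Label: 1" per sample
```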
     
    @@ -2583,67 +2558,6 @@
    Returns
    -
    803    def summarize(self):
    -804
    -805        print(f"Number of features: {self.dataset_info['general']['n_features']}")
    -806        print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
    -807        if self.dataset_info['downsampling']:
    -808            print(
    -809                f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
    -810            )
    -811        print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
    -812        print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
    -813
    -814        print('-------------------------------------')
    -815
    -816        if len(self.dataset_info['combinations']) > 0:
    -817            print('Combinations:')
    -818            for comb in self.dataset_info['combinations']:
    -819                print(
    -820                    f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}",
    -821                )
    -822            print('-------------------------------------')
    -823
    -824        if len(self.dataset_info['correlations']) > 0:
    -825            print('Correlations:')
    -826            for corr in self.dataset_info['correlations']:
    -827                print(
    -828                    f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
    -829                )
    -830            print('-------------------------------------')
    -831
    -832        if len(self.dataset_info['duplicates']) > 0:
    -833            print('Duplicates:')
    -834            for dup in self.dataset_info['duplicates']:
    -835                print(
    -836                    f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
    -837                )
    -838            print('-------------------------------------')
    -839
    -840        if len(self.dataset_info['noise']) > 0:
    -841            print('Simulated noise:')
    -842            for noise in self.dataset_info['noise']:
    -843                print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}")
    -844            print('-------------------------------------')
    -845
    -846        print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
    -

diff --git a/docs/search.js b/docs/search.js
index 871f608..904b8d8 100644
--- a/docs/search.js
+++ b/docs/search.js
@@ -1,6 +1,6 @@
[Regenerated pdoc search index. The vendored, minified elasticlunr bundle (http://weixsong.github.io) is unchanged and omitted here; only the indexed documentation text is reproduced below.]

Welcome to OutRank's documentation!

All functions/methods can be searched-for (search bar on the left).

This tool enables fast screening of feature-feature interactions. Its purpose is to give the user fast insight into potential redundancies/anomalies in the data. It is implemented to operate in _mini batches_; it traverses the raw data incrementally, refining the rankings as it goes along. The core operation, interaction ranking, outputs triplets which look as follows:

    featureA    featureB    0.512
    featureA    featureC    0.125

Setup

```
pip install outrank
```

and test a minimal cycle with

```
outrank --task selftest
```

If this passes, you can be pretty certain OutRank will perform as intended. OutRank's primary use case is as a CLI tool; begin exploring with

```
outrank --help
```

Example use cases

• A minimal showcase of performing feature ranking on a generic CSV is demonstrated with this example.
• More examples demonstrating OutRank's capabilities are also available.

OutRank as a Python library

Once installed, _OutRank_ can be used as any other Python library. For example, generic feature ranking algorithms can be accessed as

```python
import numpy as np  # needed by the snippet; missing from the original listing

from outrank.algorithms.feature_ranking.ranking_mi_numba import (
    mutual_info_estimator_numba,
)

# Some synthetic minimal data (Numpy vectors)
a = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32)

lowest = np.array(np.random.permutation(a), dtype=np.int32)
medium = np.array([1, 1, 0, 0, 1, 1, 1, 1], dtype=np.int32)
high = np.array([1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32)

lowest_score = mutual_info_estimator_numba(
    a, lowest, np.float32(1.0), False,
)
medium_score = mutual_info_estimator_numba(
    a, medium, np.float32(1.0), False,
)
high_score = mutual_info_estimator_numba(
    a, high, np.float32(1.0), False,
)

scores = [lowest_score, medium_score, high_score]
sorted_score_indices = np.argsort(scores)
assert np.sum(np.array([0, 1, 2]) - sorted_score_indices) == 0
```

Creating a simple dataset

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()

# Creates a simple dataset of 9 features, 10k samples, with feature cardinality of all features being 35
X = cc.generate_data(9,
                     10000,
                     cardinality=35,
                     ensure_rep=True,
                     random_values=True,
                     low=0,
                     high=40)

# Creates target labels via clustering
y = cc.generate_labels(X, n=2, class_relation='cluster')
```
    \n"}, {"fullname": "outrank.algorithms", "modulename": "outrank.algorithms", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking", "modulename": "outrank.algorithms.feature_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "numba_unique", "kind": "function", "doc": "

    Identify unique elements in an array, fast

    \n", "signature": "(a):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_conditional_entropy", "kind": "function", "doc": "

    \n", "signature": "(\tY_classes,\tclass_values,\tclass_var_shape,\tinitial_prob,\tnonzero_counts):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_entropies", "kind": "function", "doc": "

    Core entropy computation function

    \n", "signature": "(X, Y, all_events, f_values, f_value_counts, cardinality_correction):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "stratified_subsampling", "kind": "function", "doc": "

    \n", "signature": "(Y, X, approximation_factor, _f_values_X):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "mutual_info_estimator_numba", "kind": "function", "doc": "

    Core estimator logic. Compute unique elements, subset if required

    \n", "signature": "(Y, X, approximation_factor=1.0, cardinality_correction=False):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator", "modulename": "outrank.algorithms.importance_estimator", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.importance_estimator.logger", "modulename": "outrank.algorithms.importance_estimator", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.algorithms.importance_estimator.num_folds", "modulename": "outrank.algorithms.importance_estimator", "qualname": "num_folds", "kind": "variable", "doc": "

    \n", "default_value": "4"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_MI", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_MI", "kind": "function", "doc": "

    \n", "signature": "(vector_first: Any, vector_second: Any) -> float:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_surrogate", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_surrogate", "kind": "function", "doc": "

    \n", "signature": "(\tvector_first: Any,\tvector_second: Any,\tX: Any,\tsurrogate_model: str) -> float:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.numba_mi", "modulename": "outrank.algorithms.importance_estimator", "qualname": "numba_mi", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_mi_adj", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_mi_adj", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_pairwise", "kind": "function", "doc": "

    A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel.

    \n", "signature": "(combination, reference_model_features, args, tmp_df):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.rank_features_3MR", "modulename": "outrank.algorithms.importance_estimator", "qualname": "rank_features_3MR", "kind": "function", "doc": "

    \n", "signature": "(\trelevance_dict: dict[str, float],\tredundancy_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\trelational_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\tstrategy: str = 'median',\talpha: float = 1,\tbeta: float = 1) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_nonmyopic", "kind": "function", "doc": "

    \n", "signature": "(args: Any, tmp_df: pandas.core.frame.DataFrame):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.initialize_classifier", "modulename": "outrank.algorithms.importance_estimator", "qualname": "initialize_classifier", "kind": "function", "doc": "

    \n", "signature": "(surrogate_model: str):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches", "modulename": "outrank.algorithms.sketches", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms", "modulename": "outrank.algorithms.sketches.counting_cms", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.cms_hash", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "cms_hash", "kind": "function", "doc": "

    \n", "signature": "(x, seed, width):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch", "kind": "class", "doc": "

    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.__init__", "kind": "function", "doc": "

    \n", "signature": "(depth=6, width=32768, M=None)"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.depth", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.width", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.width", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.hash_seeds", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.M", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.M", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.add", "kind": "function", "doc": "

    \n", "signature": "(self, x, delta=1):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst, delta=1):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.query", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.query", "kind": "function", "doc": "

    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.get_matrix", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter", "kind": "class", "doc": "

    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.__init__", "kind": "function", "doc": "

    \n", "signature": "(bound: int = 30000)"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.max_bound_thr", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.default_counter", "kind": "variable", "doc": "

    \n", "annotation": ": collections.Counter"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.add", "kind": "function", "doc": "

    \n", "signature": "(self, val):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "kind": "module", "doc": "

    This module implements probabilistic data structure which is able to calculate the cardinality of large multisets in a single pass using little auxiliary memory

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.__init__", "kind": "function", "doc": "

    \n", "signature": "(error_rate=0.005)"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.p", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.m", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_set", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_size", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.width", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.hll_flag", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.add", "kind": "function", "doc": "

    \n", "signature": "(self, value):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators", "modulename": "outrank.algorithms.synthetic_data_generators", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.dataset_info", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.dataset_info", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_data", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_data", "kind": "function", "doc": "

    Generates dataset based on given parameters

    \n\n
    Parameters
    \n\n
      \n
    • n_features: number of generated features
    • \n
    • n_samples: number of generated samples
    • \n
    • cardinality: default cardinality of the dataset
    • \n
    • structure: structure of the dataset
    • \n
    • ensure_rep: flag, ensures all given values represented
    • \n
    • random_values: flag, enables random (integer) feature values from set [low, high]
    • \n
    • low: sets lower bound of random feature values
    • \n
    • high: sets high bound of random feature values
    • \n
    • seed: sets seed of numpy random
    • \n
    \n\n
    Returns
    \n\n
    \n

    X, 2D dataset

    \n
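A sketch of the structure parameter, assuming the entry formats the generator accepts ([index, cardinality], [index, [value_domain, frequencies]], or a list of indices sharing one spec):

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()

structure = [
    [0, 3],                                  # feature 0: cardinality 3
    [[2, 3], [[1, 5, 9], [0.6, 0.3, 0.1]]],  # features 2 and 3: explicit domain with frequencies
]
# Features without an entry fall back to the default cardinality.
X = cc.generate_data(n_features=5, n_samples=1000, cardinality=4, structure=structure)
```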
    \n", "signature": "(\tself,\tn_features: int,\tn_samples: int,\tcardinality: int = 5,\tstructure: Union[list, numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]], NoneType] = None,\tensure_rep: bool = False,\trandom_values: bool | None = False,\tlow: int | None = 0,\thigh: int | None = 1000,\tseed: int = 42) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_combinations", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_combinations", "kind": "function", "doc": "

    Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indexes of features to be in combination
    • \n
    • combination_function: optional custom function for combining feature vectors
    • \n
    • combination_type: string flag, either liner or nonlinear, defining combination type
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with added resultant feature

    \n
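A sketch of appending combination features; only the two documented combination types are shown, since the custom-function contract is not spelled out here:

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=500, cardinality=6)

X = cc.generate_combinations(X, [0, 1], combination_type='linear')        # appends a linear combination
X = cc.generate_combinations(X, [0, 1, 2], combination_type='nonlinear')  # appends a nonlinear one
```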
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tcombination_function: Optional = None,\tcombination_type: Literal = 'linear') -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_correlated", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_correlated", "kind": "function", "doc": "

    Generates correlated features using the given feature indices. Correlation is based on cosine of angle between vectors with mean 0.

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indices of features to generate correlated feature to
    • \n
    • r: (Pearson) correlation factor
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with generated correlated features

    \n
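And a sketch of generating a correlated companion feature:

```python
from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification

cc = CategoricalClassification()
X = cc.generate_data(n_features=4, n_samples=500, cardinality=6)

# Appends one feature correlated (r = 0.8) with feature 0.
X = cc.generate_correlated(X, [0], r=0.8)
```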
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tr: float = 0.8) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_duplicates", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_duplicates", "kind": "function", "doc": "

    Generates duplicate features

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indices of features to duplicate
    • \n
    \n\n
    Returns
    \n\n
    \n

    dataset with duplicated features

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_labels", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_labels", "kind": "function", "doc": "

    Generates labels for dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • n: number of class labels
    • \n
    • p: class distribution
    • \n
    • k: constant
    • \n
    • decision_function: optional user-defined decision function
    • \n
    • class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    • \n
    • balance: boolean, whether to balance clustering class labels
    • \n
    \n\n
    Returns
    \n\n
    \n

    array of labels, corresponding to dataset X

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tn: int = 2,\tp: Union[float, list[float], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]] = 0.5,\tk: int | float = 2,\tdecision_function: Optional = None,\tclass_relation: str = 'linear',\tbalance: bool = False):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_noise", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_noise", "kind": "function", "doc": "

    Simulates noise on the given dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset to apply noise to
    • \n
    • y: required target labels for categorical noise generation
    • \n
    • p: amount of noise to apply. Defaults to 0.2
    • \n
    • type: type of noise to apply, either categorical or missing
    • \n
    • missing_val: value to simulate missing values. Defaults to float('-inf')
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with noise applied

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tp: float = 0.2,\ttype: Literal = 'categorical',\tmissing_val: str | int | float = -inf) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.downsample_dataset", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.downsample_dataset", "kind": "function", "doc": "

    Downsamples dataset X according to N or to the number of samples in the minority class, resulting in a balanced dataset.

    \n\n
    Parameters
    \n\n
      \n
    • X: Dataset to downsample
    • \n
    • y: Labels corresponding to X
    • \n
    • N: Optional number of samples per class to downsample to
    • \n
    • seed: Seed for random state of resample function
    • \n
    • reshuffle: Reshuffle the dataset after downsampling
    • \n
    \n\n
    Returns
    \n\n
    \n

    Balanced X and y after downsampling

    \n
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tN: int | None = None,\tseed: int = 42,\treshuffle: bool = False) -> tuple[numpy.ndarray, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.print_dataset", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.print_dataset", "kind": "function", "doc": "

    Prints the given dataset

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • y: labels
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]]):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.summarize", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.summarize", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "qualname": "generate_random_matrix", "kind": "function", "doc": "

    \n", "signature": "(num_features=100, size=20000):", "funcdef": "def"}, {"fullname": "outrank.core_ranking", "modulename": "outrank.core_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_ranking.logger", "modulename": "outrank.core_ranking", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_CARDINALITY_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, {"fullname": "outrank.core_ranking.GLOBAL_COUNTS_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_COUNTS_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, {"fullname": "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_RARE_VALUE_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]", "default_value": "Counter()"}, {"fullname": "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_PRIOR_COMB_COUNTS", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, int]", "default_value": "Counter()"}, {"fullname": "outrank.core_ranking.IGNORED_VALUES", "modulename": "outrank.core_ranking", "qualname": "IGNORED_VALUES", "kind": "variable", "doc": "

    \n", "default_value": "set()"}, {"fullname": "outrank.core_ranking.HYPERLL_ERROR_BOUND", "modulename": "outrank.core_ranking", "qualname": "HYPERLL_ERROR_BOUND", "kind": "variable", "doc": "

    \n", "default_value": "0.02"}, {"fullname": "outrank.core_ranking.MAX_FEATURES_3MR", "modulename": "outrank.core_ranking", "qualname": "MAX_FEATURES_3MR", "kind": "variable", "doc": "

    \n", "default_value": "10000"}, {"fullname": "outrank.core_ranking.prior_combinations_sample", "modulename": "outrank.core_ranking", "qualname": "prior_combinations_sample", "kind": "function", "doc": "

    Ensure that only the relevant subspace of combinations is selected, based on prior counts

    \n", "signature": "(\tcombinations: list[tuple[typing.Any, ...]],\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.get_combinations_from_columns", "modulename": "outrank.core_ranking", "qualname": "get_combinations_from_columns", "kind": "function", "doc": "

    Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope

    \n", "signature": "(\tall_columns: pandas.core.indexes.base.Index,\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.mixed_rank_graph", "modulename": "outrank.core_ranking", "qualname": "mixed_rank_graph", "kind": "function", "doc": "

    Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tcpu_pool: Any,\tpbar: Any) -> outrank.core_utils.BatchRankingSummary:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.enrich_with_transformations", "modulename": "outrank.core_ranking", "qualname": "enrich_with_transformations", "kind": "function", "doc": "

    Construct a collection of new features based on pre-defined transformations/rules

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnum_col_types: set[str],\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_combined_features", "modulename": "outrank.core_ranking", "qualname": "compute_combined_features", "kind": "function", "doc": "

    Compute higher-order features via an xxhash-based trick.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tpbar: Any,\tis_3mr: bool = False) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_expanded_multivalue_features", "modulename": "outrank.core_ranking", "qualname": "compute_expanded_multivalue_features", "kind": "function", "doc": "

    Compute a one-hot encoded feature space based on each designated multivalue feature. E.g., a feature with value \"a,b,c\" becomes three features, whose values indicate the presence of each listed value in the multivalue feature of choice.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_subfeatures", "modulename": "outrank.core_ranking", "qualname": "compute_subfeatures", "kind": "function", "doc": "

    Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.\n->: One-sided construction - every value from the left-side feature is kept as-is, while separate values from the right-side feature are considered.\n<->: Two-sided construction - values from both sides are combined; each value of feature A is combined with each value of feature B, forming |A|*|B| new features (one-hot encoded)
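
    \n\n

    Example command argument (the same example appears in outrank.core_utils.pro_tips): --subfeature_mapping \"feature_a->feature_b;feature_c<->feature_d\"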

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.include_noisy_features", "modulename": "outrank.core_ranking", "qualname": "include_noisy_features", "kind": "function", "doc": "

    Add randomized features that serve as a sanity check

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_coverage", "modulename": "outrank.core_ranking", "qualname": "compute_coverage", "kind": "function", "doc": "

    Compute coverage of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_feature_memory_consumption", "modulename": "outrank.core_ranking", "qualname": "compute_feature_memory_consumption", "kind": "function", "doc": "

    An approximation of how much memory features take up

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_value_counts", "modulename": "outrank.core_ranking", "qualname": "compute_value_counts", "kind": "function", "doc": "

    Update the count structure

    \n", "signature": "(input_dataframe: pandas.core.frame.DataFrame, args: Any):", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_cardinalities", "modulename": "outrank.core_ranking", "qualname": "compute_cardinalities", "kind": "function", "doc": "

    Compute cardinalities of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tpbar: Any,\tmax_unique_hist_constraint: int) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_bounds_increment", "modulename": "outrank.core_ranking", "qualname": "compute_bounds_increment", "kind": "function", "doc": "

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnumeric_column_types: set[str]) -> dict[str, typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_batch_ranking", "modulename": "outrank.core_ranking", "qualname": "compute_batch_ranking", "kind": "function", "doc": "

    Enrich the feature space and compute the batch importances

    \n", "signature": "(\tline_tmp_storage: list[list[typing.Any]],\tnumeric_column_types: set[str],\targs: Any,\tcpu_pool: Any,\tcolumn_descriptions: list[str],\tlogger: Any,\tpbar: Any) -> tuple[outrank.core_utils.BatchRankingSummary, dict[str, typing.Any], dict[str, set[str]], dict[str, set[str]]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.get_grouped_df", "modulename": "outrank.core_ranking", "qualname": "get_grouped_df", "kind": "function", "doc": "

    A helper method that enables median-based aggregation after processing

    \n", "signature": "(\timportances_df_list: list[tuple[str, str, float]]) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.checkpoint_importances_df", "modulename": "outrank.core_ranking", "qualname": "checkpoint_importances_df", "kind": "function", "doc": "

    A helper that stores intermediate state - useful for longer runs

    \n", "signature": "(importances_batch: list[tuple[str, str, float]]) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.estimate_importances_minibatches", "modulename": "outrank.core_ranking", "qualname": "estimate_importances_minibatches", "kind": "function", "doc": "

    Interaction score estimator - suitable, for example, for CSV-like input data types.\nThis type of data is normally a single large CSV, meaning that minibatch processing needs to\nhappen during incremental handling of the file (that's not the case for pre-separated ob data)

    \n", "signature": "(\tinput_file: str,\tcolumn_descriptions: list,\tfw_col_mapping: dict[str, str],\tnumeric_column_types: set,\tbatch_size: int = 100000,\targs: Any = None,\tdata_encoding: str = 'utf-8',\tcpu_pool: Any = None,\tdelimiter: str = '\\t',\tfeature_construction_mode: bool = False,\tlogger: Any = None) -> tuple[list[dict[str, typing.Any]], typing.Any, dict[typing.Any, typing.Any], list[dict[str, typing.Any]], list[dict[str, set[str]]], collections.defaultdict[str, list[set[str]]], dict[str, typing.Any], dict[str, typing.Any], dict[str, typing.Any]]:", "funcdef": "def"}, {"fullname": "outrank.core_selftest", "modulename": "outrank.core_selftest", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_utils", "modulename": "outrank.core_utils", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_utils.pro_tips", "modulename": "outrank.core_utils", "qualname": "pro_tips", "kind": "variable", "doc": "

    \n", "default_value": "['OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"', 'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).', 'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!', 'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).', 'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)', 'Visualization part also includes clustering - this might be very insightful!', 'By default OutRank includes feature cardinality and coverage in feature names (card; cov)', 'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.', 'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).', 'Give it as many threads as physically possible (--num_threads).', 'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.', 'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).', 'Your target can be any feature! (explaining one feature with others)', 'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).', 'Each feature is named as featureName(cardinality, coverage in percents) in the final files.', 'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.']"}, {"fullname": "outrank.core_utils.write_json_dump_to_file", "modulename": "outrank.core_utils", "qualname": "write_json_dump_to_file", "kind": "function", "doc": "

    \n", "signature": "(args: Any, config_name: str) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.internal_hash", "modulename": "outrank.core_utils", "qualname": "internal_hash", "kind": "function", "doc": "

    A generic internal hash used throughout the ranking procedure - the seed is hardcoded here to guarantee reproducibility

    \n", "signature": "(input_obj: str) -> str:", "funcdef": "def"}, {"fullname": "outrank.core_utils.DatasetInformationStorage", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage", "kind": "class", "doc": "

    A generic class for holding properties of a given type of dataset

    \n"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.__init__", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdata_path: str,\tcolumn_names: list[str],\tcolumn_types: set[str],\tcol_delimiter: str | None,\tencoding: str,\tfw_map: dict[str, str] | None)"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.data_path", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.data_path", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.column_names", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_names", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.column_types", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_types", "kind": "variable", "doc": "

    \n", "annotation": ": set[str]"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.col_delimiter", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.col_delimiter", "kind": "variable", "doc": "

    \n", "annotation": ": str | None"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.encoding", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.encoding", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.fw_map", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.fw_map", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, str] | None"}, {"fullname": "outrank.core_utils.NumericFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary", "kind": "class", "doc": "

    A generic class storing numeric feature statistics

    \n"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tfeature_name: str,\tminimum: float,\tmaximum: float,\tmedian: float,\tnum_unique: int)"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.minimum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.minimum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.maximum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.maximum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.median", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.median", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, {"fullname": "outrank.core_utils.NominalFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary", "kind": "class", "doc": "

    A generic class storing nominal feature statistics

    \n"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(feature_name: str, num_unique: int)"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, {"fullname": "outrank.core_utils.BatchRankingSummary", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary", "kind": "class", "doc": "

    A generic class representing batched ranking results

    \n"}, {"fullname": "outrank.core_utils.BatchRankingSummary.__init__", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttriplet_scores: list[tuple[str, str, float]],\tstep_times: dict[str, typing.Any])"}, {"fullname": "outrank.core_utils.BatchRankingSummary.triplet_scores", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.triplet_scores", "kind": "variable", "doc": "

    \n", "annotation": ": list[tuple[str, str, float]]"}, {"fullname": "outrank.core_utils.BatchRankingSummary.step_times", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.step_times", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]"}, {"fullname": "outrank.core_utils.display_random_tip", "modulename": "outrank.core_utils", "qualname": "display_random_tip", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.get_dataset_info", "modulename": "outrank.core_utils", "qualname": "get_dataset_info", "kind": "function", "doc": "

    \n", "signature": "(args: Any):", "funcdef": "def"}, {"fullname": "outrank.core_utils.display_tool_name", "modulename": "outrank.core_utils", "qualname": "display_tool_name", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_line", "kind": "function", "doc": "

    Outbrain line parsing - generic TSVs

    \n", "signature": "(line_string: str, delimiter: str = '\\t', args: Any = None) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_line_vw", "modulename": "outrank.core_utils", "qualname": "parse_ob_line_vw", "kind": "function", "doc": "

    Parse a sparse vw line into a pandas df with a pre-defined namespace

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping=None,\ttable_header=None,\tinclude_namespace_info=False) -> list[str | None]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_csv_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_csv_line", "kind": "function", "doc": "

    Data can have commas within JSON field dumps

    \n", "signature": "(line_string: str, delimiter: str = ',', args: Any = None) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.generic_line_parser", "modulename": "outrank.core_utils", "qualname": "generic_line_parser", "kind": "function", "doc": "

    A generic method aimed at parsing data from different sources.

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping: Any = None,\ttable_header: Any = None) -> list[typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.read_reference_json", "modulename": "outrank.core_utils", "qualname": "read_reference_json", "kind": "function", "doc": "

    A helper method for reading a JSON file

    \n", "signature": "(json_path) -> dict[str, dict]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_namespace", "modulename": "outrank.core_utils", "qualname": "parse_namespace", "kind": "function", "doc": "

    Parse the feature namespace for type awareness

    \n", "signature": "(namespace_path: str) -> tuple[set[str], dict[str, str]]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.read_column_names", "modulename": "outrank.core_utils", "qualname": "read_column_names", "kind": "function", "doc": "

    Read the column header

    \n", "signature": "(mapping_file: str) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_vw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_vw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_raw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_raw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_csv_with_description_information", "modulename": "outrank.core_utils", "qualname": "parse_csv_with_description_information", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_csv_raw", "modulename": "outrank.core_utils", "qualname": "parse_csv_raw", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.extract_features_from_reference_JSON", "modulename": "outrank.core_utils", "qualname": "extract_features_from_reference_JSON", "kind": "function", "doc": "

    Given a model's JSON, extract unique features

    \n", "signature": "(\tjson_path: str,\tcombined_features_only=False,\tall_features=False) -> set[typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.summarize_feature_bounds_for_transformers", "modulename": "outrank.core_utils", "qualname": "summarize_feature_bounds_for_transformers", "kind": "function", "doc": "

    Auxiliary summarization method for generating JSON-based specs

    \n", "signature": "(\tbounds_object_storage: Any,\tfeature_types: list[str],\ttask_name: str,\tlabel_name: str,\tgranularity: int = 15,\toutput_summary_table_only: bool = False):", "funcdef": "def"}, {"fullname": "outrank.core_utils.summarize_rare_counts", "modulename": "outrank.core_utils", "qualname": "summarize_rare_counts", "kind": "function", "doc": "

    Write rare values

    \n", "signature": "(\tterm_counter: Any,\targs: Any,\tcardinality_object: Any,\tobject_info: outrank.core_utils.DatasetInformationStorage) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.is_prior_heuristic", "modulename": "outrank.core_utils", "qualname": "is_prior_heuristic", "kind": "function", "doc": "

    \n", "signature": "(args: Any) -> bool:", "funcdef": "def"}, {"fullname": "outrank.core_utils.get_num_of_instances", "modulename": "outrank.core_utils", "qualname": "get_num_of_instances", "kind": "function", "doc": "

    Count the number of lines in a file, fast - useful for progress logging

    \n", "signature": "(fname: str) -> int:", "funcdef": "def"}, {"fullname": "outrank.feature_transformations", "modulename": "outrank.feature_transformations", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault", "modulename": "outrank.feature_transformations.feature_transformer_vault", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "MINIMAL_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "DEFAULT_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "FW_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)', '_tr_fw_sqrt_res_1_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*1,0), 0))', '_tr_fw_log_res_1_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*1,0), 0))', '_tr_fw_log_res_1_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*1,0), 0))', '_tr_fw_log_res_1_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*1,0), 0))', '_tr_fw_log_res_1_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*1,0), 0))', '_tr_fw_log_res_1_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*1,0), 0))', '_tr_fw_log_res_1_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*1,0), 0))', '_tr_fw_log_res_1_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*1,0), 0))', '_tr_fw_log_res_1_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*1,0), 0))', '_tr_fw_sqrt_res_10_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*10,0), 0))', '_tr_fw_log_res_10_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*10,0), 0))', '_tr_fw_log_res_10_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*10,0), 0))', '_tr_fw_log_res_10_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*10,0), 0))', '_tr_fw_log_res_10_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*10,0), 0))', '_tr_fw_log_res_10_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*10,0), 0))', '_tr_fw_log_res_10_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*10,0), 0))', '_tr_fw_log_res_10_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*10,0), 0))', '_tr_fw_log_res_10_gt_96': 'np.where(X <96, X, 
np.where(X >96, np.round(np.log(X-96)*10,0), 0))', '_tr_fw_sqrt_res_50_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*50,0), 0))', '_tr_fw_log_res_50_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*50,0), 0))', '_tr_fw_log_res_50_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*50,0), 0))', '_tr_fw_log_res_50_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*50,0), 0))', '_tr_fw_log_res_50_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*50,0), 0))', '_tr_fw_log_res_50_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*50,0), 0))', '_tr_fw_log_res_50_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*50,0), 0))', '_tr_fw_log_res_50_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*50,0), 0))', '_tr_fw_log_res_50_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*50,0), 0))', '_tr_fw_sqrt_res_100_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*100,0), 0))', '_tr_fw_log_res_100_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*100,0), 0))', '_tr_fw_log_res_100_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*100,0), 0))', '_tr_fw_log_res_100_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*100,0), 0))', '_tr_fw_log_res_100_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*100,0), 0))', '_tr_fw_log_res_100_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*100,0), 0))', '_tr_fw_log_res_100_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*100,0), 0))', '_tr_fw_log_res_100_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*100,0), 0))', '_tr_fw_log_res_100_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*100,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.02': 
'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*1,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*10,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.02': 'np.where(X <0.02,X, 
np.where(X>0.02, np.round(np.log(X-0.02)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*50,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*100,0), 0))'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "resolution_range", "kind": "variable", "doc": "

    \n", "default_value": "[1, 10, 50, 100]"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "greater_than_range", "kind": "variable", "doc": "

    \n", "default_value": "[1, 2, 4, 8, 16, 32, 64, 96]"}, {"fullname": "outrank.feature_transformations.ranking_transformers", "modulename": "outrank.feature_transformations.ranking_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.noise_preset", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.construct_new_features", "kind": "function", "doc": "

    Generate a few standard noise distributions

    \n", "signature": "(self, dataframe: pandas.core.frame.DataFrame, label_column=None):", "funcdef": "def"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.__init__", "kind": "function", "doc": "

    \n", "signature": "(numeric_column_names: set[str], preset: str = 'default')"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.numeric_column_names", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.constructed_feature_names", "kind": "variable", "doc": "

    \n", "annotation": ": set[str]"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.max_maj_support", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.nan_prop_support", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.get_vals", "kind": "function", "doc": "

    \n", "signature": "(self, tmp_df: pandas.core.frame.DataFrame, col_name: str) -> Any:", "funcdef": "def"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_baseline_features", "kind": "function", "doc": "

    \n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_new_features", "kind": "function", "doc": "

    \n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.task_generators", "modulename": "outrank.task_generators", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_generators.logger", "modulename": "outrank.task_generators", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.task_generators.outrank_task_generate_data_set", "modulename": "outrank.task_generators", "qualname": "outrank_task_generate_data_set", "kind": "function", "doc": "

    Core method for generating data sets

    \n", "signature": "(args):", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking", "modulename": "outrank.task_instance_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_instance_ranking.shannon_ent", "modulename": "outrank.task_instance_ranking", "qualname": "shannon_ent", "kind": "function", "doc": "

    \n", "signature": "(string: str) -> float:", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking.compute_entropy_avg", "modulename": "outrank.task_instance_ranking", "qualname": "compute_entropy_avg", "kind": "function", "doc": "

    \n", "signature": "(line: list) -> float:", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking.score_line", "modulename": "outrank.task_instance_ranking", "qualname": "score_line", "kind": "function", "doc": "

    \n", "signature": "(line):", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking.outrank_task_rank_instances", "modulename": "outrank.task_instance_ranking", "qualname": "outrank_task_rank_instances", "kind": "function", "doc": "

    \n", "signature": "(args: Any) -> None:", "funcdef": "def"}, {"fullname": "outrank.task_ranking", "modulename": "outrank.task_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_ranking.outrank_task_conduct_ranking", "modulename": "outrank.task_ranking", "qualname": "outrank_task_conduct_ranking", "kind": "function", "doc": "

    \n", "signature": "(args: Any) -> None:", "funcdef": "def"}, {"fullname": "outrank.task_selftest", "modulename": "outrank.task_selftest", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_selftest.logger", "modulename": "outrank.task_selftest", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.task_selftest.conduct_self_test", "modulename": "outrank.task_selftest", "qualname": "conduct_self_test", "kind": "function", "doc": "

    \n", "signature": "():", "funcdef": "def"}, {"fullname": "outrank.task_summary", "modulename": "outrank.task_summary", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_summary.outrank_task_result_summary", "modulename": "outrank.task_summary", "qualname": "outrank_task_result_summary", "kind": "function", "doc": "

    \n", "signature": "(args):", "funcdef": "def"}, {"fullname": "outrank.task_visualization", "modulename": "outrank.task_visualization", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_visualization.outrank_task_visualize_results", "modulename": "outrank.task_visualization", "qualname": "outrank_task_visualize_results", "kind": "function", "doc": "

    \n", "signature": "(args):", "funcdef": "def"}, {"fullname": "outrank.visualizations", "modulename": "outrank.visualizations", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.visualizations.ranking_visualization", "modulename": "outrank.visualizations.ranking_visualization", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_hierarchical_clusters", "kind": "function", "doc": "

    A method for visualization of hierarchical clusters w.r.t. different linkage functions

    \n", "signature": "(\ttriplet_dataframe: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str = 'png',\tmax_num_clusters: int = 100) -> None:", "funcdef": "def"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_heatmap", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_heatmap", "kind": "function", "doc": "

    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str) -> None:", "funcdef": "def"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_barplots", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_barplots", "kind": "function", "doc": "

    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\treference_json: str,\timage_format: str,\tlabel: str,\theuristic: str) -> None:", "funcdef": "def"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_all", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_all", "kind": "function", "doc": "

    A method for visualization of the obtained feature interaction maps.

    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\tlabel: str = '',\treference_json: str = '',\timage_format: str = 'png',\theuristic: str = 'MI') -> None:", "funcdef": "def"}]; + /** pdoc search index */const docs = [{"fullname": "outrank", "modulename": "outrank", "kind": "module", "doc": "

    Welcome to OutRank's documentation!

    \n\n

    All functions/methods can be searched for (search bar on the left).

    \n\n

    This tool enables fast screening of feature-feature interactions. Its purpose is to give the user quick insight into potential redundancies/anomalies in the data.\nIt is implemented to operate in _mini batches_: it traverses the raw data incrementally, refining the rankings as it goes along. The core operation, interaction ranking, outputs triplets which look as follows:

    \n\n
    featureA    featureB    0.512\nfeatureA    featureC    0.125\n
    \n\n

    Setup

    \n\n
    \n
    pip install outrank\n
    \n
    \n\n

    and test a minimal cycle with

    \n\n
    \n
    outrank --task selftest\n
    \n
    \n\n

    If this passes, you can be fairly certain OutRank will perform as intended. OutRank's primary use case is as a CLI tool; begin exploring with

    \n\n
    \n
    outrank --help\n
    \n
    \n\n

    Example use cases

    \n\n
      \n
    • A minimal showcase of performing feature ranking on a generic CSV is demonstrated with this example.

    • \n
    • More examples demonstrating OutRank's capabilities are also available.

    • \n
    \n\n

    OutRank as a Python library

    \n\n

    Once installed, _OutRank_ can be used like any other Python library. For example, generic feature ranking algorithms can be accessed as

    \n\n
    \n
    import numpy as np\n\nfrom outrank.algorithms.feature_ranking.ranking_mi_numba import (\n    mutual_info_estimator_numba,\n)\n\n# Some synthetic minimal data (NumPy vectors)\na = np.array([1, 0, 0, 0, 1, 1, 1, 0], dtype=np.int32)\n\nlowest = np.array(np.random.permutation(a), dtype=np.int32)\nmedium = np.array([1, 1, 0, 0, 1, 1, 1, 1], dtype=np.int32)\nhigh = np.array([1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int32)\n\nlowest_score = mutual_info_estimator_numba(\n    a, lowest, np.float32(1.0), False,\n)\nmedium_score = mutual_info_estimator_numba(\n    a, medium, np.float32(1.0), False,\n)\nhigh_score = mutual_info_estimator_numba(\n    a, high, np.float32(1.0), False,\n)\n\nscores = [lowest_score, medium_score, high_score]\nsorted_score_indices = np.argsort(scores)\nassert np.sum(np.array([0, 1, 2]) - sorted_score_indices) == 0\n
    \n
    \n\n
    \n\n

    Creating a simple dataset

    \n\n
    \n
    from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification\n\ncc = CategoricalClassification()\n\n# Creates a simple dataset of 10 features, 10k samples, with feature cardinality of all features being 35\nX = cc.generate_data(10, \n                     10000, \n                     cardinality=35, \n                     ensure_rep=True, \n                     random_values=True, \n                     low=0, \n                     high=40)\n\n# Creates target labels via clustering\ny = cc.generate_labels(X, n=2, class_relation='cluster')\n
    \n
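    \n\n

    An illustrative follow-up sketch (assumes X and y from above; parameter choices here are hypothetical), using the generate_noise and downsample_dataset methods documented below:

    \n\n
    \n
    # Simulates categorical noise on 20% of entries, then rebalances the classes\nX_noisy = cc.generate_noise(X, y, p=0.2, type='categorical')\nX_bal, y_bal = cc.downsample_dataset(X_noisy, y, reshuffle=True)\n
    \n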
    \n"}, {"fullname": "outrank.algorithms", "modulename": "outrank.algorithms", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking", "modulename": "outrank.algorithms.feature_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.numba_unique", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "numba_unique", "kind": "function", "doc": "

    Identify unique elements in an array, fast

    \n", "signature": "(a):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_conditional_entropy", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_conditional_entropy", "kind": "function", "doc": "

    \n", "signature": "(\tY_classes,\tclass_values,\tclass_var_shape,\tinitial_prob,\tnonzero_counts):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.compute_entropies", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "compute_entropies", "kind": "function", "doc": "

    Core entropy computation function

    \n", "signature": "(X, Y, all_events, f_values, f_value_counts, cardinality_correction):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.stratified_subsampling", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "stratified_subsampling", "kind": "function", "doc": "

    \n", "signature": "(Y, X, approximation_factor, _f_values_X):", "funcdef": "def"}, {"fullname": "outrank.algorithms.feature_ranking.ranking_mi_numba.mutual_info_estimator_numba", "modulename": "outrank.algorithms.feature_ranking.ranking_mi_numba", "qualname": "mutual_info_estimator_numba", "kind": "function", "doc": "

    Core estimator logic. Compute unique elements, subset if required

    \n", "signature": "(Y, X, approximation_factor=1.0, cardinality_correction=False):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator", "modulename": "outrank.algorithms.importance_estimator", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.importance_estimator.logger", "modulename": "outrank.algorithms.importance_estimator", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.algorithms.importance_estimator.num_folds", "modulename": "outrank.algorithms.importance_estimator", "qualname": "num_folds", "kind": "variable", "doc": "

    \n", "default_value": "4"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_MI", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_MI", "kind": "function", "doc": "

    \n", "signature": "(vector_first: Any, vector_second: Any) -> float:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_surrogate", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_surrogate", "kind": "function", "doc": "

    \n", "signature": "(\tvector_first: Any,\tvector_second: Any,\tX: Any,\tsurrogate_model: str) -> float:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.numba_mi", "modulename": "outrank.algorithms.importance_estimator", "qualname": "numba_mi", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.sklearn_mi_adj", "modulename": "outrank.algorithms.importance_estimator", "qualname": "sklearn_mi_adj", "kind": "function", "doc": "

    \n", "signature": "(vector_first, vector_second):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_pairwise", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_pairwise", "kind": "function", "doc": "

    A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel.

    \n", "signature": "(combination, reference_model_features, args, tmp_df):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.rank_features_3MR", "modulename": "outrank.algorithms.importance_estimator", "qualname": "rank_features_3MR", "kind": "function", "doc": "

    \n", "signature": "(\trelevance_dict: dict[str, float],\tredundancy_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\trelational_dict: dict[tuple[typing.Any, typing.Any], typing.Any],\tstrategy: str = 'median',\talpha: float = 1,\tbeta: float = 1) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.get_importances_estimate_nonmyopic", "modulename": "outrank.algorithms.importance_estimator", "qualname": "get_importances_estimate_nonmyopic", "kind": "function", "doc": "

    \n", "signature": "(args: Any, tmp_df: pandas.core.frame.DataFrame):", "funcdef": "def"}, {"fullname": "outrank.algorithms.importance_estimator.initialize_classifier", "modulename": "outrank.algorithms.importance_estimator", "qualname": "initialize_classifier", "kind": "function", "doc": "

    \n", "signature": "(surrogate_model: str):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches", "modulename": "outrank.algorithms.sketches", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms", "modulename": "outrank.algorithms.sketches.counting_cms", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.cms_hash", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "cms_hash", "kind": "function", "doc": "

    \n", "signature": "(x, seed, width):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch", "kind": "class", "doc": "

    A memory-efficient implementation of the count min sketch algorithm with optimized hashing using Numba JIT.
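
    \n\n

    A minimal usage sketch (illustrative; integer items and default depth/width assumed):

    \n\n
    \n
    cms = CountMinSketch()\ncms.batch_add([3, 3, 7])  # adds 3 twice and 7 once\ncms.add(7)\ncount = cms.query(3)  # approximate count; here 2 (CMS may overestimate)\n
    \n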

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.__init__", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.__init__", "kind": "function", "doc": "

    \n", "signature": "(depth=6, width=32768, M=None)"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.depth", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.depth", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.width", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.width", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.hash_seeds", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.hash_seeds", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.M", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.M", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.add", "kind": "function", "doc": "

    \n", "signature": "(self, x, delta=1):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.batch_add", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst, delta=1):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.query", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.query", "kind": "function", "doc": "

    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_cms.CountMinSketch.get_matrix", "modulename": "outrank.algorithms.sketches.counting_cms", "qualname": "CountMinSketch.get_matrix", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter", "kind": "class", "doc": "

    A memory-efficient primitive counter, constrained by a pre-specified upper bound on counts.

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.__init__", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.__init__", "kind": "function", "doc": "

    \n", "signature": "(bound: int = 30000)"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.max_bound_thr", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.max_bound_thr", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.default_counter", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.default_counter", "kind": "variable", "doc": "

    \n", "annotation": ": collections.Counter"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.batch_add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.batch_add", "kind": "function", "doc": "

    \n", "signature": "(self, lst):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_counters_ordinary.PrimitiveConstrainedCounter.add", "modulename": "outrank.algorithms.sketches.counting_counters_ordinary", "qualname": "PrimitiveConstrainedCounter.add", "kind": "function", "doc": "

    \n", "signature": "(self, val):", "funcdef": "def"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "kind": "module", "doc": "

    This module implements a probabilistic data structure that can estimate the cardinality of large multisets in a single pass using little auxiliary memory

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.__init__", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.__init__", "kind": "function", "doc": "

    \n", "signature": "(error_rate=0.005)"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.p", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.p", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.m", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.m", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_set", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_set", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.warmup_size", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.warmup_size", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.width", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.width", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.hll_flag", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.hll_flag", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.sketches.counting_ultiloglog.HyperLogLogWCache.add", "modulename": "outrank.algorithms.sketches.counting_ultiloglog", "qualname": "HyperLogLogWCache.add", "kind": "function", "doc": "

    \n", "signature": "(self, value):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators", "modulename": "outrank.algorithms.synthetic_data_generators", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.__init__", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.__init__", "kind": "function", "doc": "

    \n", "signature": "(seed: int = 42)"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.dataset_info", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.dataset_info", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_data", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_data", "kind": "function", "doc": "

    Generates a dataset based on the given parameters

    \n\n
    Parameters
    \n\n
      \n
    • n_features: number of generated features
    • \n
    • n_samples: number of generated samples
    • \n
    • cardinality: default cardinality of the dataset
    • \n
    • structure: structure of the dataset
    • \n
    • ensure_rep: flag, ensures all given values are represented
    • \n
    • random_values: flag, enables random (integer) feature values from the interval [low, high]
    • \n
    • low: sets the lower bound of random feature values
    • \n
    • high: sets the upper bound of random feature values
    • \n
    • seed: sets seed of numpy random
    • \n
    \n\n
    Returns
    \n\n
    \n

    X, 2D dataset

    \n
    \n", "signature": "(\tself,\tn_features: int,\tn_samples: int,\tcardinality: int = 5,\tstructure: Union[list, numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]], NoneType] = None,\tensure_rep: bool = False,\trandom_values: bool | None = False,\tlow: int | None = 0,\thigh: int | None = 1000,\tseed: int = 42) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_combinations", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_combinations", "kind": "function", "doc": "

    Generates linear, nonlinear, or custom combinations of feature vectors in the given dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indices of the features to combine
    • \n
    • combination_function: optional custom function for combining feature vectors
    • \n
    • combination_type: string flag, either linear or nonlinear, defining the combination type
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with added resultant feature

    \n
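    \n\n
    Example
    \n\n

    A minimal usage sketch (illustrative; assumes cc is a CategoricalClassification instance and X comes from generate_data; the indices are hypothetical choices):

    \n\n
    \n
    # Appends a nonlinear combination of features 0, 1 and 2\nX = cc.generate_combinations(X, [0, 1, 2], combination_type='nonlinear')\n
    \n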
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tcombination_function: Optional = None,\tcombination_type: Literal['linear', 'nonlinear'] = 'linear') -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_correlated", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_correlated", "kind": "function", "doc": "

    Generates correlated features using the given feature indices. Correlation is based on the cosine of the angle between vectors with mean 0.

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indices of features to generate correlated features for
    • \n
    • r: (Pearson) correlation factor
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with generated correlated features

    \n
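    \n\n
    Example
    \n\n

    A minimal usage sketch (illustrative; feature index 0 is a hypothetical choice):

    \n\n
    \n
    # Appends a new feature correlated (r=0.8) with feature 0\nX = cc.generate_correlated(X, [0], r=0.8)\n
    \n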
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tr: float = 0.8) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_duplicates", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_duplicates", "kind": "function", "doc": "

    Generates duplicate features

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • feature_indices: indices of features to duplicate
    • \n
    \n\n
    Returns
    \n\n
    \n

    dataset with duplicated features

    \n
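    \n\n
    Example
    \n\n

    A minimal usage sketch (illustrative; indices 0 and 3 are hypothetical choices):

    \n\n
    \n
    # Appends exact copies of features 0 and 3\nX = cc.generate_duplicates(X, [0, 3])\n
    \n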
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tfeature_indices: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]]) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_labels", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_labels", "kind": "function", "doc": "

    Generates labels for dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • n: number of class labels
    • \n
    • p: class distribution
    • \n
    • k: constant
    • \n
    • decision_function: optional user-defined decision function
    • \n
    • class_relation: string, either 'linear', 'nonlinear', or 'cluster'
    • \n
    • balance: boolean, whether to balance clustering class labels
    • \n
    • random_state: seed for KMeans clustering, defaults to 42
    • \n
    \n\n
    Returns
    \n\n
    \n

    array of labels, corresponding to dataset X

    \n
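    \n\n
    Example
    \n\n

    A minimal usage sketch (illustrative; X as produced by generate_data):

    \n\n
    \n
    # Three classes, nonlinear relation between features and labels\ny = cc.generate_labels(X, n=3, class_relation='nonlinear')\n
    \n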
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tn: int = 2,\tp: Union[float, list[float], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]] = 0.5,\tk: int | float = 2,\tdecision_function: Optional = None,\tclass_relation: Literal['linear', 'nonlinear', 'cluster'] = 'linear',\tbalance: bool = False,\trandom_state: int = 42):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.generate_noise", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.generate_noise", "kind": "function", "doc": "

    Simulates noise on given dataset X

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset to apply noise to
    • \n
    • y: required target labels for categorical noise generation
    • \n
    • p: amount of noise to apply. Defaults to 0.2
    • \n
    • type: type of noise to apply, either categorical or missing
    • \n
    • missing_val: value to simulate missing values. Defaults to float('-inf')
    • \n
    \n\n
    Returns
    \n\n
    \n

    X with noise applied

    \n
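    \n\n
    Example
    \n\n

    A minimal usage sketch (illustrative; X and y as produced by the generators above):

    \n\n
    \n
    # Simulates 10% missing values, marked with the default missing_val\nX_missing = cc.generate_noise(X, y, p=0.1, type='missing')\n
    \n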
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tp: float = 0.2,\ttype: Literal['categorical', 'missing'] = 'categorical',\tmissing_val: str | int | float = -inf) -> numpy.ndarray:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.downsample_dataset", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.downsample_dataset", "kind": "function", "doc": "

    Downsamples dataset X according to N or the number of samples in the minority class, resulting in a balanced dataset.

    \n\n
    Parameters
    \n\n
      \n
    • X: Dataset to downsample
    • \n
    • y: Labels corresponding to X
    • \n
    • N: Optional number of samples per class to downsample to
    • \n
    • seed: Seed for random state of resample function
    • \n
    • reshuffle: Reshuffle the dataset after downsampling
    • \n
    \n\n
    Returns
    \n\n
    \n

    Balanced X and y after downsampling

    \n
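    \n\n
    Example
    \n\n

    A minimal usage sketch (illustrative; N=500 is a hypothetical target size per class):

    \n\n
    \n
    # Downsamples to 500 samples per class and reshuffles the result\nX_bal, y_bal = cc.downsample_dataset(X, y, N=500, reshuffle=True)\n
    \n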
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[list[int], numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\tN: int | None = None,\tseed: int = 42,\treshuffle: bool = False) -> tuple[numpy.ndarray, numpy.ndarray]:", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.cc_generator.CategoricalClassification.print_dataset", "modulename": "outrank.algorithms.synthetic_data_generators.cc_generator", "qualname": "CategoricalClassification.print_dataset", "kind": "function", "doc": "

    Prints given dataset

    \n\n
    Parameters
    \n\n
      \n
    • X: dataset
    • \n
    • y: labels
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\tX: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]],\ty: Union[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]], numpy._typing._nested_sequence._NestedSequence[numpy._typing._array_like._SupportsArray[numpy.dtype[Any]]], bool, int, float, complex, str, bytes, numpy._typing._nested_sequence._NestedSequence[Union[bool, int, float, complex, str, bytes]]]):", "funcdef": "def"}, {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.algorithms.synthetic_data_generators.generator_naive.generate_random_matrix", "modulename": "outrank.algorithms.synthetic_data_generators.generator_naive", "qualname": "generate_random_matrix", "kind": "function", "doc": "

    \n", "signature": "(num_features=100, size=20000):", "funcdef": "def"}, {"fullname": "outrank.core_ranking", "modulename": "outrank.core_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_ranking.logger", "modulename": "outrank.core_ranking", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.core_ranking.GLOBAL_CARDINALITY_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_CARDINALITY_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, {"fullname": "outrank.core_ranking.GLOBAL_COUNTS_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_COUNTS_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, typing.Any]", "default_value": "{}"}, {"fullname": "outrank.core_ranking.GLOBAL_RARE_VALUE_STORAGE", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_RARE_VALUE_STORAGE", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]", "default_value": "Counter()"}, {"fullname": "outrank.core_ranking.GLOBAL_PRIOR_COMB_COUNTS", "modulename": "outrank.core_ranking", "qualname": "GLOBAL_PRIOR_COMB_COUNTS", "kind": "variable", "doc": "

    \n", "annotation": ": dict[typing.Any, int]", "default_value": "Counter()"}, {"fullname": "outrank.core_ranking.IGNORED_VALUES", "modulename": "outrank.core_ranking", "qualname": "IGNORED_VALUES", "kind": "variable", "doc": "

    \n", "default_value": "set()"}, {"fullname": "outrank.core_ranking.HYPERLL_ERROR_BOUND", "modulename": "outrank.core_ranking", "qualname": "HYPERLL_ERROR_BOUND", "kind": "variable", "doc": "

    \n", "default_value": "0.02"}, {"fullname": "outrank.core_ranking.MAX_FEATURES_3MR", "modulename": "outrank.core_ranking", "qualname": "MAX_FEATURES_3MR", "kind": "variable", "doc": "

    \n", "default_value": "10000"}, {"fullname": "outrank.core_ranking.prior_combinations_sample", "modulename": "outrank.core_ranking", "qualname": "prior_combinations_sample", "kind": "function", "doc": "

    Ensure that only the relevant subspace of combinations is selected, based on prior counts

    \n", "signature": "(\tcombinations: list[tuple[typing.Any, ...]],\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.get_combinations_from_columns", "modulename": "outrank.core_ranking", "qualname": "get_combinations_from_columns", "kind": "function", "doc": "

    Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope

    \n", "signature": "(\tall_columns: pandas.core.indexes.base.Index,\targs: Any) -> list[tuple[typing.Any, ...]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.mixed_rank_graph", "modulename": "outrank.core_ranking", "qualname": "mixed_rank_graph", "kind": "function", "doc": "

    Compute the full mixed rank graph corresponding to all pairwise feature interactions based on the selected heuristic

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tcpu_pool: Any,\tpbar: Any) -> outrank.core_utils.BatchRankingSummary:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.enrich_with_transformations", "modulename": "outrank.core_ranking", "qualname": "enrich_with_transformations", "kind": "function", "doc": "

    Construct a collection of new features based on pre-defined transformations/rules

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnum_col_types: set[str],\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_combined_features", "modulename": "outrank.core_ranking", "qualname": "compute_combined_features", "kind": "function", "doc": "

    Compute higher order features via xxhash-based trick.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any,\tpbar: Any,\tis_3mr: bool = False) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_expanded_multivalue_features", "modulename": "outrank.core_ranking", "qualname": "compute_expanded_multivalue_features", "kind": "function", "doc": "

    Compute a one-hot encoded feature space based on each designated multivalue feature. E.g., a feature with value \"a,b,c\" becomes three features whose values indicate the presence of a given value in the multivalue feature of choice.

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_subfeatures", "modulename": "outrank.core_ranking", "qualname": "compute_subfeatures", "kind": "function", "doc": "

    Compute derived features that are more fine-grained. Implements logic around two operators that govern feature construction.\n->: One-sided construction - every value from the left-side feature is taken as-is, while separate features are considered for values of the right-side feature.\n<->: Two-sided construction - each value from feature a is combined with each value from feature b, forming |A|*|B| new features (one-hot encoded)

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any,\tpbar: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.include_noisy_features", "modulename": "outrank.core_ranking", "qualname": "include_noisy_features", "kind": "function", "doc": "

    Add randomized features that serve as a sanity check

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tlogger: Any,\targs: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_coverage", "modulename": "outrank.core_ranking", "qualname": "compute_coverage", "kind": "function", "doc": "

    Compute coverage of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_feature_memory_consumption", "modulename": "outrank.core_ranking", "qualname": "compute_feature_memory_consumption", "kind": "function", "doc": "

    An approximation of how much memory features take up

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\targs: Any) -> dict[str, set[str]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_value_counts", "modulename": "outrank.core_ranking", "qualname": "compute_value_counts", "kind": "function", "doc": "

    Update the count structure

    \n", "signature": "(input_dataframe: pandas.core.frame.DataFrame, args: Any):", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_cardinalities", "modulename": "outrank.core_ranking", "qualname": "compute_cardinalities", "kind": "function", "doc": "

    Compute cardinalities of features, incrementally

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tpbar: Any,\tmax_unique_hist_constraint: int) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_bounds_increment", "modulename": "outrank.core_ranking", "qualname": "compute_bounds_increment", "kind": "function", "doc": "

    \n", "signature": "(\tinput_dataframe: pandas.core.frame.DataFrame,\tnumeric_column_types: set[str]) -> dict[str, typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.compute_batch_ranking", "modulename": "outrank.core_ranking", "qualname": "compute_batch_ranking", "kind": "function", "doc": "

    Enrich the feature space and compute the batch importances

    \n", "signature": "(\tline_tmp_storage: list[list[typing.Any]],\tnumeric_column_types: set[str],\targs: Any,\tcpu_pool: Any,\tcolumn_descriptions: list[str],\tlogger: Any,\tpbar: Any) -> tuple[outrank.core_utils.BatchRankingSummary, dict[str, typing.Any], dict[str, set[str]], dict[str, set[str]]]:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.get_grouped_df", "modulename": "outrank.core_ranking", "qualname": "get_grouped_df", "kind": "function", "doc": "

    A helper method that enables median-based aggregation after processing

    \n", "signature": "(\timportances_df_list: list[tuple[str, str, float]]) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.checkpoint_importances_df", "modulename": "outrank.core_ranking", "qualname": "checkpoint_importances_df", "kind": "function", "doc": "

    A helper that stores intermediate state - useful for longer runs

    \n", "signature": "(importances_batch: list[tuple[str, str, float]]) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_ranking.estimate_importances_minibatches", "modulename": "outrank.core_ranking", "qualname": "estimate_importances_minibatches", "kind": "function", "doc": "

    Interaction score estimator - suitable, for example, for CSV-like input data types.\nThis type of data is normally a single large CSV, meaning that minibatch processing needs to\nhappen during incremental handling of the file (that's not the case for pre-separated ob data)

    \n", "signature": "(\tinput_file: str,\tcolumn_descriptions: list,\tfw_col_mapping: dict[str, str],\tnumeric_column_types: set,\tbatch_size: int = 100000,\targs: Any = None,\tdata_encoding: str = 'utf-8',\tcpu_pool: Any = None,\tdelimiter: str = '\\t',\tfeature_construction_mode: bool = False,\tlogger: Any = None) -> tuple[list[dict[str, typing.Any]], typing.Any, dict[typing.Any, typing.Any], list[dict[str, typing.Any]], list[dict[str, set[str]]], collections.defaultdict[str, list[set[str]]], dict[str, typing.Any], dict[str, typing.Any], dict[str, typing.Any]]:", "funcdef": "def"}, {"fullname": "outrank.core_selftest", "modulename": "outrank.core_selftest", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_utils", "modulename": "outrank.core_utils", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.core_utils.pro_tips", "modulename": "outrank.core_utils", "qualname": "pro_tips", "kind": "variable", "doc": "

    \n", "default_value": "['OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"', 'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).', 'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!', 'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).', 'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)', 'Visualization part also includes clustering - this might be very insightful!', 'By default OutRank includes feature cardinality and coverage in feature names (card; cov)', 'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.', 'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).', 'Give it as many threads as physically possible (--num_threads).', 'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.', 'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).', 'Your target can be any feature! (explaining one feature with others)', 'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).', 'Each feature is named as featureName(cardinality, coverage in percents) in the final files.', 'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.']"}, {"fullname": "outrank.core_utils.write_json_dump_to_file", "modulename": "outrank.core_utils", "qualname": "write_json_dump_to_file", "kind": "function", "doc": "

    \n", "signature": "(args: Any, config_name: str) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.internal_hash", "modulename": "outrank.core_utils", "qualname": "internal_hash", "kind": "function", "doc": "

    A generic internal hash used throughout the ranking procedure - the seed is deliberately hardcoded here

    \n", "signature": "(input_obj: str) -> str:", "funcdef": "def"}, {"fullname": "outrank.core_utils.DatasetInformationStorage", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage", "kind": "class", "doc": "

    A generic class for holding properties of a given type of dataset

    \n"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.__init__", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdata_path: str,\tcolumn_names: list[str],\tcolumn_types: set[str],\tcol_delimiter: str | None,\tencoding: str,\tfw_map: dict[str, str] | None)"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.data_path", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.data_path", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.column_names", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_names", "kind": "variable", "doc": "

    \n", "annotation": ": list[str]"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.column_types", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.column_types", "kind": "variable", "doc": "

    \n", "annotation": ": set[str]"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.col_delimiter", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.col_delimiter", "kind": "variable", "doc": "

    \n", "annotation": ": str | None"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.encoding", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.encoding", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.DatasetInformationStorage.fw_map", "modulename": "outrank.core_utils", "qualname": "DatasetInformationStorage.fw_map", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, str] | None"}, {"fullname": "outrank.core_utils.NumericFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary", "kind": "class", "doc": "

    A generic class storing numeric feature statistics

    \n"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tfeature_name: str,\tminimum: float,\tmaximum: float,\tmedian: float,\tnum_unique: int)"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.minimum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.minimum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.maximum", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.maximum", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.median", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.median", "kind": "variable", "doc": "

    \n", "annotation": ": float"}, {"fullname": "outrank.core_utils.NumericFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NumericFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, {"fullname": "outrank.core_utils.NominalFeatureSummary", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary", "kind": "class", "doc": "

    A generic class storing nominal feature statistics

    \n"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.__init__", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(feature_name: str, num_unique: int)"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.feature_name", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.feature_name", "kind": "variable", "doc": "

    \n", "annotation": ": str"}, {"fullname": "outrank.core_utils.NominalFeatureSummary.num_unique", "modulename": "outrank.core_utils", "qualname": "NominalFeatureSummary.num_unique", "kind": "variable", "doc": "

    \n", "annotation": ": int"}, {"fullname": "outrank.core_utils.BatchRankingSummary", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary", "kind": "class", "doc": "

    A generic class representing batched ranking results

    \n"}, {"fullname": "outrank.core_utils.BatchRankingSummary.__init__", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttriplet_scores: list[tuple[str, str, float]],\tstep_times: dict[str, typing.Any])"}, {"fullname": "outrank.core_utils.BatchRankingSummary.triplet_scores", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.triplet_scores", "kind": "variable", "doc": "

    \n", "annotation": ": list[tuple[str, str, float]]"}, {"fullname": "outrank.core_utils.BatchRankingSummary.step_times", "modulename": "outrank.core_utils", "qualname": "BatchRankingSummary.step_times", "kind": "variable", "doc": "

    \n", "annotation": ": dict[str, typing.Any]"}, {"fullname": "outrank.core_utils.display_random_tip", "modulename": "outrank.core_utils", "qualname": "display_random_tip", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.get_dataset_info", "modulename": "outrank.core_utils", "qualname": "get_dataset_info", "kind": "function", "doc": "

    \n", "signature": "(args: Any):", "funcdef": "def"}, {"fullname": "outrank.core_utils.display_tool_name", "modulename": "outrank.core_utils", "qualname": "display_tool_name", "kind": "function", "doc": "

    \n", "signature": "() -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_line", "kind": "function", "doc": "

    Outbrain line parsing - generic TSVs

    \n", "signature": "(line_string: str, delimiter: str = '\\t', args: Any = None) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_line_vw", "modulename": "outrank.core_utils", "qualname": "parse_ob_line_vw", "kind": "function", "doc": "

    Parse a sparse vw line into a pandas df with pre-defined namespace

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping=None,\ttable_header=None,\tinclude_namespace_info=False) -> list[str | None]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_csv_line", "modulename": "outrank.core_utils", "qualname": "parse_ob_csv_line", "kind": "function", "doc": "

    Data can have commas within JSON field dumps

    \n", "signature": "(line_string: str, delimiter: str = ',', args: Any = None) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.generic_line_parser", "modulename": "outrank.core_utils", "qualname": "generic_line_parser", "kind": "function", "doc": "

    A generic method aimed at parsing data from different sources.

    \n", "signature": "(\tline_string: str,\tdelimiter: str,\targs: Any = None,\tfw_col_mapping: Any = None,\ttable_header: Any = None) -> list[typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.read_reference_json", "modulename": "outrank.core_utils", "qualname": "read_reference_json", "kind": "function", "doc": "

    A helper method for reading a JSON file

    \n", "signature": "(json_path) -> dict[str, dict]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_namespace", "modulename": "outrank.core_utils", "qualname": "parse_namespace", "kind": "function", "doc": "

    Parse the feature namespace for type awareness

    \n", "signature": "(namespace_path: str) -> tuple[set[str], dict[str, str]]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.read_column_names", "modulename": "outrank.core_utils", "qualname": "read_column_names", "kind": "function", "doc": "

    Read the column header

    \n", "signature": "(mapping_file: str) -> list[str]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_vw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_vw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_raw_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_raw_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_ob_feature_information", "modulename": "outrank.core_utils", "qualname": "parse_ob_feature_information", "kind": "function", "doc": "

    A generic parser of ob-based data

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_csv_with_description_information", "modulename": "outrank.core_utils", "qualname": "parse_csv_with_description_information", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.parse_csv_raw", "modulename": "outrank.core_utils", "qualname": "parse_csv_raw", "kind": "function", "doc": "

    \n", "signature": "(data_path) -> outrank.core_utils.DatasetInformationStorage:", "funcdef": "def"}, {"fullname": "outrank.core_utils.extract_features_from_reference_JSON", "modulename": "outrank.core_utils", "qualname": "extract_features_from_reference_JSON", "kind": "function", "doc": "

    Given a model's JSON, extract unique features

    \n", "signature": "(\tjson_path: str,\tcombined_features_only=False,\tall_features=False) -> set[typing.Any]:", "funcdef": "def"}, {"fullname": "outrank.core_utils.summarize_feature_bounds_for_transformers", "modulename": "outrank.core_utils", "qualname": "summarize_feature_bounds_for_transformers", "kind": "function", "doc": "

    Summarization auxiliary method for generating JSON-based specs

    \n", "signature": "(\tbounds_object_storage: Any,\tfeature_types: list[str],\ttask_name: str,\tlabel_name: str,\tgranularity: int = 15,\toutput_summary_table_only: bool = False):", "funcdef": "def"}, {"fullname": "outrank.core_utils.summarize_rare_counts", "modulename": "outrank.core_utils", "qualname": "summarize_rare_counts", "kind": "function", "doc": "

    Write rare values

    \n", "signature": "(\tterm_counter: Any,\targs: Any,\tcardinality_object: Any,\tobject_info: outrank.core_utils.DatasetInformationStorage) -> None:", "funcdef": "def"}, {"fullname": "outrank.core_utils.is_prior_heuristic", "modulename": "outrank.core_utils", "qualname": "is_prior_heuristic", "kind": "function", "doc": "

    \n", "signature": "(args: Any) -> bool:", "funcdef": "def"}, {"fullname": "outrank.core_utils.get_num_of_instances", "modulename": "outrank.core_utils", "qualname": "get_num_of_instances", "kind": "function", "doc": "

    Count the number of lines in a file, fast - useful for progress logging

    \n", "signature": "(fname: str) -> int:", "funcdef": "def"}, {"fullname": "outrank.feature_transformations", "modulename": "outrank.feature_transformations", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault", "modulename": "outrank.feature_transformations.feature_transformer_vault", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.MINIMAL_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "MINIMAL_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.default_transformers.DEFAULT_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.default_transformers", "qualname": "DEFAULT_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.FW_TRANSFORMERS", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "FW_TRANSFORMERS", "kind": "variable", "doc": "

    \n", "default_value": "{'_tr_sqrt': 'np.sqrt(X)', '_tr_log(x+1)': 'np.log(X + 1)', '_tr_sqrt(abs(x))': 'np.sqrt(np.abs(X))', '_tr_log(abs(x)+1)': 'np.log(np.abs(X) + 1)', '_tr_div(x,abs(x))*log(abs(x))': 'np.divide(X, np.abs(X)) * np.log(np.abs(X))', '_tr_log(x + sqrt(pow(x,2), 1)': 'np.log(X + np.sqrt(np.power(X, 2) + 1))', '_tr_log*sqrt': 'np.log(X + 1) * np.sqrt(X)', '_tr_log*100': 'np.round(np.log(X + 1) * 100, 0)', '_tr_nonzero': 'np.where(X != 0, 1, 0)', '_tr_round(div(x,max))': 'np.round(np.divide(X, np.max(X)), 0)', '_tr_fw_sqrt_res_1_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*1,0), 0))', '_tr_fw_log_res_1_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*1,0), 0))', '_tr_fw_log_res_1_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*1,0), 0))', '_tr_fw_log_res_1_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*1,0), 0))', '_tr_fw_log_res_1_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*1,0), 0))', '_tr_fw_log_res_1_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*1,0), 0))', '_tr_fw_log_res_1_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*1,0), 0))', '_tr_fw_log_res_1_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*1,0), 0))', '_tr_fw_sqrt_res_1_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*1,0), 0))', '_tr_fw_log_res_1_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*1,0), 0))', '_tr_fw_sqrt_res_10_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*10,0), 0))', '_tr_fw_log_res_10_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*10,0), 0))', '_tr_fw_log_res_10_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*10,0), 0))', '_tr_fw_log_res_10_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*10,0), 0))', '_tr_fw_log_res_10_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*10,0), 0))', '_tr_fw_log_res_10_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*10,0), 0))', '_tr_fw_log_res_10_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*10,0), 0))', '_tr_fw_log_res_10_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*10,0), 0))', '_tr_fw_sqrt_res_10_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*10,0), 0))', '_tr_fw_log_res_10_gt_96': 'np.where(X <96, X, 
np.where(X >96, np.round(np.log(X-96)*10,0), 0))', '_tr_fw_sqrt_res_50_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*50,0), 0))', '_tr_fw_log_res_50_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*50,0), 0))', '_tr_fw_log_res_50_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*50,0), 0))', '_tr_fw_log_res_50_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*50,0), 0))', '_tr_fw_log_res_50_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*50,0), 0))', '_tr_fw_log_res_50_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*50,0), 0))', '_tr_fw_log_res_50_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*50,0), 0))', '_tr_fw_log_res_50_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*50,0), 0))', '_tr_fw_sqrt_res_50_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*50,0), 0))', '_tr_fw_log_res_50_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*50,0), 0))', '_tr_fw_sqrt_res_100_gt_1': 'np.where(X < 1, X, np.where(X>1 ,np.round(np.sqrt(X-1)*100,0), 0))', '_tr_fw_log_res_100_gt_1': 'np.where(X <1, X, np.where(X >1, np.round(np.log(X-1)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_2': 'np.where(X < 2, X, np.where(X>2 ,np.round(np.sqrt(X-2)*100,0), 0))', '_tr_fw_log_res_100_gt_2': 'np.where(X <2, X, np.where(X >2, np.round(np.log(X-2)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_4': 'np.where(X < 4, X, np.where(X>4 ,np.round(np.sqrt(X-4)*100,0), 0))', '_tr_fw_log_res_100_gt_4': 'np.where(X <4, X, np.where(X >4, np.round(np.log(X-4)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_8': 'np.where(X < 8, X, np.where(X>8 ,np.round(np.sqrt(X-8)*100,0), 0))', '_tr_fw_log_res_100_gt_8': 'np.where(X <8, X, np.where(X >8, np.round(np.log(X-8)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_16': 'np.where(X < 16, X, np.where(X>16 ,np.round(np.sqrt(X-16)*100,0), 0))', '_tr_fw_log_res_100_gt_16': 'np.where(X <16, X, np.where(X >16, np.round(np.log(X-16)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_32': 'np.where(X < 32, X, np.where(X>32 ,np.round(np.sqrt(X-32)*100,0), 0))', '_tr_fw_log_res_100_gt_32': 'np.where(X <32, X, np.where(X >32, np.round(np.log(X-32)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_64': 'np.where(X < 64, X, np.where(X>64 ,np.round(np.sqrt(X-64)*100,0), 0))', '_tr_fw_log_res_100_gt_64': 'np.where(X <64, X, np.where(X >64, np.round(np.log(X-64)*100,0), 0))', '_tr_fw_sqrt_res_100_gt_96': 'np.where(X < 96, X, np.where(X>96 ,np.round(np.sqrt(X-96)*100,0), 0))', '_tr_fw_log_res_100_gt_96': 'np.where(X <96, X, np.where(X >96, np.round(np.log(X-96)*100,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.02': 
'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*1,0), 0))', '_tr_fw_prob_sqrt_res_1_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*1,0), 0))', '_tr_fw_prob_log_res_1_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*1,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*10,0), 0))', '_tr_fw_prob_sqrt_res_10_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*10,0), 0))', '_tr_fw_prob_log_res_10_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*10,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.02': 'np.where(X <0.02,X, 
np.where(X>0.02, np.round(np.log(X-0.02)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*50,0), 0))', '_tr_fw_prob_sqrt_res_50_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*50,0), 0))', '_tr_fw_prob_log_res_50_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*50,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.01': 'np.where(X < 0.01, X, np.where(X>0.01, np.round(np.sqrt(X-0.01)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.01': 'np.where(X <0.01,X, np.where(X>0.01, np.round(np.log(X-0.01)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.02': 'np.where(X < 0.02, X, np.where(X>0.02, np.round(np.sqrt(X-0.02)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.02': 'np.where(X <0.02,X, np.where(X>0.02, np.round(np.log(X-0.02)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.04': 'np.where(X < 0.04, X, np.where(X>0.04, np.round(np.sqrt(X-0.04)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.04': 'np.where(X <0.04,X, np.where(X>0.04, np.round(np.log(X-0.04)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.08': 'np.where(X < 0.08, X, np.where(X>0.08, np.round(np.sqrt(X-0.08)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.08': 'np.where(X <0.08,X, np.where(X>0.08, np.round(np.log(X-0.08)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.16': 'np.where(X < 0.16, X, np.where(X>0.16, np.round(np.sqrt(X-0.16)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.16': 'np.where(X <0.16,X, np.where(X>0.16, np.round(np.log(X-0.16)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.32': 'np.where(X < 0.32, X, np.where(X>0.32, np.round(np.sqrt(X-0.32)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.32': 'np.where(X <0.32,X, np.where(X>0.32, np.round(np.log(X-0.32)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.64': 'np.where(X < 0.64, X, np.where(X>0.64, np.round(np.sqrt(X-0.64)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.64': 'np.where(X <0.64,X, np.where(X>0.64, np.round(np.log(X-0.64)*100,0), 0))', '_tr_fw_prob_sqrt_res_100_gt_0.96': 'np.where(X < 0.96, X, np.where(X>0.96, np.round(np.sqrt(X-0.96)*100,0), 0))', '_tr_fw_prob_log_res_100_gt_0.96': 'np.where(X <0.96,X, np.where(X>0.96, np.round(np.log(X-0.96)*100,0), 0))'}"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.resolution_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "resolution_range", "kind": "variable", "doc": "

    \n", "default_value": "[1, 10, 50, 100]"}, {"fullname": "outrank.feature_transformations.feature_transformer_vault.fw_transformers.greater_than_range", "modulename": "outrank.feature_transformations.feature_transformer_vault.fw_transformers", "qualname": "greater_than_range", "kind": "variable", "doc": "

    \n", "default_value": "[1, 2, 4, 8, 16, 32, 64, 96]"}, {"fullname": "outrank.feature_transformations.ranking_transformers", "modulename": "outrank.feature_transformations.ranking_transformers", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.noise_preset", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.noise_preset", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerNoise.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerNoise.construct_new_features", "kind": "function", "doc": "

    Generate a few standard noise distributions

    \n", "signature": "(self, dataframe: pandas.core.frame.DataFrame, label_column=None):", "funcdef": "def"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric", "kind": "class", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.__init__", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.__init__", "kind": "function", "doc": "

    \n", "signature": "(numeric_column_names: set[str], preset: str = 'default')"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.numeric_column_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.numeric_column_names", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.constructed_feature_names", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.constructed_feature_names", "kind": "variable", "doc": "

    \n", "annotation": ": set[str]"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.max_maj_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.max_maj_support", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.nan_prop_support", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.nan_prop_support", "kind": "variable", "doc": "

    \n"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.get_vals", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.get_vals", "kind": "function", "doc": "

    \n", "signature": "(self, tmp_df: pandas.core.frame.DataFrame, col_name: str) -> Any:", "funcdef": "def"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_baseline_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_baseline_features", "kind": "function", "doc": "

    \n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.feature_transformations.ranking_transformers.FeatureTransformerGeneric.construct_new_features", "modulename": "outrank.feature_transformations.ranking_transformers", "qualname": "FeatureTransformerGeneric.construct_new_features", "kind": "function", "doc": "

    \n", "signature": "(self, dataframe: Any) -> pandas.core.frame.DataFrame:", "funcdef": "def"}, {"fullname": "outrank.task_generators", "modulename": "outrank.task_generators", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_generators.logger", "modulename": "outrank.task_generators", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.task_generators.outrank_task_generate_data_set", "modulename": "outrank.task_generators", "qualname": "outrank_task_generate_data_set", "kind": "function", "doc": "

    Core method for generating data sets

    \n", "signature": "(args):", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking", "modulename": "outrank.task_instance_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_instance_ranking.shannon_ent", "modulename": "outrank.task_instance_ranking", "qualname": "shannon_ent", "kind": "function", "doc": "

    \n", "signature": "(string: str) -> float:", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking.compute_entropy_avg", "modulename": "outrank.task_instance_ranking", "qualname": "compute_entropy_avg", "kind": "function", "doc": "

    \n", "signature": "(line: list) -> float:", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking.score_line", "modulename": "outrank.task_instance_ranking", "qualname": "score_line", "kind": "function", "doc": "

    \n", "signature": "(line):", "funcdef": "def"}, {"fullname": "outrank.task_instance_ranking.outrank_task_rank_instances", "modulename": "outrank.task_instance_ranking", "qualname": "outrank_task_rank_instances", "kind": "function", "doc": "

    \n", "signature": "(args: Any) -> None:", "funcdef": "def"}, {"fullname": "outrank.task_ranking", "modulename": "outrank.task_ranking", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_ranking.outrank_task_conduct_ranking", "modulename": "outrank.task_ranking", "qualname": "outrank_task_conduct_ranking", "kind": "function", "doc": "

    \n", "signature": "(args: Any) -> None:", "funcdef": "def"}, {"fullname": "outrank.task_selftest", "modulename": "outrank.task_selftest", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_selftest.logger", "modulename": "outrank.task_selftest", "qualname": "logger", "kind": "variable", "doc": "

    \n", "default_value": "<Logger syn-logger (DEBUG)>"}, {"fullname": "outrank.task_selftest.conduct_self_test", "modulename": "outrank.task_selftest", "qualname": "conduct_self_test", "kind": "function", "doc": "

    \n", "signature": "():", "funcdef": "def"}, {"fullname": "outrank.task_summary", "modulename": "outrank.task_summary", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_summary.outrank_task_result_summary", "modulename": "outrank.task_summary", "qualname": "outrank_task_result_summary", "kind": "function", "doc": "

    \n", "signature": "(args):", "funcdef": "def"}, {"fullname": "outrank.task_visualization", "modulename": "outrank.task_visualization", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.task_visualization.outrank_task_visualize_results", "modulename": "outrank.task_visualization", "qualname": "outrank_task_visualize_results", "kind": "function", "doc": "

    \n", "signature": "(args):", "funcdef": "def"}, {"fullname": "outrank.visualizations", "modulename": "outrank.visualizations", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.visualizations.ranking_visualization", "modulename": "outrank.visualizations.ranking_visualization", "kind": "module", "doc": "

    \n"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_hierarchical_clusters", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_hierarchical_clusters", "kind": "function", "doc": "

    A method for visualization of hierarchical clusters w.r.t. different linkage functions

    \n", "signature": "(\ttriplet_dataframe: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str = 'png',\tmax_num_clusters: int = 100) -> None:", "funcdef": "def"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_heatmap", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_heatmap", "kind": "function", "doc": "

    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\timage_format: str) -> None:", "funcdef": "def"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_barplots", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_barplots", "kind": "function", "doc": "

    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\treference_json: str,\timage_format: str,\tlabel: str,\theuristic: str) -> None:", "funcdef": "def"}, {"fullname": "outrank.visualizations.ranking_visualization.visualize_all", "modulename": "outrank.visualizations.ranking_visualization", "qualname": "visualize_all", "kind": "function", "doc": "

    A method for visualization of the obtained feature interaction maps.

    \n", "signature": "(\ttriplets: pandas.core.frame.DataFrame,\toutput_folder: str,\tlabel: str = '',\treference_json: str = '',\timage_format: str = 'png',\theuristic: str = 'MI') -> None:", "funcdef": "def"}]; // mirrored in build-search-index.js (part 1) // Also split on html tags. this is a cheap heuristic, but good enough. From cad09f1ba888d01c6c81d750c9a54e2917843bc6 Mon Sep 17 00:00:00 2001 From: 98MM <47939788+98MM@users.noreply.github.com> Date: Mon, 15 Jul 2024 11:57:46 +0200 Subject: [PATCH 9/9] fixed conflicting file --- .../synthetic_data_generators/cc_generator.py | 101 +++++++----------- 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py index f370580..1084021 100644 --- a/outrank/algorithms/synthetic_data_generators/cc_generator.py +++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py @@ -16,13 +16,14 @@ class CategoricalClassification: - def __init__(self): + def __init__(self, seed: int = 42): + np.random.seed(seed) self.dataset_info = { 'general': {}, 'combinations': [], 'correlations': [], 'duplicates': [], - 'labels': [], + 'labels': {}, 'noise': [], } @@ -70,8 +71,8 @@ def generate_data( np.random.seed(seed) X = np.empty([n_features, n_samples]) + # No specific structure parameter passed if structure is None: - # No specific structure parameter passed for i in range(n_features): x = self._generate_feature( n_samples, @@ -82,16 +83,17 @@ def generate_data( high=high, ) X[i] = x + # Structure parameter passed, building based on structure else: - # Structure parameter passed, building based on structure ix = 0 for data in structure: + + # Data in structure is a tuple of (feature index (integer), feature attributes) if not isinstance(data[0], (list, np.ndarray)): - # Data in structure is a tuple of (feature index (integer), feature attributes) feature_ix, feature_attributes = data + # Filling out the dataset up to column index feature_ix if ix < feature_ix: - # Filling out the dataset up to column index feature_ix for i in range(ix, feature_ix): x = self._generate_feature( n_samples, @@ -115,12 +117,12 @@ def generate_data( X[ix] = x ix += 1 + # Data in structure is a tuple of (list of feature indexes, feature attributes) else: - # Data in structure is a tuple of (list of feature indexes, feature attributes) feature_ixs, feature_attributes = data + # Filling out the dataset up to feature_ix for feature_ix in feature_ixs: - # Filling out the dataset up to feature_ix if ix < feature_ix: for i in range(ix, feature_ix): x = self._generate_feature( @@ -146,8 +148,8 @@ def generate_data( X[ix] = x ix += 1 + # Fill out the rest of the dataset if ix < n_features: - # Fill out the rest of the dataset for i in range(ix, n_features): x = self._generate_feature( n_samples, @@ -182,9 +184,9 @@ def _configure_generate_feature( :return: feature vector """ + # feature_cardinality is just an integer, generate feature either with random values or + # [low, low+cardinality] if not isinstance(feature_attributes, (list, np.ndarray)): - # feature_cardinality is just an integer, generate feature either with random values or - # [low, low+cardinality] x = self._generate_feature( n_samples, cardinality=feature_attributes, @@ -193,8 +195,8 @@ def _configure_generate_feature( low=low, high=high, ) + # feature_cardinality is a list of [value_domain, value_frequencies] else: - # feature_cardinality is a list of [value_domain, value_frequencies] if 
@@ -182,9 +184,9 @@ def _configure_generate_feature(
         :return: feature vector
         """

+        # feature_cardinality is just an integer, generate feature either with random values or
+        # [low, low+cardinality]
         if not isinstance(feature_attributes, (list, np.ndarray)):
-            # feature_cardinality is just an integer, generate feature either with random values or
-            # [low, low+cardinality]
             x = self._generate_feature(
                 n_samples,
                 cardinality=feature_attributes,
                 ensure_rep=ensure_rep,
                 low=low,
                 high=high,
             )
+        # feature_cardinality is a list of [value_domain, value_frequencies]
         else:
-            # feature_cardinality is a list of [value_domain, value_frequencies]
             if isinstance(feature_attributes[0], (list, np.ndarray)):
                 value_domain, value_frequencies = feature_attributes
                 x = self._generate_feature(
@@ -203,8 +205,8 @@ def _configure_generate_feature(
                     ensure_rep=ensure_rep,
                     p=value_frequencies,
                 )
+            # feature_cardinality is value_domain (list of values for feature)
             else:
-                # feature_cardinality is value_domain (list of values for feature)
                 value_domain = feature_attributes
                 x = self._generate_feature(
                     n_samples,
@@ -268,7 +270,7 @@ def generate_combinations(
         X: ArrayLike,
         feature_indices: list[int] | ArrayLike,
         combination_function: Optional = None,
-        combination_type: Literal = 'linear',
+        combination_type: Literal['linear', 'nonlinear'] = 'linear',
     ) -> np.ndarray:
         """
         Generates linear, nonlinear, or custom combinations within feature vectors in given dataset X
@@ -436,8 +438,9 @@ def generate_labels(
         p: float | list[float] | ArrayLike = 0.5,
         k: int | float = 2,
         decision_function: Optional = None,
-        class_relation: str = 'linear',
+        class_relation: Literal['linear', 'nonlinear', 'cluster'] = 'linear',
         balance: bool = False,
+        random_state: int = 42,
     ):
         """
         Generates labels for dataset X
@@ -448,6 +451,7 @@ def generate_labels(
         :param decision_function: optional user-defined decision function
         :param class_relation: string, either 'linear', 'nonlinear', or 'cluster'
         :param balance: boolean, whether to balance clustering class labels
+        :param random_state: seed for KMeans clustering, defaults to 42
         :return: array of labels, corresponding to dataset X
         """
@@ -513,7 +517,7 @@ def generate_labels(
                 p = 1.0
             else:
                 p = [p, 1 - p]
-            y = self._cluster_data(X, n, p=p, balance=balance)
+            y = self._cluster_data(X, n, p=p, balance=balance, random_state=random_state)

         self.dataset_info.update({
             'labels': {
@@ -530,6 +534,7 @@ def _cluster_data(
         n: int,
         p: float | list[float] | ArrayLike | None = 1.0,
         balance: bool = False,
+        random_state: int = 42,
     ) -> np.ndarray:
         """
         Cluster data using kmeans
         :param X: dataset to cluster
         :param n: number of clusters
         :param p: class distribution
         :param balance: balance the clusters according to p
+        :param random_state: seed for KMeans clustering, defaults to 42
         :return: array of labels, corresponding to dataset X
         """

-        kmeans = KMeans(n_clusters=n)
+        kmeans = KMeans(n_clusters=n, random_state=random_state)

         kmeans.fit(X)
         cluster_labels = kmeans.labels_

-        if not isinstance(p, (list, np.ndarray)):  # Fully balanced clusters
+        # Fully balanced clusters
+        if not isinstance(p, (list, np.ndarray)):
             samples_per_cluster = [len(X) // n] * n
         else:
             samples = len(X)
@@ -572,10 +579,11 @@ def _cluster_data(
                 adjustment = samples_per_cluster[i] - cluster_size
                 adjustments.append(adjustment)

-                if adjustment < 0:  # Cluter is too large
-
+                # Cluster is too large
+                if adjustment < 0:
                     centroid = kmeans.cluster_centers_[i]
-                    dataset_indices = np.where(cluster_labels == i)[0]  # Indices of samples in dataset
+                    # Indices of samples in dataset
+                    dataset_indices = np.where(cluster_labels == i)[0]
                     cluster_samples = np.copy(X[dataset_indices])

                     distances = np.linalg.norm(
@@ -625,7 +633,7 @@ def generate_noise(
         X: ArrayLike,
         y: list[int] | ArrayLike,
         p: float = 0.2,
-        type: Literal = 'categorical',
+        type: Literal['categorical', 'missing'] = 'categorical',
         missing_val: str | int | float = float('-inf'),
     ) -> np.ndarray:

@@ -714,6 +722,9 @@ def generate_noise(

             return Xn_T.T

+        else:
+            raise ValueError(f'Type {type} not supported')
+
     def downsample_dataset(
         self,
         X: ArrayLike,
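The labelling and noise hunks above thread a KMeans seed through the call chain and tighten the Literal annotations. A sketch of how the patched calls might compose, continuing the example above; values are illustrative, and treating n as the number of class labels is an assumption inferred from the self._cluster_data(X, n, ...) call:

    # Cluster-derived labels with an uneven three-way class split, seeded so
    # the KMeans step is reproducible across runs.
    y = cc.generate_labels(
        X,
        n=3,
        p=[0.5, 0.3, 0.2],
        class_relation='cluster',
        balance=True,
        random_state=42,
    )

    # Missing-value noise: overwrite roughly 20% of entries with the sentinel;
    # any other type string now raises the newly added ValueError.
    X_noisy = cc.generate_noise(X, y, p=0.2, type='missing', missing_val=float('-inf'))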
@@ -799,47 +810,7 @@ def print_dataset(
             print(f'], Label: {y[n]}')
             n += 1

+    """
     def summarize(self):
-
-        print(f"Number of features: {self.dataset_info['general']['n_features']}")
-        print(f"Number of generated samples: {self.dataset_info['general']['n_samples']}")
-        if self.dataset_info['downsampling']:
-            print(
-                f"Dataset downsampled from shape {self.dataset_info['downsampling']['original_shape']}, to shape {self.dataset_info['downsampling']['downsampled_shape']}",
-            )
-        print(f"Number of classes: {self.dataset_info['labels']['n_class']}")
-        print(f"Class relation: {self.dataset_info['labels']['class_relation']}")
-
-        print('-------------------------------------')
-
-        if len(self.dataset_info['combinations']) > 0:
-            print('Combinations:')
-            for comb in self.dataset_info['combinations']:
-                print(
-                    f"Features {comb['feature_indices']} are in {comb['combination_type']} combination, result in {comb['combination_ix']}",
-                )
-            print('-------------------------------------')
-
-        if len(self.dataset_info['correlations']) > 0:
-            print('Correlations:')
-            for corr in self.dataset_info['correlations']:
-                print(
-                    f"Features {corr['feature_indices']} are correlated to {corr['correlated_indices']} with a factor of {corr['correlation_factor']}",
-                )
-            print('-------------------------------------')
-
-        if len(self.dataset_info['duplicates']) > 0:
-            print('Duplicates:')
-            for dup in self.dataset_info['duplicates']:
-                print(
-                    f"Features {dup['feature_indices']} are duplicated, duplicate indexes are {dup['duplicate_indices']}",
-                )
-            print('-------------------------------------')
-
-        if len(self.dataset_info['noise']) > 0:
-            print('Simulated noise:')
-            for noise in self.dataset_info['noise']:
-                print(f"Simulated {noise['type']} noise, amount of {noise['noise_amount']}")
-            print('-------------------------------------')
-
-        print("\nFor more information on dataset structure, print cc.dataset_info['general']['structure']")
+        # TODO: Logging function
+    """
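With summarize() stubbed out above pending the logging rewrite, the bookkeeping it used to print remains available on the instance, so the same metadata can be inspected directly (sketch, continuing the example above):

    # dataset_info is populated as features, labels, and noise are generated.
    print(cc.dataset_info['general'])
    print(cc.dataset_info['labels'])
    print(cc.dataset_info['general']['structure'])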