From 1a2370a1ab8a68ba08f87a73de0a9bdc120884d9 Mon Sep 17 00:00:00 2001
From: bskrlj
Date: Fri, 18 Oct 2024 14:32:59 +0200
Subject: [PATCH] some more refactorings

---
 outrank/algorithms/importance_estimator.py | 203 +++++++--------------
 outrank/core_utils.py                      |  30 +--
 outrank/task_instance_ranking.py           |  94 +++++-----
 outrank/task_summary.py                    |  72 ++++----
 4 files changed, 148 insertions(+), 251 deletions(-)

diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index 68fe5ab..1e8028f 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -1,18 +1,15 @@
-# A module for pairwise computation of importances -- entrypoint for the core ranking algorighm(s)
 from __future__ import annotations

 import logging
 import operator
 import traceback
-from typing import Any
-from typing import Dict
+from typing import Any, Dict, List, Tuple

 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression
-from sklearn.linear_model import SGDClassifier
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import OneHotEncoder
@@ -28,35 +25,24 @@
+logger = logging.getLogger(__name__)
+
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba

     numba_available = True
-
-except Exception as es:
-    traceback.print_exc(0)
+except ImportError:
+    traceback.print_exc()
     numba_available = False

-
-def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
-    estimate_feature_importance = mutual_info_classif(
-        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
+def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    return mutual_info_classif(
+        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
     )[0]
-    return estimate_feature_importance
-

 def sklearn_surrogate(
-    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
+    vector_first: np.ndarray, vector_second: np.ndarray, X: np.ndarray, surrogate_model: str,
 ) -> float:
     clf = initialize_classifier(surrogate_model)
     transf = OneHotEncoder()
-    # They do not commute, swap if needed
-    if len(np.unique(vector_second) > 2):
-        vector_third = vector_second
-        vector_second = vector_first
-        vector_first = vector_third
-        del vector_third
+    # The arguments do not commute; make the lower-cardinality vector the target.
+    if len(np.unique(vector_second)) > 2:
+        vector_first, vector_second = vector_second, vector_first

     if X.size <= 1:
         X = vector_first.reshape(-1, 1)
@@ -64,164 +50,97 @@ def sklearn_surrogate(
         X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)

     X = transf.fit_transform(X)
-    estimate_feature_importance_list = cross_val_score(
-        clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
-    )
-    estimate_feature_importance = 1 + \
-        np.median(estimate_feature_importance_list)
-
-    return estimate_feature_importance
-
-
-def numba_mi(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio):
-    if heuristic == 'MI-numba-randomized':
-        cardinality_correction = True
-
-    else:
-        cardinality_correction = False
-
-    estimate_feature_importance = ranking_mi_numba.mutual_info_estimator_numba(
-        vector_first.reshape(-1).astype(np.int32),
-        vector_second.reshape(-1).astype(np.int32),
+    scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
+    return 1 + np.median(scores)
+
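+# Numba-accelerated mutual information estimate; the 'MI-numba-randomized'
+# heuristic additionally applies a cardinality correction on top of the
+# stratified sampling approximation.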
+def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
+    cardinality_correction = heuristic == 'MI-numba-randomized'
+    return ranking_mi_numba.mutual_info_estimator_numba(
+        vector_first.astype(np.int32),
+        vector_second.astype(np.int32),
         approximation_factor=np.float32(mi_stratified_sampling_ratio),
         cardinality_correction=cardinality_correction,
     )
-    return estimate_feature_importance
-
-
-def sklearn_mi_adj(vector_first, vector_second):
-    # AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
-    estimate_feature_importance = adjusted_mutual_info_score(
-        vector_first.reshape(-1), vector_second.reshape(-1),
-    )
-    return estimate_feature_importance
-
-
-def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
-    """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
+def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    # AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
+    return adjusted_mutual_info_score(vector_first, vector_second)

-    feature_one = combination[0]
-    feature_two = combination[1]
+def get_importances_estimate_pairwise(combination: Tuple[str, str], reference_model_features: List[str], args: Any, tmp_df: pd.DataFrame) -> Tuple[str, str, float]:
+    """Estimate the importance of a single feature pair; pairs are scored independently, so calls can run in parallel."""
+    feature_one, feature_two = combination

-    if feature_one not in tmp_df.columns:
-        logging.info(f'{feature_one} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
-        return [feature_one, feature_two, 0]
-    elif feature_two not in tmp_df.columns:
-        logging.info(f'{feature_two} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
-        return [feature_one, feature_two, 0]
+    if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns:
+        logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
+        return feature_one, feature_two, 0.0

-    vector_first = tmp_df[[feature_one]].values.ravel()
-    vector_second = tmp_df[[feature_two]].values.ravel()
+    vector_first = tmp_df[feature_one].values
+    vector_second = tmp_df[feature_two].values

-    if len(vector_first) == 0 or len(vector_second) == 0:
-        return [feature_one, feature_two, 0]
+    if vector_first.size == 0 or vector_second.size == 0:
+        return feature_one, feature_two, 0.0

-    # Compute score based on the selected heuristic.
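+    # Compute the score with the heuristic selected via args.heuristic.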
     if args.heuristic == 'MI':
-        # Compute the infoGain
-        estimate_feature_importance = sklearn_MI(vector_first, vector_second)
-
+        score = sklearn_MI(vector_first, vector_second)
     elif 'surrogate-' in args.heuristic:
-        X = np.array(float)
-        if is_prior_heuristic(args) and (len(reference_model_features) > 0):
-            X = tmp_df[reference_model_features].values
-
-        estimate_feature_importance = sklearn_surrogate(
-            vector_first, vector_second, X, args.heuristic,
-        )
+        if is_prior_heuristic(args) and reference_model_features:
+            X = tmp_df[reference_model_features].values
+        else:
+            X = np.array([])
+        score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic)
     elif 'max-value-coverage' in args.heuristic:
-        estimate_feature_importance = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
-
+        score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
     elif 'MI-numba' in args.heuristic:
-        estimate_feature_importance = numba_mi(
-            vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio,
-        )
-
+        score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio)
     elif args.heuristic == 'AMI':
-        estimate_feature_importance = sklearn_mi_adj(
-            vector_first, vector_second,
-        )
-
+        score = sklearn_mi_adj(vector_first, vector_second)
     elif args.heuristic == 'correlation-Pearson':
-        estimate_feature_importance = pearsonr(vector_first, vector_second)[0]
-
+        score = pearsonr(vector_first, vector_second)[0]
     elif args.heuristic == 'Constant':
-        estimate_feature_importance = 0.0
-
+        score = 0.0
     else:
-        raise ValueError(
-            'Please select one of the possible heuristics (MI, chi2)',
-        )
+        raise ValueError(
+            f'Unsupported heuristic {args.heuristic}; expected one of MI, AMI, MI-numba-*, surrogate-*, max-value-coverage, correlation-Pearson, Constant.',
+        )

-    return (feature_one, feature_two, estimate_feature_importance)
+    return feature_one, feature_two, score

 def rank_features_3MR(
-    relevance_dict: dict[str, float],
-    redundancy_dict: dict[tuple[Any, Any], Any],
-    relational_dict: dict[tuple[Any, Any], Any],
+    relevance_dict: Dict[str, float],
+    redundancy_dict: Dict[Tuple[Any, Any], Any],
+    relational_dict: Dict[Tuple[Any, Any], Any],
     strategy: str = 'median',
-    alpha: float = 1,
-    beta: float = 1,
+    alpha: float = 1.0,
+    beta: float = 1.0,
 ) -> pd.DataFrame:
-    all_features = relevance_dict.keys()
-    most_important_feature = max(
-        relevance_dict.items(), key=operator.itemgetter(1),
-    )[0]
+    all_features = set(relevance_dict.keys())
+    most_important_feature = max(relevance_dict.items(), key=operator.itemgetter(1))[0]
     ranked_features = [most_important_feature]

-    def calc_higher_order(feature, is_redundancy=True):
+    def calc_higher_order(feature: str, is_redundancy: bool = True) -> float:
         values = []
         for feat in ranked_features:
             interaction_tuple = (feat, feature)
             if is_redundancy:
-                if interaction_tuple in redundancy_dict:
-                    values.append(redundancy_dict[interaction_tuple])
-                else:
-                    logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
+                values.append(redundancy_dict.get(interaction_tuple, 0))
             else:
-                if interaction_tuple in relational_dict:
-                    values.append(relational_dict[interaction_tuple])
-                else:
-                    logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
-
-        if strategy == 'sum':
-            return sum(values)
-        if strategy == 'mean':
-            return np.mean(values)
-        return np.median(values)
-
-    while len(ranked_features) != len(all_features):
-        top_importance = 0
-        most_important_feature = ''
-
-        for ind, feat in enumerate(set(all_features) - set(ranked_features)):
+            values.append(relational_dict.get(interaction_tuple, 0))
+        if strategy == 'sum':
+            return sum(values)
+        if strategy == 'mean':
+            return float(np.mean(values))
+        return float(np.median(values))
+
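+    # Greedy 3MR selection: each step adds the candidate feature maximizing
+    # relevance - alpha * redundancy + beta * relation.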
+    while len(ranked_features) < len(all_features):
+        top_importance = -np.inf
+        most_important_feature = None
+
+        for feat in all_features - set(ranked_features):
             feature_redundancy = calc_higher_order(feat)
             feature_relation = calc_higher_order(feat, False)
             feature_relevance = relevance_dict[feat]
-            importance = (
-                feature_relevance - alpha * feature_redundancy + beta * feature_relation
-            )
+            importance = feature_relevance - alpha * feature_redundancy + beta * feature_relation

-            if (importance > top_importance) or (ind == 0):
+            if importance > top_importance:
                 top_importance = importance
                 most_important_feature = feat
+
         ranked_features.append(most_important_feature)
-    return pd.DataFrame(
-        {
-            'Feature': ranked_features,
-            '3mr_ranking': list(range(1, len(ranked_features) + 1)),
-        },
-    )
+
+    return pd.DataFrame({'Feature': ranked_features, '3MR_Ranking': range(1, len(ranked_features) + 1)})

 def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
-    # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
-    # TODO - this is to be executed directly on df - no need for parallel kernel(s)
     pass

-
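+# Map a surrogate-model identifier (e.g. 'surrogate-LR') to a concrete scikit-learn classifier.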
 def initialize_classifier(surrogate_model: str):
     if 'surrogate-LR' in surrogate_model:
         return LogisticRegression(max_iter=100000)
@@ -230,5 +149,5 @@ def initialize_classifier(surrogate_model: str):
     elif 'surrogate-SGD' in surrogate_model:
         return SGDClassifier(max_iter=100000, loss='log_loss')
     else:
-        logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
+        logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
         return SGDClassifier(max_iter=100000, loss='log_loss')
diff --git a/outrank/core_utils.py b/outrank/core_utils.py
index 1ccbe6e..1be4993 100644
--- a/outrank/core_utils.py
+++ b/outrank/core_utils.py
@@ -23,22 +23,22 @@
 logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

 pro_tips = [
-    'OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"',
-    'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).',
-    'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!',
-    'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).',
-    'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)',
-    'Visualization part also includes clustering - this might be very insightful!',
-    'By default OutRank includes feature cardinality and coverage in feature names (card; cov)',
-    'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.',
-    'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).',
-    'Give it as many threads as physically possible (--num_threads).',
-    'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.',
-    'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).',
-    'Your target can be any feature! (explaining one feature with others)',
-    'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).',
+    'OutRank can construct subfeatures based on subspaces. Use: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e".',
+    'Heuristic MI-numba-randomized offers a great balance of speed and performance.',
+    'Heuristic surrogate-lr performs internal cross-validation; keep that in mind.',
+    'Consider running OutRank on a smaller sample first; it might be sufficient (--subsampling = a lot).',
+    'OutRank supports two types of combinations: unsupervised pairwise ranking (--target_ranking_only=False) and supervised combinations (--interaction_order > 1).',
+    'The visualization includes clustering, which can be very insightful!',
+    'By default, OutRank includes feature cardinality and coverage in feature names (e.g., card; cov).',
+    'Intermediary checkpoints (tmp_checkpoint.tsv) can provide insights during longer runs.',
+    'You can rank redundancies of combined features using --interaction_order and --target_ranking_only=False.',
+    'Use as many threads as possible for better performance (--num_threads).',
+    'Speed up ranking by reducing the feature buffer size (--combination_number_upper_bound) and using --subsampling together.',
+    'Not sure which feature transformations to choose? Use --transformers=default for a solid baseline (includes common DS transformations).',
+    'Your target can be any feature, allowing you to explain one feature with others.',
+    'OutRank uses HyperLogLog for cardinality estimation, useful for understanding cardinalities across datasets.',
     'Each feature is named as featureName(cardinality, coverage in percents) in the final files.',
-    'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.',
+    'Generate candidate feature transformation ranges using --task=feature_summary_transformers.',
 ]
diff --git a/outrank/task_instance_ranking.py b/outrank/task_instance_ranking.py
index 2e40244..6555068 100644
--- a/outrank/task_instance_ranking.py
+++ b/outrank/task_instance_ranking.py
@@ -2,74 +2,61 @@

 import gzip
 import os
-from collections import Counter
-from collections import defaultdict
+from collections import Counter, defaultdict
 from typing import Any

 import numpy as np
 import pandas as pd
 import tqdm

-from outrank.core_utils import generic_line_parser
-from outrank.core_utils import get_dataset_info
-from outrank.core_utils import get_num_of_instances
+from outrank.core_utils import generic_line_parser, get_dataset_info, get_num_of_instances

 try:
     import matplotlib.pyplot as plt
-except:
-    pass
+except ImportError:
+    plt = None

-
-def shannon_ent(string: str) -> float:
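+# Character-level Shannon entropy of a single field: H = -sum(p * log2(p)).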
+def shannon_entropy(string: str) -> float:
     counts = Counter(string)
-    frequencies = ((i / len(string)) for i in counts.values())
-    return -np.sum(f * np.log2(f) for f in frequencies)
-
-
-def compute_entropy_avg(line: list) -> float:
-    joint_ent = 0
-    for field in line:
-        joint_ent += shannon_ent(field)
-    return joint_ent
-
+    frequencies = (i / len(string) for i in counts.values())
+    return -sum(f * np.log2(f) for f in frequencies)
+
+def compute_total_entropy(line: list[str]) -> float:
+    # Sum of per-field entropies; reported as 'row_entropy' in score_line.
+    return sum(shannon_entropy(field) for field in line)
+
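+# Per-line quality profile: proportions of empty, zero, and overly long fields, plus row entropy.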
+def score_line(line: list[str]) -> dict[str, float]:
+    total_fields = len(line)
+    out_struct = {
+        'empty_string_prop': line.count('') / total_fields,
+        'empty_dict': line.count('{}') / total_fields,
+        'all_empty': (line.count('{}') + line.count('')) / total_fields,
+        'all_zero': line.count('0') / total_fields,
+        'row_entropy': compute_total_entropy(line),
+    }

-def score_line(line):
-    nan_prop = line.count('') / len(line)
-    out_struct = {}
-    out_struct['empty_string_prop'] = nan_prop
-    out_struct['empty_dict'] = line.count('{}') / len(line)
-    out_struct['all_empty'] = (line.count('{}') + line.count('')) / len(line)
-    out_struct['all_zero'] = line.count('0') / len(line)
     for j in [30, 60, 100, 200, 300]:
-        out_struct[f'all_more_{j}_chars'] = len(
-            [x for x in line if len(x) > j], ) / len(line)
-    out_struct['row_entropy'] = compute_entropy_avg(line)
-    return out_struct
+        out_struct[f'all_more_{j}_chars'] = sum(len(x) > j for x in line) / total_fields
+    return out_struct

 def outrank_task_rank_instances(args: Any) -> None:
-
+    dataset_info = get_dataset_info(args)
+    data_path = dataset_info.data_path
     data_encoding = 'utf-8'
     delimiter = '\t'
-    dataset_info = get_dataset_info(args)
-    local_pbar = tqdm.tqdm(
-        total=get_num_of_instances(dataset_info.data_path) - 1,
-        position=0,
-        disable=args.disable_tqdm == 'True',
-    )
+
+    total_lines = get_num_of_instances(data_path) - 1
+    local_pbar = tqdm.tqdm(total=total_lines, position=0, disable=args.disable_tqdm == 'True')
     local_pbar.set_description('Starting ranking computation')

-    _, file_extension = os.path.splitext(dataset_info.data_path)
+    _, file_extension = os.path.splitext(data_path)
+    if file_extension == '.gz':
+        file_stream = gzip.open(data_path, 'rt', encoding=data_encoding)
+    else:
+        file_stream = open(data_path, encoding=data_encoding)

-    if file_extension == '.gz':
-        file_stream = gzip.open(
-            dataset_info.data_path,
-            'rt',
-            encoding=data_encoding,
-        )
-
-    else:
-        file_stream = open(dataset_info.data_path, encoding=data_encoding)

     line_counter = 0
     out_scores_lab = defaultdict(list)
@@ -89,26 +76,27 @@ def outrank_task_rank_instances(args: Any) -> None:
             break
         out_scores_lab[line[0]].append(score_line(parsed_line))

+    file_stream.close()
+
+    os.makedirs(args.output_folder, exist_ok=True)
+    # plt is None when matplotlib could not be imported; fail early with a clear message.
+    if plt is None:
+        raise RuntimeError('matplotlib is required to plot instance-ranking distributions.')
     for label, out_scores in out_scores_lab.items():
         out_df = pd.DataFrame(out_scores)
-        os.makedirs(args.output_folder, exist_ok=True)
         for col in out_df.columns:
             sorted_vals = out_df[col].sort_values()
             plt.figure(figsize=(5, 5), dpi=300)
-            plt.title(col + f' label: {label}')
+            plt.title(f'{col} label: {label}')
             plt.hist(
                 x=sorted_vals * 100,
                 color='black',
                 density=True,
                 bins=100,
             )
-            if 'entropy' not in col:
-                plt.xlabel('Proportion of namespaces (%)')
-            else:
-                plt.xlabel('Row entropy')
+            plt.xlabel('Proportion of namespaces (%)' if 'entropy' not in col else 'Row entropy')
             plt.ylabel('Density')
             plt.tight_layout()
             fname = f'distPlot{col}_{label}.pdf'
             plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
             plt.cla()
             plt.clf()
+
+    local_pbar.close()
diff --git a/outrank/task_summary.py b/outrank/task_summary.py
index 458c9b3..bebf5e7 100644
--- a/outrank/task_summary.py
+++ b/outrank/task_summary.py
@@ -11,67 +11,57 @@

 def outrank_task_result_summary(args):
-    triplets = pd.read_csv(
-        os.path.join(args.output_folder, 'pairwise_ranks.tsv'), sep='\t',
-    )
-    triplets = triplets.sort_values(by=['Score'], ascending=False)
+    triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv')
+    triplets = pd.read_csv(triplets_path, sep='\t')
+    triplets = triplets.sort_values(by='Score', ascending=False)
+
     final_ranking = []
-    for enx, row in triplets.iterrows():
-        final_row = None
-        if args.label_column == row['FeatureA'].split('-')[0]:
-            final_row = [row['FeatureB'], row['Score']]
-        if args.label_column == row['FeatureB'].split('-')[0]:
-            final_row = [row['FeatureA'], row['Score']]
-        if final_row and args.label_column != final_row[0]:
-            final_ranking.append(final_row)
+    for _, row in triplets.iterrows():
+        feature_a, feature_b = row['FeatureA'], row['FeatureB']
+        score = row['Score']
+        # Skip pairs where the other member is the label column itself.
+        if args.label_column == feature_a.split('-')[0] and feature_b != args.label_column:
+            final_ranking.append([feature_b, score])
+        elif args.label_column == feature_b.split('-')[0] and feature_a != args.label_column:
+            final_ranking.append([feature_a, score])

-    final_df = pd.DataFrame(final_ranking)
-    final_df.columns = ['Feature', f'Score {args.heuristic}']
-    final_df.index = np.arange(1, final_df.shape[0] + 1, 1)
+    final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {args.heuristic}'])

     final_df = (
-        final_df.groupby(by=['Feature'])
+        final_df.groupby('Feature')
         .median()
         .reset_index()
-        .sort_values(by=[f'Score {args.heuristic}'], ascending=False)
+        .sort_values(by=f'Score {args.heuristic}', ascending=False)
     )
-    min_score = np.min(final_df[f'Score {args.heuristic}'].values)
-    max_score = np.max(final_df[f'Score {args.heuristic}'].values)
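+    # MI-family scores are min-max normalized to [0, 1] before reporting.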
     if "MI" in args.heuristic:
-        final_df[f'Score {args.heuristic}'] = (
-            final_df[f'Score {args.heuristic}'] - min_score
-        ) / (max_score - min_score)
+        min_score = final_df[f'Score {args.heuristic}'].min()
+        max_score = final_df[f'Score {args.heuristic}'].max()
+        final_df[f'Score {args.heuristic}'] = (final_df[f'Score {args.heuristic}'] - min_score) / (max_score - min_score)

     logging.info(f'Storing summary files to {args.output_folder}')
     pd.set_option('display.max_rows', None, 'display.max_columns', None)
+
     singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
-    final_df = final_df.reset_index(drop=True)
-    final_df.to_csv(singles_path, sep='\t')
+    final_df.to_csv(singles_path, sep='\t', index=False)

     if args.interaction_order > 1:
         feature_store = defaultdict(list)
-        for enx, row in final_df.iterrows():
+        for _, row in final_df.iterrows():
             fname = row['Feature']
             score = row[f'Score {args.heuristic}']
             if 'AND' in fname:
                 for el in fname.split('-')[0].split(' AND '):
                     feature_store[el].append(score)

-        final_aggregate_df = []
-        for k, v in feature_store.items():
-            final_aggregate_df.append(
-                {
-                    'Feature': k,
-                    f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(
-                        v,
-                    ),
-                },
-            )
-        final_aggregate_df = pd.DataFrame(final_aggregate_df)
+        final_aggregate_df = pd.DataFrame([
+            {
+                'Feature': k,
+                f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(v),
+            }
+            for k, v in feature_store.items()
+        ])
         final_aggregate_df.to_csv(
-            os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t',
+            os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False,
         )

-    final_df = final_df[final_df['Feature'].str.contains('_tr_')]
-    final_df.to_csv(
-        singles_path.replace('.tsv', '_transformers_only_imp.tsv'), sep='\t',
-    )
+    transformers_only_path = singles_path.replace('.tsv', '_transformers_only_imp.tsv')
+    final_df[final_df['Feature'].str.contains('_tr_')].to_csv(transformers_only_path, sep='\t', index=False)