From 7f40ea6923c464600a10493b29c871d886509e2e Mon Sep 17 00:00:00 2001 From: bskrlj Date: Wed, 23 Oct 2024 11:22:33 +0200 Subject: [PATCH 01/15] tldr mode --- outrank/__main__.py | 7 ++ outrank/task_summary.py | 72 ++++++++++++++----- .../visualizations/ranking_visualization.py | 14 ++-- 3 files changed, 68 insertions(+), 25 deletions(-) diff --git a/outrank/__main__.py b/outrank/__main__.py index f261dc5..3ad89d7 100644 --- a/outrank/__main__.py +++ b/outrank/__main__.py @@ -204,6 +204,13 @@ def main(): help='Relevant for task data_generator -- how many features.', ) + parser.add_argument( + '--tldr', + type=str, + default='True', + help='If enabled, it will output some of the main results on the screen after finishing.', + ) + parser.add_argument( '--num_synthetic_rows', type=int, diff --git a/outrank/task_summary.py b/outrank/task_summary.py index bebf5e7..2fbc6e9 100644 --- a/outrank/task_summary.py +++ b/outrank/task_summary.py @@ -3,6 +3,8 @@ import logging import os from collections import defaultdict +from typing import Any +from typing import List import numpy as np import pandas as pd @@ -10,44 +12,62 @@ logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) -def outrank_task_result_summary(args): - triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv') +def read_and_sort_triplets(triplets_path: str) -> pd.DataFrame: + """Read triplets from a file and sort by the 'Score' column.""" triplets = pd.read_csv(triplets_path, sep='\t') - triplets = triplets.sort_values(by='Score', ascending=False) + return triplets.sort_values(by='Score', ascending=False) + +def generate_final_ranking(triplets: pd.DataFrame, label_column: str) -> list[list[Any]]: + """Generate final ranking based on the label column.""" final_ranking = [] for _, row in triplets.iterrows(): feature_a, feature_b = row['FeatureA'], row['FeatureB'] score = row['Score'] - if args.label_column == feature_a.split('-')[0]: + if label_column == feature_a.split('-')[0]: final_ranking.append([feature_b, score]) - elif args.label_column == feature_b.split('-')[0]: + elif label_column == feature_b.split('-')[0]: final_ranking.append([feature_a, score]) + return final_ranking - final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {args.heuristic}']) + +def create_final_dataframe(final_ranking: list[list[Any]], heuristic: str) -> pd.DataFrame: + """Create a final DataFrame and normalize if necessary.""" + final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {heuristic}']) final_df = ( final_df.groupby('Feature') .median() .reset_index() - .sort_values(by=f'Score {args.heuristic}', ascending=False) + .sort_values(by=f'Score {heuristic}', ascending=False) ) - if "MI" in args.heuristic: - min_score = final_df[f'Score {args.heuristic}'].min() - max_score = final_df[f'Score {args.heuristic}'].max() - final_df[f'Score {args.heuristic}'] = (final_df[f'Score {args.heuristic}'] - min_score) / (max_score - min_score) + if 'MI' in heuristic: + min_score = final_df[f'Score {heuristic}'].min() + max_score = final_df[f'Score {heuristic}'].max() + final_df[f'Score {heuristic}'] = (final_df[f'Score {heuristic}'] - min_score) / (max_score - min_score) + + return final_df - logging.info(f'Storing summary files to {args.output_folder}') + +def store_summary_files(final_df: pd.DataFrame, output_folder: str, heuristic: str, tldr: bool) -> None: + """Store the summary files and optionally print the head of the DataFrame.""" + logging.info(f'Storing summary files to {output_folder}') pd.set_option('display.max_rows', None, 'display.max_columns', None) - singles_path = os.path.join(args.output_folder, 'feature_singles.tsv') + singles_path = os.path.join(output_folder, 'feature_singles.tsv') final_df.to_csv(singles_path, sep='\t', index=False) - if args.interaction_order > 1: + if tldr: + print(final_df.head(20)) + + +def handle_interaction_order(final_df: pd.DataFrame, output_folder: str, heuristic: str, interaction_order: int) -> None: + """Handle the interaction order if it is greater than 1.""" + if interaction_order > 1: feature_store = defaultdict(list) for _, row in final_df.iterrows(): fname = row['Feature'] - score = row[f'Score {args.heuristic}'] + score = row[f'Score {heuristic}'] if 'AND' in fname: for el in fname.split('-')[0].split(' AND '): feature_store[el].append(score) @@ -55,13 +75,29 @@ def outrank_task_result_summary(args): final_aggregate_df = pd.DataFrame([ { 'Feature': k, - f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(v), + f'Combined score (order: {interaction_order}, {heuristic})': np.median(v), } for k, v in feature_store.items() ]) final_aggregate_df.to_csv( - os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False + os.path.join(output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False, ) - transformers_only_path = singles_path.replace('.tsv', '_transformers_only_imp.tsv') + +def filter_transformers_only(final_df: pd.DataFrame, output_folder: str) -> None: + """Filter the DataFrame to include only transformer features and store the result.""" + transformers_only_path = os.path.join(output_folder, 'feature_singles_transformers_only_imp.tsv') final_df[final_df['Feature'].str.contains('_tr_')].to_csv(transformers_only_path, sep='\t', index=False) + + +def outrank_task_result_summary(args) -> None: + """Main function to generate a summary of outrank task results.""" + triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv') + triplets = read_and_sort_triplets(triplets_path) + + final_ranking = generate_final_ranking(triplets, args.label_column) + final_df = create_final_dataframe(final_ranking, args.heuristic) + + store_summary_files(final_df, args.output_folder, args.heuristic, args.tldr) + handle_interaction_order(final_df, args.output_folder, args.heuristic, args.interaction_order) + filter_transformers_only(final_df, args.output_folder) diff --git a/outrank/visualizations/ranking_visualization.py b/outrank/visualizations/ranking_visualization.py index 0719d8e..d2ae48a 100644 --- a/outrank/visualizations/ranking_visualization.py +++ b/outrank/visualizations/ranking_visualization.py @@ -43,7 +43,7 @@ def visualize_hierarchical_clusters( values='Score', index='FeatureA', columns='FeatureB', - aggfunc=np.mean, + aggfunc='mean', # Updated from np.mean to 'mean' ) pivot_table.fillna(0, inplace=True) @@ -59,7 +59,7 @@ def visualize_hierarchical_clusters( ) plt.title(f'Linkage function: {linkage_heuristic}') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}' plt.savefig(out_path, dpi=300) @@ -95,7 +95,7 @@ def visualize_hierarchical_clusters( dfx.columns = ['Silhouette', 'threshold', 'numClusters'] sns.lineplot(x='numClusters', y='Silhouette', data=dfx, color='black') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() out_path = f'{output_folder}/SilhouetteProfile.{image_format}' plt.savefig(out_path, dpi=300) @@ -113,7 +113,7 @@ def visualize_hierarchical_clusters( projected_data['ClusterID'] = top_clustering.astype(str) sns.scatterplot(x='Dim1', y='Dim2', hue='ClusterID', data=projected_data, palette='Set2') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() plt.savefig(f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300) plt.clf() @@ -130,7 +130,7 @@ def visualize_heatmap( sns.set(font_scale=2) fig, ax = plt.subplots() pivot_table = pd.pivot_table( - triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc=np.mean, + triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc='mean', # Updated from np.mean to 'mean' ) mask = np.zeros_like(pivot_table.values) mask[np.triu_indices_from(mask)] = True @@ -160,7 +160,7 @@ def visualize_heatmap( plt.xlabel('') plt.ylabel('') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() plt.savefig(f'{output_folder}/heatmap.{image_format}', dpi=500) plt.clf() @@ -245,7 +245,7 @@ def visualize_barplots( plt.xlabel(f'Feature importance (based on heuristic {heuristic})') plt.ylabel('') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() plt.savefig(f'{output_folder}/barplot_top_{subset_range}.{image_format}', dpi=300) plt.clf() From 88121feeef8fd38a6af2713b304fc96473aa2e18 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Wed, 23 Oct 2024 11:22:44 +0200 Subject: [PATCH 02/15] tldr --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6e2fd4f..c004f3e 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def _read_description(): packages = [x for x in setuptools.find_packages() if x != 'test'] setuptools.setup( name='outrank', - version='0.97.3', + version='0.97.4', description='OutRank: Feature ranking for massive sparse data sets.', long_description=_read_description(), long_description_content_type='text/markdown', From 8664d876db853435d57b33f21ca9e2b0726e3744 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Wed, 23 Oct 2024 14:52:22 +0200 Subject: [PATCH 03/15] docs --- examples/README.md | 21 +++++++ examples/recursive_ranking.py | 110 ++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 examples/README.md create mode 100644 examples/recursive_ranking.py diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..370c9e5 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,21 @@ +# Feature Evolution via Ranking + +This script facilitates the process of feature evolution through iterative ranking using the `outrank` tool. It automates the process of running multiple iterations of feature ranking, extracting the best features, and updating the model specifications accordingly. + +## Overview + +The script performs the following steps: +1. **Initialization**: Sets up the initial model specification directory and creates the initial model JSON file. +2. **Iteration**: Runs the `outrank` task for a specified number of iterations. +3. **Feature Extraction**: Processes the results of each iteration to extract the best feature. +4. **Model Update**: Updates the model specification JSON with the newly identified best feature. + +## Prerequisites + +- Ensure that the `outrank` tool is installed and accessible from the command line. +- Python 3.6 or higher. +- Required Python packages: `pandas`, `argparse`, `json`, `shutil`, and `logging`. + +## Installation + +Install the required Python packages using pip (`pip install outrank --upgrade`) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py new file mode 100644 index 0000000..7e076c6 --- /dev/null +++ b/examples/recursive_ranking.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import argparse +import json +import logging +import os +import shutil +import subprocess +from typing import Optional + +import pandas as pd + +# Configure logging +logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s') +logger = logging.getLogger('syn-logger') + +# Configuration constants +DATA_PATH = os.path.expanduser('~/datasets/toy') +MODEL_SPEC_DIR = 'model_spec_dir' +LABEL_COLUMN_NAME = 'label' +HEURISTIC = 'surrogate-SGD-prior' +DATA_FORMAT = 'ob-vw' +NUM_THREADS = 6 +INTERACTION_ORDER = 2 +COMBINATION_NUMBER_BOUND = 300 +MINIBATCH_SIZE = 30_000 +SUBSAMPLING = 1 + +def run_outrank_task(reference_model_json: str, output_folder: str) -> None: + """Run the outrank task with the specified parameters.""" + outrank_command = ( + f'outrank --task all --data_path {DATA_PATH} --data_source {DATA_FORMAT} ' + f'--target_ranking_only True --combination_number_upper_bound {COMBINATION_NUMBER_BOUND} ' + f'--num_threads {NUM_THREADS} --interaction_order {INTERACTION_ORDER} ' + f'--output_folder {output_folder} --reference_model_JSON {reference_model_json} ' + f'--heuristic {HEURISTIC} --label_column {LABEL_COLUMN_NAME} ' + f'--subsampling {SUBSAMPLING} --minibatch_size {MINIBATCH_SIZE} --disable_tqdm True;' + ) + logger.info(f'Running outrank command: {outrank_command}') + subprocess.run(outrank_command, shell=True, check=True) + logger.info(f'Outrank task completed for {reference_model_json}') + +def process_results(output_folder: str) -> str: + """Read the results and extract the best feature.""" + results = pd.read_csv(os.path.join(output_folder, 'feature_singles.tsv'), delimiter='\t') + logger.info(f'Results head:\n{results.head(5)}') + best_feature = '-'.join(results.Feature.iloc[1].split('-')[:-1]) + best_feature = ','.join(best_feature.split(' AND ')) + logger.info(f'Best feature: {best_feature}') + return best_feature + +def update_model_spec(model_index: int, best_feature: str) -> None: + """Update the model specification JSON with the new best feature.""" + current_model_path = os.path.join(MODEL_SPEC_DIR, f'model_{model_index}.json') + next_model_path = os.path.join(MODEL_SPEC_DIR, f'model_{model_index + 1}.json') + + with open(current_model_path) as file: + model_spec = json.load(file) + + current_features = model_spec['desc']['features'] + current_features.append(best_feature) + logger.info(f'Updated features: {current_features}') + + with open(next_model_path, 'w') as file: + new_model_spec = {'desc': {'features': current_features}} + json.dump(new_model_spec, file) + +def initialize_model_spec_dir() -> None: + """Initialize the model specification directory with the initial JSON file.""" + command = ( + 'mkdir -p model_spec_dir && ' + 'rm -rvf model_spec_dir/* && ' + 'echo \'{"desc": {"features": []}}\' > ./model_spec_dir/model_0.json' + ) + subprocess.run(command, shell=True, check=True) + logger.info('Initialized model specification directory with model_0.json') + +def run_evolution(iterations: int) -> None: + """Main function to run the test for multiple iterations.""" + for i in range(iterations): + reference_model_json = os.path.join(MODEL_SPEC_DIR, f'model_{i}.json') + output_folder = f'output_dir_{i}' + + if os.path.isdir(output_folder): + shutil.rmtree(output_folder) + os.mkdir(output_folder) + + try: + run_outrank_task(reference_model_json, output_folder) + best_feature = process_results(output_folder) + update_model_spec(i, best_feature) + except Exception as e: + logger.error(f'An error occurred during iteration {i}: {e}') + continue + +def parse_arguments() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser(description='Run the outrank evolution process.') + parser.add_argument( + '--iterations', + type=int, + default=10, + help='Number of iterations to run (default: 10)', + ) + return parser.parse_args() + +if __name__ == '__main__': + args = parse_arguments() + initialize_model_spec_dir() + run_evolution(args.iterations) From b60b5b041155a8f472211009f8670f4c74d06b6f Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 15:00:40 +0200 Subject: [PATCH 04/15] randomized heuristic --- examples/recursive_ranking.py | 7 +-- outrank/algorithms/importance_estimator.py | 70 ++++++++++++++++------ outrank/core_utils.py | 2 +- 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index 7e076c6..2ffe820 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -18,12 +18,12 @@ DATA_PATH = os.path.expanduser('~/datasets/toy') MODEL_SPEC_DIR = 'model_spec_dir' LABEL_COLUMN_NAME = 'label' -HEURISTIC = 'surrogate-SGD-prior' +HEURISTIC = 'MI-numba-randomized' DATA_FORMAT = 'ob-vw' NUM_THREADS = 6 INTERACTION_ORDER = 2 -COMBINATION_NUMBER_BOUND = 300 -MINIBATCH_SIZE = 30_000 +COMBINATION_NUMBER_BOUND = 1_000 +MINIBATCH_SIZE = 10_000 SUBSAMPLING = 1 def run_outrank_task(reference_model_json: str, output_folder: str) -> None: @@ -43,7 +43,6 @@ def run_outrank_task(reference_model_json: str, output_folder: str) -> None: def process_results(output_folder: str) -> str: """Read the results and extract the best feature.""" results = pd.read_csv(os.path.join(output_folder, 'feature_singles.tsv'), delimiter='\t') - logger.info(f'Results head:\n{results.head(5)}') best_feature = '-'.join(results.Feature.iloc[1].split('-')[:-1]) best_feature = ','.join(best_feature.split(' AND ')) logger.info(f'Best feature: {best_feature}') diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index f9e8241..18f2a7d 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -64,6 +64,12 @@ def sklearn_surrogate( def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float: cardinality_correction = heuristic == 'MI-numba-randomized' + + if vector_first.shape[1] == 1: + vector_first = vector_first.reshape(-1) + else: + vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1) + return ranking_mi_numba.mutual_info_estimator_numba( vector_first.astype(np.int32), vector_second.astype(np.int32), @@ -74,38 +80,64 @@ def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float: return adjusted_mutual_info_score(vector_first, vector_second) -def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]: +def generate_data_for_ranking(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple(np.ndarray, np.ndrray): feature_one, feature_two = combination - if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns: - logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame.') - return feature_one, feature_two, 0.0 + if feature_one == args.label_column: + feature_one = feature_two + feature_two = args.label_column + + if args.reference_model_JSON != '' and args.reference_model_JSON is not None: + vector_first = tmp_df[list(reference_model_features) + [feature_one]].values + else: + vector_first = tmp_df[feature_one].values - vector_first = tmp_df[feature_one].values vector_second = tmp_df[feature_two].values + return vector_first, vector_second - if vector_first.size == 0 or vector_second.size == 0: - return feature_one, feature_two, 0.0 - if args.heuristic == 'MI': +def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, args: Any) -> float: + + heuristic = args.heuristic + score = 0.0 + + if heuristic == 'MI': score = sklearn_MI(vector_first, vector_second) - elif 'surrogate-' in args.heuristic: - X = tmp_df[reference_model_features].values if is_prior_heuristic(args) and reference_model_features else np.array([]) - score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic, is_target=True if feature_two == 'label' else False) - elif 'max-value-coverage' in args.heuristic: + + elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-prior'}: + logger.warning('surrogate-based models currently not available .. Try a MI-based one (e.g., MI-numba-randomized).') + score = 0.0 + + elif heuristic == 'max-value-coverage': score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second) - elif 'MI-numba' in args.heuristic: - score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio) - elif args.heuristic == 'AMI': + + elif heuristic == 'MI-numba-randomized': + score = numba_mi(vector_first, vector_second, heuristic, args.mi_stratified_sampling_ratio) + + elif heuristic == 'AMI': score = sklearn_mi_adj(vector_first, vector_second) - elif args.heuristic == 'correlation-Pearson': + + elif heuristic == 'correlation-Pearson': score = pearsonr(vector_first, vector_second)[0] - elif args.heuristic == 'Constant': + + elif heuristic == 'Constant': score = 0.0 + else: - raise ValueError('Please select a valid heuristic (MI, chi2, etc.).') + logger.warning(f'{heuristic} not defined!') + score = 0.0 + + return score + +def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]: + + feature_one, feature_two = combination + inputs_encoded, output_encoded = generate_data_for_ranking(combination, reference_model_features, args, tmp_df) + + ranking_score = conduct_feature_ranking(inputs_encoded, output_encoded, args) + + return feature_one, feature_two, ranking_score - return feature_one, feature_two, score def rank_features_3MR( relevance_dict: dict[str, float], diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 1be4993..b50cc7c 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -647,7 +647,7 @@ def summarize_rare_counts( def is_prior_heuristic(args: Any) -> bool: - if '-prior' in args.heuristic and args.reference_model_JSON: + if 'MI-numba-randomized' and args.reference_model_JSON: return True return False From 3d499cb882661246493db12ac44c24ef03c2817a Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 15:11:07 +0200 Subject: [PATCH 05/15] le tests --- examples/recursive_ranking.py | 2 +- outrank/algorithms/importance_estimator.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index 2ffe820..74c3bcf 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -21,7 +21,7 @@ HEURISTIC = 'MI-numba-randomized' DATA_FORMAT = 'ob-vw' NUM_THREADS = 6 -INTERACTION_ORDER = 2 +INTERACTION_ORDER = 3 COMBINATION_NUMBER_BOUND = 1_000 MINIBATCH_SIZE = 10_000 SUBSAMPLING = 1 diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index 18f2a7d..a999e26 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -65,10 +65,11 @@ def sklearn_surrogate( def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float: cardinality_correction = heuristic == 'MI-numba-randomized' - if vector_first.shape[1] == 1: - vector_first = vector_first.reshape(-1) - else: - vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1) + if vector_first.size == 2: + if vector_first.shape[1] == 1: + vector_first = vector_first.reshape(-1) + else: + vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1) return ranking_mi_numba.mutual_info_estimator_numba( vector_first.astype(np.int32), From 8af0e17370db3ce459059a084e534c74580f0dc5 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 17:20:57 +0200 Subject: [PATCH 06/15] some refactoring --- examples/recursive_ranking.py | 4 +-- outrank/algorithms/importance_estimator.py | 33 +++++++---------- outrank/core_ranking.py | 42 +++++++++++----------- outrank/core_utils.py | 2 +- 4 files changed, 35 insertions(+), 46 deletions(-) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index 74c3bcf..a2bdec3 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -18,7 +18,7 @@ DATA_PATH = os.path.expanduser('~/datasets/toy') MODEL_SPEC_DIR = 'model_spec_dir' LABEL_COLUMN_NAME = 'label' -HEURISTIC = 'MI-numba-randomized' +HEURISTIC = 'surrogate-SGD-SVD' DATA_FORMAT = 'ob-vw' NUM_THREADS = 6 INTERACTION_ORDER = 3 @@ -34,7 +34,7 @@ def run_outrank_task(reference_model_json: str, output_folder: str) -> None: f'--num_threads {NUM_THREADS} --interaction_order {INTERACTION_ORDER} ' f'--output_folder {output_folder} --reference_model_JSON {reference_model_json} ' f'--heuristic {HEURISTIC} --label_column {LABEL_COLUMN_NAME} ' - f'--subsampling {SUBSAMPLING} --minibatch_size {MINIBATCH_SIZE} --disable_tqdm True;' + f'--subsampling {SUBSAMPLING} --minibatch_size {MINIBATCH_SIZE} --disable_tqdm False;' ) logger.info(f'Running outrank command: {outrank_command}') subprocess.run(outrank_command, shell=True, check=True) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index a999e26..f5c0714 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd from scipy.stats import pearsonr +from sklearn.decomposition import TruncatedSVD from sklearn.feature_selection import mutual_info_classif from sklearn.linear_model import LogisticRegression from sklearn.linear_model import SGDClassifier @@ -25,7 +26,7 @@ logger = logging.getLogger('syn-logger') logger.setLevel(logging.DEBUG) -num_folds = 2 +num_folds = 3 try: from outrank.algorithms.feature_ranking import ranking_mi_numba @@ -40,36 +41,24 @@ def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float: )[0] def sklearn_surrogate( - vector_first: np.ndarray, vector_second: np.ndarray, X: np.ndarray, surrogate_model: str - , is_target: bool=False, + vector_first: np.ndarray, vector_second: np.ndarray, surrogate_model: str, ) -> float: clf = initialize_classifier(surrogate_model) transf = OneHotEncoder() - - if not is_target: - return 1.0 - - if len(np.unique(vector_second)) > 2: - vector_first, vector_second = vector_second, vector_first - - if X.size <= 1: - X = vector_first.reshape(-1, 1) - else: - X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1) - - X = transf.fit_transform(X) - + X = transf.fit_transform(vector_first) scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds) return 1 + np.median(scores) def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float: cardinality_correction = heuristic == 'MI-numba-randomized' - if vector_first.size == 2: + try: if vector_first.shape[1] == 1: vector_first = vector_first.reshape(-1) else: vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1) + except: + logger.warning('Reshaping for MI computation in place - you are considering many-one mapping') return ranking_mi_numba.mutual_info_estimator_numba( vector_first.astype(np.int32), @@ -105,9 +94,8 @@ def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, if heuristic == 'MI': score = sklearn_MI(vector_first, vector_second) - elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-prior'}: - logger.warning('surrogate-based models currently not available .. Try a MI-based one (e.g., MI-numba-randomized).') - score = 0.0 + elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'}: + score = sklearn_surrogate(vector_first, vector_second, heuristic) elif heuristic == 'max-value-coverage': score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second) @@ -190,6 +178,9 @@ def initialize_classifier(surrogate_model: str): return SVC(gamma='auto', probability=True) elif 'surrogate-SGD' in surrogate_model: return SGDClassifier(max_iter=100000, loss='log_loss') + elif 'surrogate-SGD-SVD' in surrogate_model: + clf = Pipeline([('svd', TruncatedSVD(n_components=2**5)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))]) + return clf else: logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD') return SGDClassifier(max_iter=100000, loss='log_loss') diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 24d9e3e..ad36a70 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -186,6 +186,11 @@ def enrich_with_transformations( return transformed_df +import pandas as pd +import itertools +import numpy as np +import xxhash # Assuming xxhash is used for hashing + def compute_combined_features( input_dataframe: pd.DataFrame, args: Any, @@ -200,18 +205,16 @@ def compute_combined_features( join_string = ' AND_REL ' if is_3mr else ' AND ' interaction_order = 2 if is_3mr else args.interaction_order - model_combinations = [] full_combination_space = [] - if args.interaction_order > 1: - full_combination_space = list( - itertools.combinations(all_columns, interaction_order), - ) + full_combination_space = list( + itertools.combinations(all_columns, interaction_order), + ) full_combination_space = prior_combinations_sample(full_combination_space, args) if args.reference_model_JSON != '': - model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) + model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only=True) model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] if not is_prior_heuristic(args): full_combination_space = model_combinations @@ -219,25 +222,20 @@ def compute_combined_features( if is_prior_heuristic(args): full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space] + def combine_features(new_combination): + combined_feature = input_dataframe[new_combination[0]].astype(str) + for feature in new_combination[1:]: + combined_feature += input_dataframe[feature].astype(str) + combined_feature = combined_feature.apply(lambda x: xxhash.xxh64(x).hexdigest()) + ftr_name = join_string.join(new_combination) + return ftr_name, combined_feature - com_counter = 0 new_feature_hash = {} - for new_combination in full_combination_space: - pbar.set_description( - f'Created {com_counter}/{len(full_combination_space)}', - ) - combined_feature: list[str] = [str(0)] * input_dataframe.shape[0] - for feature in new_combination: - tmp_feature = input_dataframe[feature].tolist() - for enx, el in enumerate(tmp_feature): - combined_feature[enx] = str( - internal_hash( - str(combined_feature[enx]) + str(el), - ), - ) - ftr_name = join_string.join(str(x) for x in new_combination) + for idx, new_combination in enumerate(full_combination_space): + pbar.set_description(f'Created {idx + 1}/{len(full_combination_space)}') + ftr_name, combined_feature = combine_features(new_combination) new_feature_hash[ftr_name] = combined_feature - com_counter += 1 + tmp_df = pd.DataFrame(new_feature_hash) pbar.set_description('Concatenating into final frame ..') input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) diff --git a/outrank/core_utils.py b/outrank/core_utils.py index b50cc7c..8909d1c 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -647,7 +647,7 @@ def summarize_rare_counts( def is_prior_heuristic(args: Any) -> bool: - if 'MI-numba-randomized' and args.reference_model_JSON: + if args.heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'} and args.reference_model_JSON: return True return False From 5c73a4423e4de5329df8ae56925b48d7d072062b Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 17:25:33 +0200 Subject: [PATCH 07/15] Some refactoring --- examples/recursive_ranking.py | 2 +- outrank/algorithms/importance_estimator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index a2bdec3..a535a43 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -68,7 +68,7 @@ def initialize_model_spec_dir() -> None: """Initialize the model specification directory with the initial JSON file.""" command = ( 'mkdir -p model_spec_dir && ' - 'rm -rvf model_spec_dir/* && ' + 'rm -rv model_spec_dir/* && ' 'echo \'{"desc": {"features": []}}\' > ./model_spec_dir/model_0.json' ) subprocess.run(command, shell=True, check=True) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index f5c0714..5c2edbf 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -77,7 +77,7 @@ def generate_data_for_ranking(combination: tuple[str, str], reference_model_feat feature_one = feature_two feature_two = args.label_column - if args.reference_model_JSON != '' and args.reference_model_JSON is not None: + if args.reference_model_JSON: vector_first = tmp_df[list(reference_model_features) + [feature_one]].values else: vector_first = tmp_df[feature_one].values From 18f1ff0f2e61ea1055ef52936ff5ac727d8cc689 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 17:27:17 +0200 Subject: [PATCH 08/15] Some imports --- outrank/core_ranking.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index ad36a70..6923807 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -20,6 +20,7 @@ import numpy as np import pandas as pd import tqdm +import xxhash from outrank.algorithms.importance_estimator import \ get_importances_estimate_pairwise @@ -186,11 +187,6 @@ def enrich_with_transformations( return transformed_df -import pandas as pd -import itertools -import numpy as np -import xxhash # Assuming xxhash is used for hashing - def compute_combined_features( input_dataframe: pd.DataFrame, args: Any, From 90e1569b17549cb5a7784bb6e028c3c902e2caeb Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 17:30:09 +0200 Subject: [PATCH 09/15] Imports --- .../algorithms/synthetic_data_generators/cc_generator.py | 2 +- outrank/task_instance_ranking.py | 9 ++++++--- tests/cc_generator_test.py | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py index f97def3..cecde5b 100644 --- a/outrank/algorithms/synthetic_data_generators/cc_generator.py +++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py @@ -28,7 +28,7 @@ def __init__(self, seed: int = 42): } def __repr__(self): - return f"CategoricalClassification(dataset_info={self.dataset_info})" + return f'CategoricalClassification(dataset_info={self.dataset_info})' def generate_data( self, diff --git a/outrank/task_instance_ranking.py b/outrank/task_instance_ranking.py index 6555068..30fc7f1 100644 --- a/outrank/task_instance_ranking.py +++ b/outrank/task_instance_ranking.py @@ -2,14 +2,17 @@ import gzip import os -from collections import Counter, defaultdict +from collections import Counter +from collections import defaultdict from typing import Any import numpy as np import pandas as pd import tqdm -from outrank.core_utils import generic_line_parser, get_dataset_info, get_num_of_instances +from outrank.core_utils import generic_line_parser +from outrank.core_utils import get_dataset_info +from outrank.core_utils import get_num_of_instances try: import matplotlib.pyplot as plt @@ -36,7 +39,7 @@ def score_line(line: list[str]) -> dict[str, float]: 'empty_dict': empty_dict_prop, 'all_empty': all_empty_prop, 'all_zero': all_zero_prop, - 'row_entropy': compute_average_entropy(line) + 'row_entropy': compute_average_entropy(line), } for j in [30, 60, 100, 200, 300]: diff --git a/tests/cc_generator_test.py b/tests/cc_generator_test.py index 770e3b2..704bbc9 100644 --- a/tests/cc_generator_test.py +++ b/tests/cc_generator_test.py @@ -5,7 +5,9 @@ import numpy as np from scipy.stats import pearsonr -from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification +from outrank.algorithms.synthetic_data_generators.cc_generator import \ + CategoricalClassification + class TestCategoricalClassification(unittest.TestCase): From ca32340951f100acb614386232f8b20e3449049f Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 17:32:20 +0200 Subject: [PATCH 10/15] ruff --- examples/recursive_ranking.py | 1 - outrank/algorithms/importance_estimator.py | 4 ---- outrank/algorithms/sketches/counting_cms.py | 1 - .../algorithms/sketches/counting_counters_ordinary.py | 1 - outrank/algorithms/sketches/counting_ultiloglog.py | 6 ------ .../synthetic_data_generators/cc_generator.py | 3 --- outrank/core_ranking.py | 5 ----- outrank/core_utils.py | 10 ++-------- .../feature_transformer_vault/default_transformers.py | 2 +- .../feature_transformations/ranking_transformers.py | 3 --- outrank/task_summary.py | 1 - tests/cms_test.py | 1 - 12 files changed, 3 insertions(+), 35 deletions(-) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index a535a43..5db0956 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -6,7 +6,6 @@ import os import shutil import subprocess -from typing import Optional import pandas as pd diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index 5c2edbf..b57b8cb 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -4,9 +4,6 @@ import operator import traceback from typing import Any -from typing import Dict -from typing import List -from typing import Tuple import numpy as np import pandas as pd @@ -21,7 +18,6 @@ from sklearn.svm import SVC from outrank.algorithms.feature_ranking import ranking_cov_alignment -from outrank.core_utils import is_prior_heuristic logger = logging.getLogger('syn-logger') logger.setLevel(logging.DEBUG) diff --git a/outrank/algorithms/sketches/counting_cms.py b/outrank/algorithms/sketches/counting_cms.py index 56eef3c..c12e09f 100644 --- a/outrank/algorithms/sketches/counting_cms.py +++ b/outrank/algorithms/sketches/counting_cms.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys from collections import Counter import numpy as np diff --git a/outrank/algorithms/sketches/counting_counters_ordinary.py b/outrank/algorithms/sketches/counting_counters_ordinary.py index 95d4a62..6c7ee7d 100644 --- a/outrank/algorithms/sketches/counting_counters_ordinary.py +++ b/outrank/algorithms/sketches/counting_counters_ordinary.py @@ -26,7 +26,6 @@ def add(self, val): depth = 8 width = 2**22 - import numpy as np cms = PrimitiveConstrainedCounter() items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 10000 diff --git a/outrank/algorithms/sketches/counting_ultiloglog.py b/outrank/algorithms/sketches/counting_ultiloglog.py index 0ac7fb6..81b7e42 100644 --- a/outrank/algorithms/sketches/counting_ultiloglog.py +++ b/outrank/algorithms/sketches/counting_ultiloglog.py @@ -62,13 +62,7 @@ def __len__(self): if __name__ == '__main__': import random import string - import time - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - import tqdm - from pympler import asizeof def get_random_string(length): # choose from all lowercase letter diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py index cecde5b..be6164b 100644 --- a/outrank/algorithms/synthetic_data_generators/cc_generator.py +++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py @@ -1,10 +1,7 @@ from __future__ import annotations -from typing import List from typing import Literal from typing import Optional -from typing import Tuple -from typing import Union import numpy as np from numpy.typing import ArrayLike diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 6923807..edcc91c 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -11,11 +11,6 @@ from collections import deque from timeit import default_timer as timer from typing import Any -from typing import Dict -from typing import List -from typing import Set -from typing import Tuple -from typing import Union import numpy as np import pandas as pd diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 8909d1c..31d11c8 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -9,12 +9,6 @@ from collections import defaultdict from dataclasses import dataclass from typing import Any -from typing import Dict -from typing import List -from typing import Optional -from typing import Set -from typing import Tuple -from typing import Union import numpy as np import pandas as pd @@ -199,7 +193,7 @@ def parse_ob_line_vw( ] if not include_namespace_info: the_real_instance = [ - x[2:] if not x is None else None for x in the_real_instance + x[2:] if x is not None else None for x in the_real_instance ] parts = [label] + the_real_instance @@ -268,7 +262,7 @@ def parse_namespace(namespace_path: str) -> tuple[set[str], dict[str, str]]: id_feature_map[fw_id] = feature if type_name == 'f32': float_set.add(feature) - except Exception as es: + except Exception: pass return float_set, id_feature_map diff --git a/outrank/feature_transformations/feature_transformer_vault/default_transformers.py b/outrank/feature_transformations/feature_transformer_vault/default_transformers.py index 959ff6f..cc4b303 100644 --- a/outrank/feature_transformations/feature_transformer_vault/default_transformers.py +++ b/outrank/feature_transformations/feature_transformer_vault/default_transformers.py @@ -291,7 +291,7 @@ for k, v in EXTENDED_TRANSFORMERS.items(): for round_param in [1, 2, 3, 4]: new_key = k + f'_round{round_param}' - new_value = f'np.round(np.astype(np.array(' + v + f'), np.float32), {round_param})' + new_value = 'np.round(np.astype(np.array(' + v + f'), np.float32), {round_param})' EXTENDED_ROUNDED_TRANSFORMERS[new_key] = new_value diff --git a/outrank/feature_transformations/ranking_transformers.py b/outrank/feature_transformations/ranking_transformers.py index 11227f7..1b2f5dc 100644 --- a/outrank/feature_transformations/ranking_transformers.py +++ b/outrank/feature_transformations/ranking_transformers.py @@ -3,9 +3,6 @@ import logging from typing import Any -from typing import Dict -from typing import List -from typing import Set import numpy as np import pandas as pd diff --git a/outrank/task_summary.py b/outrank/task_summary.py index 2fbc6e9..5e56fe4 100644 --- a/outrank/task_summary.py +++ b/outrank/task_summary.py @@ -4,7 +4,6 @@ import os from collections import defaultdict from typing import Any -from typing import List import numpy as np import pandas as pd diff --git a/tests/cms_test.py b/tests/cms_test.py index da506a2..7a2cbd9 100644 --- a/tests/cms_test.py +++ b/tests/cms_test.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys import unittest import numpy as np From d8563fac5a6607d9eef7069b4985e0e7a9688b26 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 17:37:31 +0200 Subject: [PATCH 11/15] le import --- outrank/algorithms/importance_estimator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index b57b8cb..e9a2608 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -14,6 +14,7 @@ from sklearn.linear_model import SGDClassifier from sklearn.metrics import adjusted_mutual_info_score from sklearn.model_selection import cross_val_score +from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC From c56c3638f4e60b58a049803dd0e74b3ef9727a96 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 17:49:02 +0200 Subject: [PATCH 12/15] nonsense-- --- outrank/algorithms/importance_estimator.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index e9a2608..a95b828 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -169,15 +169,20 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame): pass def initialize_classifier(surrogate_model: str): + if 'surrogate-LR' in surrogate_model: return LogisticRegression(max_iter=100000) + elif 'surrogate-SVM' in surrogate_model: return SVC(gamma='auto', probability=True) - elif 'surrogate-SGD' in surrogate_model: - return SGDClassifier(max_iter=100000, loss='log_loss') + elif 'surrogate-SGD-SVD' in surrogate_model: clf = Pipeline([('svd', TruncatedSVD(n_components=2**5)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))]) return clf + + elif 'surrogate-SGD' in surrogate_model: + return SGDClassifier(max_iter=100000, loss='log_loss') + else: logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD') return SGDClassifier(max_iter=100000, loss='log_loss') From 32597dde70aa1061e61a7979b36e83c8b719c911 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Thu, 24 Oct 2024 19:08:29 +0200 Subject: [PATCH 13/15] Minor fixes --- examples/recursive_ranking.py | 8 ++++---- outrank/algorithms/importance_estimator.py | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index 5db0956..9e6f110 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -17,13 +17,13 @@ DATA_PATH = os.path.expanduser('~/datasets/toy') MODEL_SPEC_DIR = 'model_spec_dir' LABEL_COLUMN_NAME = 'label' -HEURISTIC = 'surrogate-SGD-SVD' +HEURISTIC = 'surrogate-SGD' DATA_FORMAT = 'ob-vw' NUM_THREADS = 6 -INTERACTION_ORDER = 3 +INTERACTION_ORDER = 2 COMBINATION_NUMBER_BOUND = 1_000 MINIBATCH_SIZE = 10_000 -SUBSAMPLING = 1 +SUBSAMPLING = 10 def run_outrank_task(reference_model_json: str, output_folder: str) -> None: """Run the outrank task with the specified parameters.""" @@ -97,7 +97,7 @@ def parse_arguments() -> argparse.Namespace: parser.add_argument( '--iterations', type=int, - default=10, + default=80, help='Number of iterations to run (default: 10)', ) return parser.parse_args() diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index a95b828..e7b3edd 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -41,8 +41,7 @@ def sklearn_surrogate( vector_first: np.ndarray, vector_second: np.ndarray, surrogate_model: str, ) -> float: clf = initialize_classifier(surrogate_model) - transf = OneHotEncoder() - X = transf.fit_transform(vector_first) + X = OneHotEncoder().fit_transform(vector_first) scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds) return 1 + np.median(scores) From ce091e1e4d9b5969e6d053267266d865dc90a9eb Mon Sep 17 00:00:00 2001 From: bskrlj Date: Fri, 25 Oct 2024 11:20:57 +0200 Subject: [PATCH 14/15] svd works too --- examples/recursive_ranking.py | 2 +- outrank/algorithms/importance_estimator.py | 21 ++++++++++++++------- outrank/core_utils.py | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index 9e6f110..b056f20 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -17,7 +17,7 @@ DATA_PATH = os.path.expanduser('~/datasets/toy') MODEL_SPEC_DIR = 'model_spec_dir' LABEL_COLUMN_NAME = 'label' -HEURISTIC = 'surrogate-SGD' +HEURISTIC = 'surrogate-SGD-SVD' DATA_FORMAT = 'ob-vw' NUM_THREADS = 6 INTERACTION_ORDER = 2 diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index e7b3edd..f95363e 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd from scipy.stats import pearsonr +from sklearn import random_projection from sklearn.decomposition import TruncatedSVD from sklearn.feature_selection import mutual_info_classif from sklearn.linear_model import LogisticRegression @@ -23,7 +24,8 @@ logger = logging.getLogger('syn-logger') logger.setLevel(logging.DEBUG) -num_folds = 3 +NUM_FOLDS = 2 +SVD_DIMS = 2 try: from outrank.algorithms.feature_ranking import ranking_mi_numba @@ -40,9 +42,14 @@ def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float: def sklearn_surrogate( vector_first: np.ndarray, vector_second: np.ndarray, surrogate_model: str, ) -> float: - clf = initialize_classifier(surrogate_model) X = OneHotEncoder().fit_transform(vector_first) - scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds) + + if '-SVD' in surrogate_model and X.shape[1] > 2: + # yes this is not super correct due to embedding full data first, but it's much faster + seems to offer same results anyways. + X = TruncatedSVD(n_components=min(SVD_DIMS, X.shape[1])).fit_transform(X) + + clf = initialize_classifier(surrogate_model, n_dim=min(X.shape[1], 1024)) + scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=NUM_FOLDS) return 1 + np.median(scores) def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float: @@ -90,7 +97,7 @@ def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, if heuristic == 'MI': score = sklearn_MI(vector_first, vector_second) - elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'}: + elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-RP', 'surrogate-SGD-SVD'}: score = sklearn_surrogate(vector_first, vector_second, heuristic) elif heuristic == 'max-value-coverage': @@ -167,7 +174,7 @@ def calc_higher_order(feature: str, is_redundancy: bool = True) -> float: def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame): pass -def initialize_classifier(surrogate_model: str): +def initialize_classifier(surrogate_model: str, n_dim: int) -> Any: if 'surrogate-LR' in surrogate_model: return LogisticRegression(max_iter=100000) @@ -175,8 +182,8 @@ def initialize_classifier(surrogate_model: str): elif 'surrogate-SVM' in surrogate_model: return SVC(gamma='auto', probability=True) - elif 'surrogate-SGD-SVD' in surrogate_model: - clf = Pipeline([('svd', TruncatedSVD(n_components=2**5)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))]) + elif 'surrogate-SGD-RP' in surrogate_model: + clf = Pipeline([('proj', random_projection.SparseRandomProjection(n_components=n_dim)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))]) return clf elif 'surrogate-SGD' in surrogate_model: diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 31d11c8..0ee3d23 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -641,7 +641,7 @@ def summarize_rare_counts( def is_prior_heuristic(args: Any) -> bool: - if args.heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'} and args.reference_model_JSON: + if args.heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-RP'} and args.reference_model_JSON: return True return False From 2ca3a874cb0f8b9d3544daf4cc56fff891e18115 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Fri, 25 Oct 2024 11:23:45 +0200 Subject: [PATCH 15/15] version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c004f3e..8411c4f 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def _read_description(): packages = [x for x in setuptools.find_packages() if x != 'test'] setuptools.setup( name='outrank', - version='0.97.4', + version='0.97.5', description='OutRank: Feature ranking for massive sparse data sets.', long_description=_read_description(), long_description_content_type='text/markdown',