From 7f40ea6923c464600a10493b29c871d886509e2e Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Wed, 23 Oct 2024 11:22:33 +0200
Subject: [PATCH 01/15] tldr mode

---
 outrank/__main__.py                           |  7 ++
 outrank/task_summary.py                       | 72 ++++++++++++++-----
 .../visualizations/ranking_visualization.py   | 14 ++--
 3 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/outrank/__main__.py b/outrank/__main__.py
index f261dc5..3ad89d7 100644
--- a/outrank/__main__.py
+++ b/outrank/__main__.py
@@ -204,6 +204,13 @@ def main():
         help='Relevant for task data_generator -- how many features.',
     )
 
+    parser.add_argument(
+        '--tldr',
+        type=str,
+        default='True',
+        help='If enabled, it will output some of the main results on the screen after finishing.',
+    )
+
     parser.add_argument(
         '--num_synthetic_rows',
         type=int,
diff --git a/outrank/task_summary.py b/outrank/task_summary.py
index bebf5e7..2fbc6e9 100644
--- a/outrank/task_summary.py
+++ b/outrank/task_summary.py
@@ -3,6 +3,8 @@
 import logging
 import os
 from collections import defaultdict
+from typing import Any
+from typing import List
 
 import numpy as np
 import pandas as pd
@@ -10,44 +12,62 @@
 logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
 
 
-def outrank_task_result_summary(args):
-    triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv')
+def read_and_sort_triplets(triplets_path: str) -> pd.DataFrame:
+    """Read triplets from a file and sort by the 'Score' column."""
     triplets = pd.read_csv(triplets_path, sep='\t')
-    triplets = triplets.sort_values(by='Score', ascending=False)
+    return triplets.sort_values(by='Score', ascending=False)
+
 
+def generate_final_ranking(triplets: pd.DataFrame, label_column: str) -> list[list[Any]]:
+    """Generate final ranking based on the label column."""
     final_ranking = []
     for _, row in triplets.iterrows():
         feature_a, feature_b = row['FeatureA'], row['FeatureB']
         score = row['Score']
-        if args.label_column == feature_a.split('-')[0]:
+        if label_column == feature_a.split('-')[0]:
             final_ranking.append([feature_b, score])
-        elif args.label_column == feature_b.split('-')[0]:
+        elif label_column == feature_b.split('-')[0]:
             final_ranking.append([feature_a, score])
+    return final_ranking
 
-    final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {args.heuristic}'])
+
+def create_final_dataframe(final_ranking: list[list[Any]], heuristic: str) -> pd.DataFrame:
+    """Create a final DataFrame and normalize if necessary."""
+    final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {heuristic}'])
     final_df = (
         final_df.groupby('Feature')
         .median()
         .reset_index()
-        .sort_values(by=f'Score {args.heuristic}', ascending=False)
+        .sort_values(by=f'Score {heuristic}', ascending=False)
     )
 
-    if "MI" in args.heuristic:
-        min_score = final_df[f'Score {args.heuristic}'].min()
-        max_score = final_df[f'Score {args.heuristic}'].max()
-        final_df[f'Score {args.heuristic}'] = (final_df[f'Score {args.heuristic}'] - min_score) / (max_score - min_score)
+    if 'MI' in heuristic:
+        min_score = final_df[f'Score {heuristic}'].min()
+        max_score = final_df[f'Score {heuristic}'].max()
+        final_df[f'Score {heuristic}'] = (final_df[f'Score {heuristic}'] - min_score) / (max_score - min_score)
+
+    return final_df
 
-    logging.info(f'Storing summary files to {args.output_folder}')
+
+def store_summary_files(final_df: pd.DataFrame, output_folder: str, heuristic: str, tldr: bool) -> None:
+    """Store the summary files and optionally print the head of the DataFrame."""
+    logging.info(f'Storing summary files to {output_folder}')
     pd.set_option('display.max_rows', None, 'display.max_columns', None)
 
-    singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
+    singles_path = os.path.join(output_folder, 'feature_singles.tsv')
     final_df.to_csv(singles_path, sep='\t', index=False)
 
-    if args.interaction_order > 1:
+    if tldr:
+        print(final_df.head(20))
+
+
+def handle_interaction_order(final_df: pd.DataFrame, output_folder: str, heuristic: str, interaction_order: int) -> None:
+    """Handle the interaction order if it is greater than 1."""
+    if interaction_order > 1:
         feature_store = defaultdict(list)
         for _, row in final_df.iterrows():
             fname = row['Feature']
-            score = row[f'Score {args.heuristic}']
+            score = row[f'Score {heuristic}']
             if 'AND' in fname:
                 for el in fname.split('-')[0].split(' AND '):
                     feature_store[el].append(score)
@@ -55,13 +75,29 @@ def outrank_task_result_summary(args):
         final_aggregate_df = pd.DataFrame([
             {
                 'Feature': k,
-                f'Combined score (order: {args.interaction_order}, {args.heuristic})': np.median(v),
+                f'Combined score (order: {interaction_order}, {heuristic})': np.median(v),
             }
             for k, v in feature_store.items()
         ])
         final_aggregate_df.to_csv(
-            os.path.join(args.output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False
+            os.path.join(output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False,
         )
 
-    transformers_only_path = singles_path.replace('.tsv', '_transformers_only_imp.tsv')
+
+def filter_transformers_only(final_df: pd.DataFrame, output_folder: str) -> None:
+    """Filter the DataFrame to include only transformer features and store the result."""
+    transformers_only_path = os.path.join(output_folder, 'feature_singles_transformers_only_imp.tsv')
     final_df[final_df['Feature'].str.contains('_tr_')].to_csv(transformers_only_path, sep='\t', index=False)
+
+
+def outrank_task_result_summary(args) -> None:
+    """Main function to generate a summary of outrank task results."""
+    triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv')
+    triplets = read_and_sort_triplets(triplets_path)
+
+    final_ranking = generate_final_ranking(triplets, args.label_column)
+    final_df = create_final_dataframe(final_ranking, args.heuristic)
+
+    store_summary_files(final_df, args.output_folder, args.heuristic, args.tldr)
+    handle_interaction_order(final_df, args.output_folder, args.heuristic, args.interaction_order)
+    filter_transformers_only(final_df, args.output_folder)
diff --git a/outrank/visualizations/ranking_visualization.py b/outrank/visualizations/ranking_visualization.py
index 0719d8e..d2ae48a 100644
--- a/outrank/visualizations/ranking_visualization.py
+++ b/outrank/visualizations/ranking_visualization.py
@@ -43,7 +43,7 @@ def visualize_hierarchical_clusters(
         values='Score',
         index='FeatureA',
         columns='FeatureB',
-        aggfunc=np.mean,
+        aggfunc='mean',  # Updated from np.mean to 'mean'
     )
 
     pivot_table.fillna(0, inplace=True)
@@ -59,7 +59,7 @@ def visualize_hierarchical_clusters(
         )
         plt.title(f'Linkage function: {linkage_heuristic}')
         with warnings.catch_warnings():
-            warnings.simplefilter("ignore", UserWarning)
+            warnings.simplefilter('ignore', UserWarning)
             plt.tight_layout()
         out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}'
         plt.savefig(out_path, dpi=300)
@@ -95,7 +95,7 @@ def visualize_hierarchical_clusters(
         dfx.columns = ['Silhouette', 'threshold', 'numClusters']
         sns.lineplot(x='numClusters', y='Silhouette', data=dfx, color='black')
         with warnings.catch_warnings():
-            warnings.simplefilter("ignore", UserWarning)
+            warnings.simplefilter('ignore', UserWarning)
             plt.tight_layout()
         out_path = f'{output_folder}/SilhouetteProfile.{image_format}'
         plt.savefig(out_path, dpi=300)
@@ -113,7 +113,7 @@ def visualize_hierarchical_clusters(
             projected_data['ClusterID'] = top_clustering.astype(str)
             sns.scatterplot(x='Dim1', y='Dim2', hue='ClusterID', data=projected_data, palette='Set2')
             with warnings.catch_warnings():
-                warnings.simplefilter("ignore", UserWarning)
+                warnings.simplefilter('ignore', UserWarning)
                 plt.tight_layout()
             plt.savefig(f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300)
             plt.clf()
@@ -130,7 +130,7 @@ def visualize_heatmap(
     sns.set(font_scale=2)
     fig, ax = plt.subplots()
     pivot_table = pd.pivot_table(
-        triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc=np.mean,
+        triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc='mean',  # Updated from np.mean to 'mean'
     )
     mask = np.zeros_like(pivot_table.values)
     mask[np.triu_indices_from(mask)] = True
@@ -160,7 +160,7 @@ def visualize_heatmap(
     plt.xlabel('')
     plt.ylabel('')
     with warnings.catch_warnings():
-        warnings.simplefilter("ignore", UserWarning)
+        warnings.simplefilter('ignore', UserWarning)
         plt.tight_layout()
     plt.savefig(f'{output_folder}/heatmap.{image_format}', dpi=500)
     plt.clf()
@@ -245,7 +245,7 @@ def visualize_barplots(
         plt.xlabel(f'Feature importance (based on heuristic {heuristic})')
         plt.ylabel('')
         with warnings.catch_warnings():
-            warnings.simplefilter("ignore", UserWarning)
+            warnings.simplefilter('ignore', UserWarning)
             plt.tight_layout()
         plt.savefig(f'{output_folder}/barplot_top_{subset_range}.{image_format}', dpi=300)
         plt.clf()

From 88121feeef8fd38a6af2713b304fc96473aa2e18 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Wed, 23 Oct 2024 11:22:44 +0200
Subject: [PATCH 02/15] tldr

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6e2fd4f..c004f3e 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ def _read_description():
 packages = [x for x in setuptools.find_packages() if x != 'test']
 setuptools.setup(
     name='outrank',
-    version='0.97.3',
+    version='0.97.4',
     description='OutRank: Feature ranking for massive sparse data sets.',
     long_description=_read_description(),
     long_description_content_type='text/markdown',

From 8664d876db853435d57b33f21ca9e2b0726e3744 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Wed, 23 Oct 2024 14:52:22 +0200
Subject: [PATCH 03/15] docs

---
 examples/README.md            |  21 +++++++
 examples/recursive_ranking.py | 110 ++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 examples/README.md
 create mode 100644 examples/recursive_ranking.py

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..370c9e5
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,21 @@
+# Feature Evolution via Ranking
+
+The `recursive_ranking.py` script performs feature evolution through iterative ranking with the `outrank` tool. It automates running multiple iterations of feature ranking, extracting the best feature from each iteration, and updating the model specification accordingly.
+
+## Overview
+
+The script performs the following steps:
+1. **Initialization**: Sets up the initial model specification directory and creates the initial model JSON file.
+2. **Iteration**: Runs the `outrank` task for a specified number of iterations.
+3. **Feature Extraction**: Processes the results of each iteration to extract the best feature.
+4. **Model Update**: Updates the model specification JSON with the newly identified best feature.
+
+## Prerequisites
+
+- Ensure that the `outrank` tool is installed and accessible from the command line.
+- Python 3.7 or higher (the script uses `from __future__ import annotations`).
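+- Required Python packages: `pandas` (third-party). The other modules the script imports (`argparse`, `json`, `shutil`, and `logging`) are part of the Python standard library and need no installation.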
+
+## Installation
+
+Install the required Python packages using pip (`pip install outrank --upgrade`).
diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
new file mode 100644
index 0000000..7e076c6
--- /dev/null
+++ b/examples/recursive_ranking.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import shutil
+import subprocess
+from typing import Optional
+
+import pandas as pd
+
+# Configure logging
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
+logger = logging.getLogger('syn-logger')
+
+# Configuration constants
+DATA_PATH = os.path.expanduser('~/datasets/toy')
+MODEL_SPEC_DIR = 'model_spec_dir'
+LABEL_COLUMN_NAME = 'label'
+HEURISTIC = 'surrogate-SGD-prior'
+DATA_FORMAT = 'ob-vw'
+NUM_THREADS = 6
+INTERACTION_ORDER = 2
+COMBINATION_NUMBER_BOUND = 300
+MINIBATCH_SIZE = 30_000
+SUBSAMPLING = 1
+
+def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
+    """Run the outrank task with the specified parameters."""
+    outrank_command = (
+        f'outrank --task all --data_path {DATA_PATH} --data_source {DATA_FORMAT} '
+        f'--target_ranking_only True --combination_number_upper_bound {COMBINATION_NUMBER_BOUND} '
+        f'--num_threads {NUM_THREADS} --interaction_order {INTERACTION_ORDER} '
+        f'--output_folder {output_folder} --reference_model_JSON {reference_model_json} '
+        f'--heuristic {HEURISTIC} --label_column {LABEL_COLUMN_NAME} '
+        f'--subsampling {SUBSAMPLING} --minibatch_size {MINIBATCH_SIZE} --disable_tqdm True;'
+    )
+    logger.info(f'Running outrank command: {outrank_command}')
+    subprocess.run(outrank_command, shell=True, check=True)
+    logger.info(f'Outrank task completed for {reference_model_json}')
+
+def process_results(output_folder: str) -> str:
+    """Read the results and extract the best feature."""
+    results = pd.read_csv(os.path.join(output_folder, 'feature_singles.tsv'), delimiter='\t')
+    logger.info(f'Results head:\n{results.head(5)}')
+    best_feature = '-'.join(results.Feature.iloc[1].split('-')[:-1])
+    best_feature = ','.join(best_feature.split(' AND '))
+    logger.info(f'Best feature: {best_feature}')
+    return best_feature
+
+def update_model_spec(model_index: int, best_feature: str) -> None:
+    """Update the model specification JSON with the new best feature."""
+    current_model_path = os.path.join(MODEL_SPEC_DIR, f'model_{model_index}.json')
+    next_model_path = os.path.join(MODEL_SPEC_DIR, f'model_{model_index + 1}.json')
+
+    with open(current_model_path) as file:
+        model_spec = json.load(file)
+
+    current_features = model_spec['desc']['features']
+    current_features.append(best_feature)
+    logger.info(f'Updated features: {current_features}')
+
+    with open(next_model_path, 'w') as file:
+        new_model_spec = {'desc': {'features': current_features}}
+        json.dump(new_model_spec, file)
+
+def initialize_model_spec_dir() -> None:
+    """Initialize the model specification directory with the initial JSON file."""
+    command = (
+        'mkdir -p model_spec_dir && '
+        'rm -rvf model_spec_dir/* && '
+        'echo \'{"desc": {"features": []}}\' > ./model_spec_dir/model_0.json'
+    )
+    subprocess.run(command, shell=True, check=True)
+    logger.info('Initialized model specification directory with model_0.json')
+
+def run_evolution(iterations: int) -> None:
+    """Main function to run the test for multiple iterations."""
+    for i in range(iterations):
+        reference_model_json = os.path.join(MODEL_SPEC_DIR, f'model_{i}.json')
+        output_folder = f'output_dir_{i}'
+
+        if os.path.isdir(output_folder):
+            shutil.rmtree(output_folder)
+        os.mkdir(output_folder)
+
+        try:
+            run_outrank_task(reference_model_json, output_folder)
+            best_feature = process_results(output_folder)
+            update_model_spec(i, best_feature)
+        except Exception as e:
+            logger.error(f'An error occurred during iteration {i}: {e}')
+            continue
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(description='Run the outrank evolution process.')
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=10,
+        help='Number of iterations to run (default: 10)',
+    )
+    return parser.parse_args()
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    initialize_model_spec_dir()
+    run_evolution(args.iterations)

From b60b5b041155a8f472211009f8670f4c74d06b6f Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 15:00:40 +0200
Subject: [PATCH 04/15] randomized heuristic

---
 examples/recursive_ranking.py              |  7 +--
 outrank/algorithms/importance_estimator.py | 70 ++++++++++++++++------
 outrank/core_utils.py                      |  2 +-
 3 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
index 7e076c6..2ffe820 100644
--- a/examples/recursive_ranking.py
+++ b/examples/recursive_ranking.py
@@ -18,12 +18,12 @@
 DATA_PATH = os.path.expanduser('~/datasets/toy')
 MODEL_SPEC_DIR = 'model_spec_dir'
 LABEL_COLUMN_NAME = 'label'
-HEURISTIC = 'surrogate-SGD-prior'
+HEURISTIC = 'MI-numba-randomized'
 DATA_FORMAT = 'ob-vw'
 NUM_THREADS = 6
 INTERACTION_ORDER = 2
-COMBINATION_NUMBER_BOUND = 300
-MINIBATCH_SIZE = 30_000
+COMBINATION_NUMBER_BOUND = 1_000
+MINIBATCH_SIZE = 10_000
 SUBSAMPLING = 1
 
 def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
@@ -43,7 +43,6 @@ def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
 def process_results(output_folder: str) -> str:
     """Read the results and extract the best feature."""
     results = pd.read_csv(os.path.join(output_folder, 'feature_singles.tsv'), delimiter='\t')
-    logger.info(f'Results head:\n{results.head(5)}')
     best_feature = '-'.join(results.Feature.iloc[1].split('-')[:-1])
     best_feature = ','.join(best_feature.split(' AND '))
     logger.info(f'Best feature: {best_feature}')
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index f9e8241..18f2a7d 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -64,6 +64,12 @@ def sklearn_surrogate(
 
 def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
     cardinality_correction = heuristic == 'MI-numba-randomized'
+
+    if vector_first.shape[1] == 1:
+        vector_first = vector_first.reshape(-1)
+    else:
+        vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1)
+
     return ranking_mi_numba.mutual_info_estimator_numba(
         vector_first.astype(np.int32),
         vector_second.astype(np.int32),
@@ -74,38 +80,64 @@ def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str
 def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
     return adjusted_mutual_info_score(vector_first, vector_second)
 
-def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]:
+def generate_data_for_ranking(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple(np.ndarray, np.ndrray):
     feature_one, feature_two = combination
 
-    if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns:
-        logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame.')
-        return feature_one, feature_two, 0.0
+    if feature_one == args.label_column:
+        feature_one = feature_two
+        feature_two = args.label_column
+
+    if args.reference_model_JSON != '' and args.reference_model_JSON is not None:
+        vector_first = tmp_df[list(reference_model_features) + [feature_one]].values
+    else:
+        vector_first = tmp_df[feature_one].values
 
-    vector_first = tmp_df[feature_one].values
     vector_second = tmp_df[feature_two].values
+    return vector_first, vector_second
 
-    if vector_first.size == 0 or vector_second.size == 0:
-        return feature_one, feature_two, 0.0
 
-    if args.heuristic == 'MI':
+def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, args: Any) -> float:
+
+    heuristic = args.heuristic
+    score = 0.0
+
+    if heuristic == 'MI':
         score = sklearn_MI(vector_first, vector_second)
-    elif 'surrogate-' in args.heuristic:
-        X = tmp_df[reference_model_features].values if is_prior_heuristic(args) and reference_model_features else np.array([])
-        score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic, is_target=True if feature_two == 'label' else False)
-    elif 'max-value-coverage' in args.heuristic:
+
+    elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-prior'}:
+        logger.warning('surrogate-based models currently not available .. Try a MI-based one (e.g., MI-numba-randomized).')
+        score = 0.0
+
+    elif heuristic == 'max-value-coverage':
         score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
-    elif 'MI-numba' in args.heuristic:
-        score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio)
-    elif args.heuristic == 'AMI':
+
+    elif heuristic == 'MI-numba-randomized':
+        score = numba_mi(vector_first, vector_second, heuristic, args.mi_stratified_sampling_ratio)
+
+    elif heuristic == 'AMI':
         score = sklearn_mi_adj(vector_first, vector_second)
-    elif args.heuristic == 'correlation-Pearson':
+
+    elif heuristic == 'correlation-Pearson':
         score = pearsonr(vector_first, vector_second)[0]
-    elif args.heuristic == 'Constant':
+
+    elif heuristic == 'Constant':
         score = 0.0
+
     else:
-        raise ValueError('Please select a valid heuristic (MI, chi2, etc.).')
+        logger.warning(f'{heuristic} not defined!')
+        score = 0.0
+
+    return score
+
+def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]:
+
+    feature_one, feature_two = combination
+    inputs_encoded, output_encoded = generate_data_for_ranking(combination, reference_model_features, args, tmp_df)
+
+    ranking_score = conduct_feature_ranking(inputs_encoded, output_encoded, args)
+
+    return feature_one, feature_two, ranking_score
 
-    return feature_one, feature_two, score
 
 def rank_features_3MR(
     relevance_dict: dict[str, float],
diff --git a/outrank/core_utils.py b/outrank/core_utils.py
index 1be4993..b50cc7c 100644
--- a/outrank/core_utils.py
+++ b/outrank/core_utils.py
@@ -647,7 +647,7 @@ def summarize_rare_counts(
 
 
 def is_prior_heuristic(args: Any) -> bool:
-    if '-prior' in args.heuristic and args.reference_model_JSON:
+    if 'MI-numba-randomized' and args.reference_model_JSON:
         return True
     return False
 

From 3d499cb882661246493db12ac44c24ef03c2817a Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 15:11:07 +0200
Subject: [PATCH 05/15] le tests

---
 examples/recursive_ranking.py              | 2 +-
 outrank/algorithms/importance_estimator.py | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
index 2ffe820..74c3bcf 100644
--- a/examples/recursive_ranking.py
+++ b/examples/recursive_ranking.py
@@ -21,7 +21,7 @@
 HEURISTIC = 'MI-numba-randomized'
 DATA_FORMAT = 'ob-vw'
 NUM_THREADS = 6
-INTERACTION_ORDER = 2
+INTERACTION_ORDER = 3
 COMBINATION_NUMBER_BOUND = 1_000
 MINIBATCH_SIZE = 10_000
 SUBSAMPLING = 1
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index 18f2a7d..a999e26 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -65,10 +65,11 @@ def sklearn_surrogate(
 def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
     cardinality_correction = heuristic == 'MI-numba-randomized'
 
-    if vector_first.shape[1] == 1:
-        vector_first = vector_first.reshape(-1)
-    else:
-        vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1)
+    if vector_first.size == 2:
+        if vector_first.shape[1] == 1:
+            vector_first = vector_first.reshape(-1)
+        else:
+            vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1)
 
     return ranking_mi_numba.mutual_info_estimator_numba(
         vector_first.astype(np.int32),

From 8af0e17370db3ce459059a084e534c74580f0dc5 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 17:20:57 +0200
Subject: [PATCH 06/15] some refactoring

---
 examples/recursive_ranking.py              |  4 +--
 outrank/algorithms/importance_estimator.py | 33 +++++++----------
 outrank/core_ranking.py                    | 42 +++++++++++-----------
 outrank/core_utils.py                      |  2 +-
 4 files changed, 35 insertions(+), 46 deletions(-)

diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
index 74c3bcf..a2bdec3 100644
--- a/examples/recursive_ranking.py
+++ b/examples/recursive_ranking.py
@@ -18,7 +18,7 @@
 DATA_PATH = os.path.expanduser('~/datasets/toy')
 MODEL_SPEC_DIR = 'model_spec_dir'
 LABEL_COLUMN_NAME = 'label'
-HEURISTIC = 'MI-numba-randomized'
+HEURISTIC = 'surrogate-SGD-SVD'
 DATA_FORMAT = 'ob-vw'
 NUM_THREADS = 6
 INTERACTION_ORDER = 3
@@ -34,7 +34,7 @@ def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
         f'--num_threads {NUM_THREADS} --interaction_order {INTERACTION_ORDER} '
         f'--output_folder {output_folder} --reference_model_JSON {reference_model_json} '
         f'--heuristic {HEURISTIC} --label_column {LABEL_COLUMN_NAME} '
-        f'--subsampling {SUBSAMPLING} --minibatch_size {MINIBATCH_SIZE} --disable_tqdm True;'
+        f'--subsampling {SUBSAMPLING} --minibatch_size {MINIBATCH_SIZE} --disable_tqdm False;'
     )
     logger.info(f'Running outrank command: {outrank_command}')
     subprocess.run(outrank_command, shell=True, check=True)
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index a999e26..f5c0714 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -11,6 +11,7 @@
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr
+from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_selection import mutual_info_classif
 from sklearn.linear_model import LogisticRegression
 from sklearn.linear_model import SGDClassifier
@@ -25,7 +26,7 @@
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)
 
-num_folds = 2
+num_folds = 3
 
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba
@@ -40,36 +41,24 @@ def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
     )[0]
 
 def sklearn_surrogate(
-    vector_first: np.ndarray, vector_second: np.ndarray, X: np.ndarray, surrogate_model: str
-        , is_target: bool=False,
+    vector_first: np.ndarray, vector_second: np.ndarray,  surrogate_model: str,
 ) -> float:
     clf = initialize_classifier(surrogate_model)
     transf = OneHotEncoder()
-
-    if not is_target:
-        return 1.0
-
-    if len(np.unique(vector_second)) > 2:
-        vector_first, vector_second = vector_second, vector_first
-
-    if X.size <= 1:
-        X = vector_first.reshape(-1, 1)
-    else:
-        X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
-
-    X = transf.fit_transform(X)
-
+    X = transf.fit_transform(vector_first)
     scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
     return 1 + np.median(scores)
 
 def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
     cardinality_correction = heuristic == 'MI-numba-randomized'
 
-    if vector_first.size == 2:
+    try:
         if vector_first.shape[1] == 1:
             vector_first = vector_first.reshape(-1)
         else:
             vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1)
+    except:
+        logger.warning('Reshaping for MI computation in place - you are considering many-one mapping')
 
     return ranking_mi_numba.mutual_info_estimator_numba(
         vector_first.astype(np.int32),
@@ -105,9 +94,8 @@ def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray,
     if heuristic == 'MI':
         score = sklearn_MI(vector_first, vector_second)
 
-    elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-prior'}:
-        logger.warning('surrogate-based models currently not available .. Try a MI-based one (e.g., MI-numba-randomized).')
-        score = 0.0
+    elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'}:
+        score = sklearn_surrogate(vector_first, vector_second, heuristic)
 
     elif heuristic == 'max-value-coverage':
         score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
@@ -190,6 +178,9 @@ def initialize_classifier(surrogate_model: str):
         return SVC(gamma='auto', probability=True)
     elif 'surrogate-SGD' in surrogate_model:
         return SGDClassifier(max_iter=100000, loss='log_loss')
+    elif 'surrogate-SGD-SVD' in surrogate_model:
+        clf = Pipeline([('svd', TruncatedSVD(n_components=2**5)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))])
+        return clf
     else:
         logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
         return SGDClassifier(max_iter=100000, loss='log_loss')
diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
index 24d9e3e..ad36a70 100644
--- a/outrank/core_ranking.py
+++ b/outrank/core_ranking.py
@@ -186,6 +186,11 @@ def enrich_with_transformations(
     return transformed_df
 
 
+import pandas as pd
+import itertools
+import numpy as np
+import xxhash  # Assuming xxhash is used for hashing
+
 def compute_combined_features(
     input_dataframe: pd.DataFrame,
     args: Any,
@@ -200,18 +205,16 @@ def compute_combined_features(
     join_string = ' AND_REL ' if is_3mr else ' AND '
     interaction_order = 2 if is_3mr else args.interaction_order
 
-    model_combinations = []
     full_combination_space = []
 
-
     if args.interaction_order > 1:
-            full_combination_space = list(
-                itertools.combinations(all_columns, interaction_order),
-            )
+        full_combination_space = list(
+            itertools.combinations(all_columns, interaction_order),
+        )
     full_combination_space = prior_combinations_sample(full_combination_space, args)
 
     if args.reference_model_JSON != '':
-        model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
+        model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only=True)
         model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]
         if not is_prior_heuristic(args):
             full_combination_space = model_combinations
@@ -219,25 +222,20 @@ def compute_combined_features(
     if is_prior_heuristic(args):
         full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space]
 
+    def combine_features(new_combination):
+        combined_feature = input_dataframe[new_combination[0]].astype(str)
+        for feature in new_combination[1:]:
+            combined_feature += input_dataframe[feature].astype(str)
+        combined_feature = combined_feature.apply(lambda x: xxhash.xxh64(x).hexdigest())
+        ftr_name = join_string.join(new_combination)
+        return ftr_name, combined_feature
 
-    com_counter = 0
     new_feature_hash = {}
-    for new_combination in full_combination_space:
-        pbar.set_description(
-            f'Created {com_counter}/{len(full_combination_space)}',
-        )
-        combined_feature: list[str] = [str(0)] * input_dataframe.shape[0]
-        for feature in new_combination:
-            tmp_feature = input_dataframe[feature].tolist()
-            for enx, el in enumerate(tmp_feature):
-                combined_feature[enx] = str(
-                    internal_hash(
-                        str(combined_feature[enx]) + str(el),
-                    ),
-                )
-        ftr_name = join_string.join(str(x) for x in new_combination)
+    for idx, new_combination in enumerate(full_combination_space):
+        pbar.set_description(f'Created {idx + 1}/{len(full_combination_space)}')
+        ftr_name, combined_feature = combine_features(new_combination)
         new_feature_hash[ftr_name] = combined_feature
-        com_counter += 1
+
     tmp_df = pd.DataFrame(new_feature_hash)
     pbar.set_description('Concatenating into final frame ..')
     input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
diff --git a/outrank/core_utils.py b/outrank/core_utils.py
index b50cc7c..8909d1c 100644
--- a/outrank/core_utils.py
+++ b/outrank/core_utils.py
@@ -647,7 +647,7 @@ def summarize_rare_counts(
 
 
 def is_prior_heuristic(args: Any) -> bool:
-    if 'MI-numba-randomized' and args.reference_model_JSON:
+    if args.heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'} and args.reference_model_JSON:
         return True
     return False
 

From 5c73a4423e4de5329df8ae56925b48d7d072062b Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 17:25:33 +0200
Subject: [PATCH 07/15] Some refactoring

---
 examples/recursive_ranking.py              | 2 +-
 outrank/algorithms/importance_estimator.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
index a2bdec3..a535a43 100644
--- a/examples/recursive_ranking.py
+++ b/examples/recursive_ranking.py
@@ -68,7 +68,7 @@ def initialize_model_spec_dir() -> None:
     """Initialize the model specification directory with the initial JSON file."""
     command = (
         'mkdir -p model_spec_dir && '
-        'rm -rvf model_spec_dir/* && '
+        'rm -rv model_spec_dir/* && '
         'echo \'{"desc": {"features": []}}\' > ./model_spec_dir/model_0.json'
     )
     subprocess.run(command, shell=True, check=True)
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index f5c0714..5c2edbf 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -77,7 +77,7 @@ def generate_data_for_ranking(combination: tuple[str, str], reference_model_feat
         feature_one = feature_two
         feature_two = args.label_column
 
-    if args.reference_model_JSON != '' and args.reference_model_JSON is not None:
+    if args.reference_model_JSON:
         vector_first = tmp_df[list(reference_model_features) + [feature_one]].values
     else:
         vector_first = tmp_df[feature_one].values

From 18f1ff0f2e61ea1055ef52936ff5ac727d8cc689 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 17:27:17 +0200
Subject: [PATCH 08/15] Some imports

---
 outrank/core_ranking.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
index ad36a70..6923807 100644
--- a/outrank/core_ranking.py
+++ b/outrank/core_ranking.py
@@ -20,6 +20,7 @@
 import numpy as np
 import pandas as pd
 import tqdm
+import xxhash
 
 from outrank.algorithms.importance_estimator import \
     get_importances_estimate_pairwise
@@ -186,11 +187,6 @@ def enrich_with_transformations(
     return transformed_df
 
 
-import pandas as pd
-import itertools
-import numpy as np
-import xxhash  # Assuming xxhash is used for hashing
-
 def compute_combined_features(
     input_dataframe: pd.DataFrame,
     args: Any,

From 90e1569b17549cb5a7784bb6e028c3c902e2caeb Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 17:30:09 +0200
Subject: [PATCH 09/15] Imports

---
 .../algorithms/synthetic_data_generators/cc_generator.py | 2 +-
 outrank/task_instance_ranking.py                         | 9 ++++++---
 tests/cc_generator_test.py                               | 4 +++-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py
index f97def3..cecde5b 100644
--- a/outrank/algorithms/synthetic_data_generators/cc_generator.py
+++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py
@@ -28,7 +28,7 @@ def __init__(self, seed: int = 42):
         }
 
     def __repr__(self):
-        return f"CategoricalClassification(dataset_info={self.dataset_info})"
+        return f'CategoricalClassification(dataset_info={self.dataset_info})'
 
     def generate_data(
         self,
diff --git a/outrank/task_instance_ranking.py b/outrank/task_instance_ranking.py
index 6555068..30fc7f1 100644
--- a/outrank/task_instance_ranking.py
+++ b/outrank/task_instance_ranking.py
@@ -2,14 +2,17 @@
 
 import gzip
 import os
-from collections import Counter, defaultdict
+from collections import Counter
+from collections import defaultdict
 from typing import Any
 
 import numpy as np
 import pandas as pd
 import tqdm
 
-from outrank.core_utils import generic_line_parser, get_dataset_info, get_num_of_instances
+from outrank.core_utils import generic_line_parser
+from outrank.core_utils import get_dataset_info
+from outrank.core_utils import get_num_of_instances
 
 try:
     import matplotlib.pyplot as plt
@@ -36,7 +39,7 @@ def score_line(line: list[str]) -> dict[str, float]:
         'empty_dict': empty_dict_prop,
         'all_empty': all_empty_prop,
         'all_zero': all_zero_prop,
-        'row_entropy': compute_average_entropy(line)
+        'row_entropy': compute_average_entropy(line),
     }
 
     for j in [30, 60, 100, 200, 300]:
diff --git a/tests/cc_generator_test.py b/tests/cc_generator_test.py
index 770e3b2..704bbc9 100644
--- a/tests/cc_generator_test.py
+++ b/tests/cc_generator_test.py
@@ -5,7 +5,9 @@
 import numpy as np
 from scipy.stats import pearsonr
 
-from outrank.algorithms.synthetic_data_generators.cc_generator import CategoricalClassification
+from outrank.algorithms.synthetic_data_generators.cc_generator import \
+    CategoricalClassification
+
 
 class TestCategoricalClassification(unittest.TestCase):
 

From ca32340951f100acb614386232f8b20e3449049f Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 17:32:20 +0200
Subject: [PATCH 10/15] ruff

---
 examples/recursive_ranking.py                          |  1 -
 outrank/algorithms/importance_estimator.py             |  4 ----
 outrank/algorithms/sketches/counting_cms.py            |  1 -
 .../algorithms/sketches/counting_counters_ordinary.py  |  1 -
 outrank/algorithms/sketches/counting_ultiloglog.py     |  6 ------
 .../synthetic_data_generators/cc_generator.py          |  3 ---
 outrank/core_ranking.py                                |  5 -----
 outrank/core_utils.py                                  | 10 ++--------
 .../feature_transformer_vault/default_transformers.py  |  2 +-
 .../feature_transformations/ranking_transformers.py    |  3 ---
 outrank/task_summary.py                                |  1 -
 tests/cms_test.py                                      |  1 -
 12 files changed, 3 insertions(+), 35 deletions(-)

diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
index a535a43..5db0956 100644
--- a/examples/recursive_ranking.py
+++ b/examples/recursive_ranking.py
@@ -6,7 +6,6 @@
 import os
 import shutil
 import subprocess
-from typing import Optional
 
 import pandas as pd
 
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index 5c2edbf..b57b8cb 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -4,9 +4,6 @@
 import operator
 import traceback
 from typing import Any
-from typing import Dict
-from typing import List
-from typing import Tuple
 
 import numpy as np
 import pandas as pd
@@ -21,7 +18,6 @@
 from sklearn.svm import SVC
 
 from outrank.algorithms.feature_ranking import ranking_cov_alignment
-from outrank.core_utils import is_prior_heuristic
 
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)
diff --git a/outrank/algorithms/sketches/counting_cms.py b/outrank/algorithms/sketches/counting_cms.py
index 56eef3c..c12e09f 100644
--- a/outrank/algorithms/sketches/counting_cms.py
+++ b/outrank/algorithms/sketches/counting_cms.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import sys
 from collections import Counter
 
 import numpy as np
diff --git a/outrank/algorithms/sketches/counting_counters_ordinary.py b/outrank/algorithms/sketches/counting_counters_ordinary.py
index 95d4a62..6c7ee7d 100644
--- a/outrank/algorithms/sketches/counting_counters_ordinary.py
+++ b/outrank/algorithms/sketches/counting_counters_ordinary.py
@@ -26,7 +26,6 @@ def add(self, val):
 
     depth = 8
     width = 2**22
-    import numpy as np
     cms = PrimitiveConstrainedCounter()
 
     items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 10000
diff --git a/outrank/algorithms/sketches/counting_ultiloglog.py b/outrank/algorithms/sketches/counting_ultiloglog.py
index 0ac7fb6..81b7e42 100644
--- a/outrank/algorithms/sketches/counting_ultiloglog.py
+++ b/outrank/algorithms/sketches/counting_ultiloglog.py
@@ -62,13 +62,7 @@ def __len__(self):
 if __name__ == '__main__':
     import random
     import string
-    import time
 
-    import matplotlib.pyplot as plt
-    import pandas as pd
-    import seaborn as sns
-    import tqdm
-    from pympler import asizeof
 
     def get_random_string(length):
         # choose from all lowercase letter
diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py
index cecde5b..be6164b 100644
--- a/outrank/algorithms/synthetic_data_generators/cc_generator.py
+++ b/outrank/algorithms/synthetic_data_generators/cc_generator.py
@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-from typing import List
 from typing import Literal
 from typing import Optional
-from typing import Tuple
-from typing import Union
 
 import numpy as np
 from numpy.typing import ArrayLike
diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
index 6923807..edcc91c 100644
--- a/outrank/core_ranking.py
+++ b/outrank/core_ranking.py
@@ -11,11 +11,6 @@
 from collections import deque
 from timeit import default_timer as timer
 from typing import Any
-from typing import Dict
-from typing import List
-from typing import Set
-from typing import Tuple
-from typing import Union
 
 import numpy as np
 import pandas as pd
diff --git a/outrank/core_utils.py b/outrank/core_utils.py
index 8909d1c..31d11c8 100644
--- a/outrank/core_utils.py
+++ b/outrank/core_utils.py
@@ -9,12 +9,6 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any
-from typing import Dict
-from typing import List
-from typing import Optional
-from typing import Set
-from typing import Tuple
-from typing import Union
 
 import numpy as np
 import pandas as pd
@@ -199,7 +193,7 @@ def parse_ob_line_vw(
     ]
     if not include_namespace_info:
         the_real_instance = [
-            x[2:] if not x is None else None for x in the_real_instance
+            x[2:] if x is not None else None for x in the_real_instance
         ]
 
     parts = [label] + the_real_instance
@@ -268,7 +262,7 @@ def parse_namespace(namespace_path: str) -> tuple[set[str], dict[str, str]]:
                 id_feature_map[fw_id] = feature
                 if type_name == 'f32':
                     float_set.add(feature)
-            except Exception as es:
+            except Exception:
                 pass
 
     return float_set, id_feature_map
diff --git a/outrank/feature_transformations/feature_transformer_vault/default_transformers.py b/outrank/feature_transformations/feature_transformer_vault/default_transformers.py
index 959ff6f..cc4b303 100644
--- a/outrank/feature_transformations/feature_transformer_vault/default_transformers.py
+++ b/outrank/feature_transformations/feature_transformer_vault/default_transformers.py
@@ -291,7 +291,7 @@
 for k, v in EXTENDED_TRANSFORMERS.items():
     for round_param in [1, 2, 3, 4]:
         new_key = k + f'_round{round_param}'
-        new_value = f'np.round(np.astype(np.array(' + v + f'), np.float32), {round_param})'
+        new_value = 'np.round(np.astype(np.array(' + v + f'), np.float32), {round_param})'
         EXTENDED_ROUNDED_TRANSFORMERS[new_key] = new_value
 
 
diff --git a/outrank/feature_transformations/ranking_transformers.py b/outrank/feature_transformations/ranking_transformers.py
index 11227f7..1b2f5dc 100644
--- a/outrank/feature_transformations/ranking_transformers.py
+++ b/outrank/feature_transformations/ranking_transformers.py
@@ -3,9 +3,6 @@
 
 import logging
 from typing import Any
-from typing import Dict
-from typing import List
-from typing import Set
 
 import numpy as np
 import pandas as pd
diff --git a/outrank/task_summary.py b/outrank/task_summary.py
index 2fbc6e9..5e56fe4 100644
--- a/outrank/task_summary.py
+++ b/outrank/task_summary.py
@@ -4,7 +4,6 @@
 import os
 from collections import defaultdict
 from typing import Any
-from typing import List
 
 import numpy as np
 import pandas as pd
diff --git a/tests/cms_test.py b/tests/cms_test.py
index da506a2..7a2cbd9 100644
--- a/tests/cms_test.py
+++ b/tests/cms_test.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import sys
 import unittest
 
 import numpy as np

From d8563fac5a6607d9eef7069b4985e0e7a9688b26 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 17:37:31 +0200
Subject: [PATCH 11/15] le import

---
 outrank/algorithms/importance_estimator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index b57b8cb..e9a2608 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -14,6 +14,7 @@
 from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 

From c56c3638f4e60b58a049803dd0e74b3ef9727a96 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 17:49:02 +0200
Subject: [PATCH 12/15] nonsense--

---
 outrank/algorithms/importance_estimator.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index e9a2608..a95b828 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -169,15 +169,20 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
     pass
 
 def initialize_classifier(surrogate_model: str):
+
     if 'surrogate-LR' in surrogate_model:
         return LogisticRegression(max_iter=100000)
+
     elif 'surrogate-SVM' in surrogate_model:
         return SVC(gamma='auto', probability=True)
-    elif 'surrogate-SGD' in surrogate_model:
-        return SGDClassifier(max_iter=100000, loss='log_loss')
+
     elif 'surrogate-SGD-SVD' in surrogate_model:
         clf = Pipeline([('svd', TruncatedSVD(n_components=2**5)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))])
         return clf
+
+    elif 'surrogate-SGD' in surrogate_model:
+        return SGDClassifier(max_iter=100000, loss='log_loss')
+
     else:
         logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
         return SGDClassifier(max_iter=100000, loss='log_loss')

From 32597dde70aa1061e61a7979b36e83c8b719c911 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Thu, 24 Oct 2024 19:08:29 +0200
Subject: [PATCH 13/15] Minor fixes

---
 examples/recursive_ranking.py              | 8 ++++----
 outrank/algorithms/importance_estimator.py | 3 +--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
index 5db0956..9e6f110 100644
--- a/examples/recursive_ranking.py
+++ b/examples/recursive_ranking.py
@@ -17,13 +17,13 @@
 DATA_PATH = os.path.expanduser('~/datasets/toy')
 MODEL_SPEC_DIR = 'model_spec_dir'
 LABEL_COLUMN_NAME = 'label'
-HEURISTIC = 'surrogate-SGD-SVD'
+HEURISTIC = 'surrogate-SGD'
 DATA_FORMAT = 'ob-vw'
 NUM_THREADS = 6
-INTERACTION_ORDER = 3
+INTERACTION_ORDER = 2
 COMBINATION_NUMBER_BOUND = 1_000
 MINIBATCH_SIZE = 10_000
-SUBSAMPLING = 1
+SUBSAMPLING = 10
 
 def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
     """Run the outrank task with the specified parameters."""
@@ -97,7 +97,7 @@ def parse_arguments() -> argparse.Namespace:
     parser.add_argument(
         '--iterations',
         type=int,
-        default=10,
+        default=80,
         help='Number of iterations to run (default: 10)',
     )
     return parser.parse_args()
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index a95b828..e7b3edd 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -41,8 +41,7 @@ def sklearn_surrogate(
     vector_first: np.ndarray, vector_second: np.ndarray,  surrogate_model: str,
 ) -> float:
     clf = initialize_classifier(surrogate_model)
-    transf = OneHotEncoder()
-    X = transf.fit_transform(vector_first)
+    X = OneHotEncoder().fit_transform(vector_first)
     scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
     return 1 + np.median(scores)
 

From ce091e1e4d9b5969e6d053267266d865dc90a9eb Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Fri, 25 Oct 2024 11:20:57 +0200
Subject: [PATCH 14/15] svd works too

---
 examples/recursive_ranking.py              |  2 +-
 outrank/algorithms/importance_estimator.py | 21 ++++++++++++++-------
 outrank/core_utils.py                      |  2 +-
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
index 9e6f110..b056f20 100644
--- a/examples/recursive_ranking.py
+++ b/examples/recursive_ranking.py
@@ -17,7 +17,7 @@
 DATA_PATH = os.path.expanduser('~/datasets/toy')
 MODEL_SPEC_DIR = 'model_spec_dir'
 LABEL_COLUMN_NAME = 'label'
-HEURISTIC = 'surrogate-SGD'
+HEURISTIC = 'surrogate-SGD-SVD'
 DATA_FORMAT = 'ob-vw'
 NUM_THREADS = 6
 INTERACTION_ORDER = 2
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index e7b3edd..f95363e 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr
+from sklearn import random_projection
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_selection import mutual_info_classif
 from sklearn.linear_model import LogisticRegression
@@ -23,7 +24,8 @@
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)
 
-num_folds = 3
+NUM_FOLDS  = 2
+SVD_DIMS = 2
 
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba
@@ -40,9 +42,14 @@ def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
 def sklearn_surrogate(
     vector_first: np.ndarray, vector_second: np.ndarray,  surrogate_model: str,
 ) -> float:
-    clf = initialize_classifier(surrogate_model)
     X = OneHotEncoder().fit_transform(vector_first)
-    scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
+
+    if '-SVD' in surrogate_model and X.shape[1] > 2:
+        # NOTE: this is not strictly correct, because the SVD is fitted on the full data before cross-validation; however, it is much faster and seems to give the same results.
+        X = TruncatedSVD(n_components=min(SVD_DIMS, X.shape[1])).fit_transform(X)
+
+    clf = initialize_classifier(surrogate_model, n_dim=min(X.shape[1], 1024))
+    scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=NUM_FOLDS)
     return 1 + np.median(scores)
 
 def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
@@ -90,7 +97,7 @@ def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray,
     if heuristic == 'MI':
         score = sklearn_MI(vector_first, vector_second)
 
-    elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'}:
+    elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-RP', 'surrogate-SGD-SVD'}:
         score = sklearn_surrogate(vector_first, vector_second, heuristic)
 
     elif heuristic == 'max-value-coverage':
@@ -167,7 +174,7 @@ def calc_higher_order(feature: str, is_redundancy: bool = True) -> float:
 def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
     pass
 
-def initialize_classifier(surrogate_model: str):
+def initialize_classifier(surrogate_model: str, n_dim: int) -> Any:
 
     if 'surrogate-LR' in surrogate_model:
         return LogisticRegression(max_iter=100000)
@@ -175,8 +182,8 @@ def initialize_classifier(surrogate_model: str):
     elif 'surrogate-SVM' in surrogate_model:
         return SVC(gamma='auto', probability=True)
 
-    elif 'surrogate-SGD-SVD' in surrogate_model:
-        clf = Pipeline([('svd', TruncatedSVD(n_components=2**5)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))])
+    elif 'surrogate-SGD-RP' in surrogate_model:
+        clf = Pipeline([('proj', random_projection.SparseRandomProjection(n_components=n_dim)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))])
         return clf
 
     elif 'surrogate-SGD' in surrogate_model:
diff --git a/outrank/core_utils.py b/outrank/core_utils.py
index 31d11c8..0ee3d23 100644
--- a/outrank/core_utils.py
+++ b/outrank/core_utils.py
@@ -641,7 +641,7 @@ def summarize_rare_counts(
 
 
 def is_prior_heuristic(args: Any) -> bool:
-    if args.heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-SVD'} and args.reference_model_JSON:
+    if args.heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-RP'} and args.reference_model_JSON:
         return True
     return False
 

From 2ca3a874cb0f8b9d3544daf4cc56fff891e18115 Mon Sep 17 00:00:00 2001
From: bskrlj <bskrlj@outbrain.com>
Date: Fri, 25 Oct 2024 11:23:45 +0200
Subject: [PATCH 15/15] version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c004f3e..8411c4f 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ def _read_description():
 packages = [x for x in setuptools.find_packages() if x != 'test']
 setuptools.setup(
     name='outrank',
-    version='0.97.4',
+    version='0.97.5',
     description='OutRank: Feature ranking for massive sparse data sets.',
     long_description=_read_description(),
     long_description_content_type='text/markdown',