some more refactorings
SkBlaz committed Oct 18, 2024
1 parent d2e79f9 commit 1a2370a
Showing 4 changed files with 148 additions and 251 deletions.
203 changes: 61 additions & 142 deletions outrank/algorithms/importance_estimator.py
@@ -1,18 +1,15 @@
 # A module for pairwise computation of importances -- entrypoint for the core ranking algorighm(s)
 from __future__ import annotations
 
 import logging
 import operator
 import traceback
-from typing import Any
-from typing import Dict
+from typing import Any, Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression
-from sklearn.linear_model import SGDClassifier
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import OneHotEncoder
@@ -28,200 +25,122 @@

 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba
-
     numba_available = True
-
-except Exception as es:
-    traceback.print_exc(0)
+except ImportError:
+    traceback.print_exc()
     numba_available = False


-def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
-    estimate_feature_importance = mutual_info_classif(
-        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
+def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    return mutual_info_classif(
+        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True
     )[0]
-    return estimate_feature_importance


 def sklearn_surrogate(
-    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
+    vector_first: np.ndarray, vector_second: np.ndarray, X: np.ndarray, surrogate_model: str
 ) -> float:
-
     clf = initialize_classifier(surrogate_model)
-
     transf = OneHotEncoder()
 
     # They do not commute, swap if needed
-    if len(np.unique(vector_second) > 2):
-        vector_third = vector_second
-        vector_second = vector_first
-        vector_first = vector_third
-        del vector_third
+    if len(np.unique(vector_second)) > 2:
+        vector_first, vector_second = vector_second, vector_first
 
     if X.size <= 1:
         X = vector_first.reshape(-1, 1)
     else:
         X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
 
     X = transf.fit_transform(X)
-    estimate_feature_importance_list = cross_val_score(
-        clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
-    )
-    estimate_feature_importance = 1 + \
-        np.median(estimate_feature_importance_list)
-
-    return estimate_feature_importance
-
-
-def numba_mi(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio):
-    if heuristic == 'MI-numba-randomized':
-        cardinality_correction = True
-
-    else:
-        cardinality_correction = False
-
-    estimate_feature_importance = ranking_mi_numba.mutual_info_estimator_numba(
-        vector_first.reshape(-1).astype(np.int32),
-        vector_second.reshape(-1).astype(np.int32),
+    scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
+    return 1 + np.median(scores)
+
+def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
+    cardinality_correction = heuristic == 'MI-numba-randomized'
+    return ranking_mi_numba.mutual_info_estimator_numba(
+        vector_first.astype(np.int32),
+        vector_second.astype(np.int32),
         approximation_factor=np.float32(mi_stratified_sampling_ratio),
         cardinality_correction=cardinality_correction,
     )
-
-    return estimate_feature_importance
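
For context, the surrogate heuristic one-hot encodes the feature, fits the chosen classifier, and scores it with cross-validated negative log-loss (`num_folds` is module-level configuration). A minimal standalone sketch of that scoring recipe on toy data, with illustrative names and only NumPy and scikit-learn assumed:

```python
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

rng = np.random.default_rng(0)
feature = rng.integers(0, 3, size=200)             # candidate feature (categorical codes)
noise = rng.integers(0, 2, size=200)
target = np.where(rng.random(200) < 0.8, feature % 2, noise)  # target mostly follows the feature

# One-hot encode the feature, then score a linear classifier on it.
X = OneHotEncoder().fit_transform(feature.reshape(-1, 1))
clf = SGDClassifier(max_iter=100000, loss='log_loss')

# neg_log_loss scores are non-positive, so 1 + median(scores) approaches 1
# for features that predict the target well.
scores = cross_val_score(clf, X, target, scoring='neg_log_loss', cv=4)
print(1 + np.median(scores))
```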


-def sklearn_mi_adj(vector_first, vector_second):
-    # AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
-    estimate_feature_importance = adjusted_mutual_info_score(
-        vector_first.reshape(-1), vector_second.reshape(-1),
-    )
-    return estimate_feature_importance
+def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    return adjusted_mutual_info_score(vector_first, vector_second)
 
-
-def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
-    """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
-
-    feature_one = combination[0]
-    feature_two = combination[1]
+def get_importances_estimate_pairwise(combination: Tuple[str, str], reference_model_features: List[str], args: Any, tmp_df: pd.DataFrame) -> Tuple[str, str, float]:
+    feature_one, feature_two = combination

-    if feature_one not in tmp_df.columns:
-        logging.info(f'{feature_one} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
-        return [feature_one, feature_two, 0]
-    elif feature_two not in tmp_df.columns:
-        logging.info(f'{feature_two} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
-        return [feature_one, feature_two, 0]
+    if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns:
+        logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame.')
+        return feature_one, feature_two, 0.0
 
-    vector_first = tmp_df[[feature_one]].values.ravel()
-    vector_second = tmp_df[[feature_two]].values.ravel()
+    vector_first = tmp_df[feature_one].values
+    vector_second = tmp_df[feature_two].values
 
-    if len(vector_first) == 0 or len(vector_second) == 0:
-        return [feature_one, feature_two, 0]
+    if vector_first.size == 0 or vector_second.size == 0:
+        return feature_one, feature_two, 0.0

-    # Compute score based on the selected heuristic.
     if args.heuristic == 'MI':
-        # Compute the infoGain
-        estimate_feature_importance = sklearn_MI(vector_first, vector_second)
-
+        score = sklearn_MI(vector_first, vector_second)
     elif 'surrogate-' in args.heuristic:
-        X = np.array(float)
-        if is_prior_heuristic(args) and (len(reference_model_features) > 0):
-            X = tmp_df[reference_model_features].values
-
-        estimate_feature_importance = sklearn_surrogate(
-            vector_first, vector_second, X, args.heuristic,
-        )
+        X = tmp_df[reference_model_features].values if is_prior_heuristic(args) and reference_model_features else np.array([])
+        score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic)
     elif 'max-value-coverage' in args.heuristic:
-        estimate_feature_importance = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
-
+        score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
     elif 'MI-numba' in args.heuristic:
-        estimate_feature_importance = numba_mi(
-            vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio,
-        )
-
+        score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio)
     elif args.heuristic == 'AMI':
-        estimate_feature_importance = sklearn_mi_adj(
-            vector_first, vector_second,
-        )
-
+        score = sklearn_mi_adj(vector_first, vector_second)
     elif args.heuristic == 'correlation-Pearson':
-        estimate_feature_importance = pearsonr(vector_first, vector_second)[0]
-
+        score = pearsonr(vector_first, vector_second)[0]
     elif args.heuristic == 'Constant':
-        estimate_feature_importance = 0.0
-
+        score = 0.0
     else:
-        raise ValueError(
-            'Please select one of the possible heuristics (MI, chi2)',
-        )
-
-    return (feature_one, feature_two, estimate_feature_importance)
+        raise ValueError('Please select a valid heuristic (MI, chi2, etc.).')
 
+    return feature_one, feature_two, score
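
The dispatch above maps each heuristic name onto a small estimator. A self-contained sketch of what the three sklearn/scipy-backed options compute for a toy pair of label-encoded vectors (`x` and `y` are illustrative names; only NumPy, SciPy, and scikit-learn assumed):

```python
import numpy as np
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import adjusted_mutual_info_score

# Two label-encoded feature vectors (toy data).
x = np.array([0, 1, 0, 1, 1, 0, 1, 0], dtype=np.int32)
y = np.array([0, 1, 0, 1, 0, 0, 1, 1], dtype=np.int32)

# 'MI': mutual information of one discrete feature against a target.
mi = mutual_info_classif(x.reshape(-1, 1), y, discrete_features=True)[0]

# 'AMI': mutual information adjusted for chance agreement between labelings.
ami = adjusted_mutual_info_score(x, y)

# 'correlation-Pearson': index [0] extracts the correlation coefficient.
rho = pearsonr(x, y)[0]

print(f'MI={mi:.3f}  AMI={ami:.3f}  Pearson={rho:.3f}')
```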

 def rank_features_3MR(
-    relevance_dict: dict[str, float],
-    redundancy_dict: dict[tuple[Any, Any], Any],
-    relational_dict: dict[tuple[Any, Any], Any],
+    relevance_dict: Dict[str, float],
+    redundancy_dict: Dict[Tuple[Any, Any], Any],
+    relational_dict: Dict[Tuple[Any, Any], Any],
     strategy: str = 'median',
-    alpha: float = 1,
-    beta: float = 1,
+    alpha: float = 1.0,
+    beta: float = 1.0,
 ) -> pd.DataFrame:
-    all_features = relevance_dict.keys()
-    most_important_feature = max(
-        relevance_dict.items(), key=operator.itemgetter(1),
-    )[0]
+    all_features = set(relevance_dict.keys())
+    most_important_feature = max(relevance_dict.items(), key=operator.itemgetter(1))[0]
     ranked_features = [most_important_feature]

-    def calc_higher_order(feature, is_redundancy=True):
+    def calc_higher_order(feature: str, is_redundancy: bool = True) -> float:
         values = []
         for feat in ranked_features:
             interaction_tuple = (feat, feature)
             if is_redundancy:
-                if interaction_tuple in redundancy_dict:
-                    values.append(redundancy_dict[interaction_tuple])
-                else:
-                    logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
+                values.append(redundancy_dict.get(interaction_tuple, 0))
             else:
-                if interaction_tuple in relational_dict:
-                    values.append(relational_dict[interaction_tuple])
-                else:
-                    logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
-
-        if strategy == 'sum':
-            return sum(values)
-        if strategy == 'mean':
-            return np.mean(values)
-        return np.median(values)
-
-    while len(ranked_features) != len(all_features):
-        top_importance = 0
-        most_important_feature = ''
-
-        for ind, feat in enumerate(set(all_features) - set(ranked_features)):
+                values.append(relational_dict.get(interaction_tuple, 0))
+        return np.median(values) if strategy == 'median' else (np.mean(values) if strategy == 'mean' else sum(values))
+
+    while len(ranked_features) < len(all_features):
+        top_importance = -np.inf
+        most_important_feature = None
+
+        for feat in all_features - set(ranked_features):
             feature_redundancy = calc_higher_order(feat)
             feature_relation = calc_higher_order(feat, False)
             feature_relevance = relevance_dict[feat]
-            importance = (
-                feature_relevance - alpha * feature_redundancy + beta * feature_relation
-            )
+            importance = feature_relevance - alpha * feature_redundancy + beta * feature_relation
 
-            if (importance > top_importance) or (ind == 0):
+            if importance > top_importance:
                 top_importance = importance
                 most_important_feature = feat
 
         ranked_features.append(most_important_feature)
-    return pd.DataFrame(
-        {
-            'Feature': ranked_features,
-            '3mr_ranking': list(range(1, len(ranked_features) + 1)),
-        },
-    )
+
+    return pd.DataFrame({'Feature': ranked_features, '3MR_Ranking': range(1, len(ranked_features) + 1)})
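
The 3MR ranking greedily adds the feature maximizing relevance - alpha * redundancy + beta * relation, aggregated (median by default) over already-ranked features. A minimal usage sketch with toy dictionaries, assuming the refactored module is installed and importable as shown:

```python
from outrank.algorithms.importance_estimator import rank_features_3MR

# Toy inputs: per-feature relevance plus pairwise redundancy/relation scores,
# keyed by (already_ranked_feature, candidate_feature) tuples.
relevance = {'f1': 0.9, 'f2': 0.5, 'f3': 0.4}
redundancy = {('f1', 'f2'): 0.3, ('f1', 'f3'): 0.1, ('f2', 'f3'): 0.2}
relation = {('f1', 'f2'): 0.2, ('f1', 'f3'): 0.4, ('f2', 'f3'): 0.1}

# Returns a DataFrame with columns 'Feature' and '3MR_Ranking'.
print(rank_features_3MR(relevance, redundancy, relation))
```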

def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
# TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
# TODO - this is to be executed directly on df - no need for parallel kernel(s)
pass


def initialize_classifier(surrogate_model: str):
if 'surrogate-LR' in surrogate_model:
return LogisticRegression(max_iter=100000)
@@ -230,5 +149,5 @@ def initialize_classifier(surrogate_model: str):
elif 'surrogate-SGD' in surrogate_model:
return SGDClassifier(max_iter=100000, loss='log_loss')
else:
-        logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
+        logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
return SGDClassifier(max_iter=100000, loss='log_loss')
30 changes: 15 additions & 15 deletions outrank/core_utils.py
@@ -23,22 +23,22 @@
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

pro_tips = [
-    'OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"',
-    'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).',
-    'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!',
-    'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).',
-    'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)',
-    'Visualization part also includes clustering - this might be very insightful!',
-    'By default OutRank includes feature cardinality and coverage in feature names (card; cov)',
-    'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.',
-    'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).',
-    'Give it as many threads as physically possible (--num_threads).',
-    'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.',
-    'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).',
-    'Your target can be any feature! (explaining one feature with others)',
-    'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).',
+    'OutRank can construct subfeatures based on subspaces. Use: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e".',
+    'Heuristic MI-numba-randomized offers a great balance of speed and performance.',
+    'Heuristic surrogate-lr performs internal cross-validation; keep that in mind.',
+    'Consider running OutRank on a smaller sample first; it might be sufficient (--subsampling = a lot).',
+    'OutRank supports two types of combinations: unsupervised pairwise ranking (--target_ranking_only=False) and supervised combinations (--interaction_order > 1).',
+    'The visualization includes clustering, which can be very insightful!',
+    'By default, OutRank includes feature cardinality and coverage in feature names (e.g., card; cov).',
+    'Intermediary checkpoints (tmp_checkpoint.tsv) can provide insights during longer runs.',
+    'You can rank redundancies of combined features using --interaction_order and --target_ranking_only=False.',
+    'Use as many threads as possible for better performance (--num_threads).',
+    'Speed up ranking by reducing the feature buffer size (--combination_number_upper_bound) and using --subsampling together.',
+    'Not sure which feature transformations to choose? Use --transformers=default for a solid baseline (includes common DS transformations).',
+    'Your target can be any feature, allowing you to explain one feature with others.',
+    'OutRank uses HyperLogLog for cardinality estimation, useful for understanding cardinalities across datasets.',
     'Each feature is named as featureName(cardinality, coverage in percents) in the final files.',
-    'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.',
+    'Generate candidate feature transformation ranges using --task=feature_summary_transformers.',
]
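
Several of these tips compose into a single run. A hypothetical invocation for illustration (the `outrank` entrypoint name and the `--heuristic` flag spelling are assumptions here; the remaining flags are quoted from the tips above):

```
outrank --heuristic MI-numba-randomized --num_threads 8 --subsampling <factor> --combination_number_upper_bound 1024 --transformers=default
```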


[Diffs for the remaining two changed files are not shown.]