some more refactorings
SkBlaz committed Oct 18, 2024
1 parent d2e79f9 commit 1a2370a
Showing 4 changed files with 148 additions and 251 deletions.
203 changes: 61 additions & 142 deletions outrank/algorithms/importance_estimator.py
@@ -1,18 +1,15 @@
 # A module for pairwise computation of importances -- entrypoint for the core ranking algorighm(s)
 from __future__ import annotations
 
 import logging
 import operator
 import traceback
-from typing import Any
-from typing import Dict
+from typing import Any, Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression
-from sklearn.linear_model import SGDClassifier
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import OneHotEncoder
@@ -28,200 +25,122 @@

 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba
-
     numba_available = True
-
-except Exception as es:
-    traceback.print_exc(0)
+except ImportError:
+    traceback.print_exc()
     numba_available = False


-def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
-    estimate_feature_importance = mutual_info_classif(
-        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
+def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    return mutual_info_classif(
+        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True
     )[0]
-    return estimate_feature_importance


 def sklearn_surrogate(
-    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
+    vector_first: np.ndarray, vector_second: np.ndarray, X: np.ndarray, surrogate_model: str
 ) -> float:
-
     clf = initialize_classifier(surrogate_model)
-
     transf = OneHotEncoder()
 
     # They do not commute, swap if needed
-    if len(np.unique(vector_second) > 2):
-        vector_third = vector_second
-        vector_second = vector_first
-        vector_first = vector_third
-        del vector_third
+    if len(np.unique(vector_second)) > 2:
+        vector_first, vector_second = vector_second, vector_first
 
     if X.size <= 1:
         X = vector_first.reshape(-1, 1)
     else:
         X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
 
     X = transf.fit_transform(X)
-    estimate_feature_importance_list = cross_val_score(
-        clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
-    )
-    estimate_feature_importance = 1 + \
-        np.median(estimate_feature_importance_list)
-
-    return estimate_feature_importance
-
-
-def numba_mi(vector_first, vector_second, heuristic, mi_stratified_sampling_ratio):
-    if heuristic == 'MI-numba-randomized':
-        cardinality_correction = True
-
-    else:
-        cardinality_correction = False
-
-    estimate_feature_importance = ranking_mi_numba.mutual_info_estimator_numba(
-        vector_first.reshape(-1).astype(np.int32),
-        vector_second.reshape(-1).astype(np.int32),
+    scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
+    return 1 + np.median(scores)
+
+def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
+    cardinality_correction = heuristic == 'MI-numba-randomized'
+    return ranking_mi_numba.mutual_info_estimator_numba(
+        vector_first.astype(np.int32),
+        vector_second.astype(np.int32),
         approximation_factor=np.float32(mi_stratified_sampling_ratio),
         cardinality_correction=cardinality_correction,
     )
-
-    return estimate_feature_importance
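
For context, the surrogate heuristic one-hot encodes the feature, fits the chosen classifier, and scores it with cross-validated negative log-loss (`num_folds` is module-level configuration). A minimal standalone sketch of that scoring recipe on toy data, with illustrative names and only NumPy and scikit-learn assumed:

```python
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

rng = np.random.default_rng(0)
feature = rng.integers(0, 3, size=200)             # candidate feature (categorical codes)
noise = rng.integers(0, 2, size=200)
target = np.where(rng.random(200) < 0.8, feature % 2, noise)  # target mostly follows the feature

# One-hot encode the feature, then score a linear classifier on it.
X = OneHotEncoder().fit_transform(feature.reshape(-1, 1))
clf = SGDClassifier(max_iter=100000, loss='log_loss')

# neg_log_loss scores are non-positive, so 1 + median(scores) approaches 1
# for features that predict the target well.
scores = cross_val_score(clf, X, target, scoring='neg_log_loss', cv=4)
print(1 + np.median(scores))
```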


-def sklearn_mi_adj(vector_first, vector_second):
-    # AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
-    estimate_feature_importance = adjusted_mutual_info_score(
-        vector_first.reshape(-1), vector_second.reshape(-1),
-    )
-    return estimate_feature_importance
+def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    return adjusted_mutual_info_score(vector_first, vector_second)
 
-
-def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
-    """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
-
-    feature_one = combination[0]
-    feature_two = combination[1]
+def get_importances_estimate_pairwise(combination: Tuple[str, str], reference_model_features: List[str], args: Any, tmp_df: pd.DataFrame) -> Tuple[str, str, float]:
+    feature_one, feature_two = combination

-    if feature_one not in tmp_df.columns:
-        logging.info(f'{feature_one} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
-        return [feature_one, feature_two, 0]
-    elif feature_two not in tmp_df.columns:
-        logging.info(f'{feature_two} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
-        return [feature_one, feature_two, 0]
+    if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns:
+        logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame.')
+        return feature_one, feature_two, 0.0
 
-    vector_first = tmp_df[[feature_one]].values.ravel()
-    vector_second = tmp_df[[feature_two]].values.ravel()
+    vector_first = tmp_df[feature_one].values
+    vector_second = tmp_df[feature_two].values
 
-    if len(vector_first) == 0 or len(vector_second) == 0:
-        return [feature_one, feature_two, 0]
+    if vector_first.size == 0 or vector_second.size == 0:
+        return feature_one, feature_two, 0.0

-    # Compute score based on the selected heuristic.
     if args.heuristic == 'MI':
-        # Compute the infoGain
-        estimate_feature_importance = sklearn_MI(vector_first, vector_second)
-
+        score = sklearn_MI(vector_first, vector_second)
     elif 'surrogate-' in args.heuristic:
-        X = np.array(float)
-        if is_prior_heuristic(args) and (len(reference_model_features) > 0):
-            X = tmp_df[reference_model_features].values
-
-        estimate_feature_importance = sklearn_surrogate(
-            vector_first, vector_second, X, args.heuristic,
-        )
+        X = tmp_df[reference_model_features].values if is_prior_heuristic(args) and reference_model_features else np.array([])
+        score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic)
     elif 'max-value-coverage' in args.heuristic:
-        estimate_feature_importance = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
-
+        score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
     elif 'MI-numba' in args.heuristic:
-        estimate_feature_importance = numba_mi(
-            vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio,
-        )
-
+        score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio)
     elif args.heuristic == 'AMI':
-        estimate_feature_importance = sklearn_mi_adj(
-            vector_first, vector_second,
-        )
-
+        score = sklearn_mi_adj(vector_first, vector_second)
     elif args.heuristic == 'correlation-Pearson':
-        estimate_feature_importance = pearsonr(vector_first, vector_second)[0]
-
+        score = pearsonr(vector_first, vector_second)[0]
     elif args.heuristic == 'Constant':
-        estimate_feature_importance = 0.0
-
+        score = 0.0
     else:
-        raise ValueError(
-            'Please select one of the possible heuristics (MI, chi2)',
-        )
-
-    return (feature_one, feature_two, estimate_feature_importance)
+        raise ValueError('Please select a valid heuristic (MI, chi2, etc.).')
 
+    return feature_one, feature_two, score
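
The dispatch above maps each heuristic name onto a small estimator. A self-contained sketch of what the three sklearn/scipy-backed options compute for a toy pair of label-encoded vectors (`x` and `y` are illustrative names; only NumPy, SciPy, and scikit-learn assumed):

```python
import numpy as np
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import adjusted_mutual_info_score

# Two label-encoded feature vectors (toy data).
x = np.array([0, 1, 0, 1, 1, 0, 1, 0], dtype=np.int32)
y = np.array([0, 1, 0, 1, 0, 0, 1, 1], dtype=np.int32)

# 'MI': mutual information of one discrete feature against a target.
mi = mutual_info_classif(x.reshape(-1, 1), y, discrete_features=True)[0]

# 'AMI': mutual information adjusted for chance agreement between labelings.
ami = adjusted_mutual_info_score(x, y)

# 'correlation-Pearson': index [0] extracts the correlation coefficient.
rho = pearsonr(x, y)[0]

print(f'MI={mi:.3f}  AMI={ami:.3f}  Pearson={rho:.3f}')
```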

 def rank_features_3MR(
-    relevance_dict: dict[str, float],
-    redundancy_dict: dict[tuple[Any, Any], Any],
-    relational_dict: dict[tuple[Any, Any], Any],
+    relevance_dict: Dict[str, float],
+    redundancy_dict: Dict[Tuple[Any, Any], Any],
+    relational_dict: Dict[Tuple[Any, Any], Any],
     strategy: str = 'median',
-    alpha: float = 1,
-    beta: float = 1,
+    alpha: float = 1.0,
+    beta: float = 1.0,
 ) -> pd.DataFrame:
-    all_features = relevance_dict.keys()
-    most_important_feature = max(
-        relevance_dict.items(), key=operator.itemgetter(1),
-    )[0]
+    all_features = set(relevance_dict.keys())
+    most_important_feature = max(relevance_dict.items(), key=operator.itemgetter(1))[0]
     ranked_features = [most_important_feature]

-    def calc_higher_order(feature, is_redundancy=True):
+    def calc_higher_order(feature: str, is_redundancy: bool = True) -> float:
         values = []
         for feat in ranked_features:
             interaction_tuple = (feat, feature)
             if is_redundancy:
-                if interaction_tuple in redundancy_dict:
-                    values.append(redundancy_dict[interaction_tuple])
-                else:
-                    logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
+                values.append(redundancy_dict.get(interaction_tuple, 0))
             else:
-                if interaction_tuple in relational_dict:
-                    values.append(relational_dict[interaction_tuple])
-                else:
-                    logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
-
-        if strategy == 'sum':
-            return sum(values)
-        if strategy == 'mean':
-            return np.mean(values)
-        return np.median(values)
-
-    while len(ranked_features) != len(all_features):
-        top_importance = 0
-        most_important_feature = ''
-
-        for ind, feat in enumerate(set(all_features) - set(ranked_features)):
+                values.append(relational_dict.get(interaction_tuple, 0))
+        return np.median(values) if strategy == 'median' else (np.mean(values) if strategy == 'mean' else sum(values))
+
+    while len(ranked_features) < len(all_features):
+        top_importance = -np.inf
+        most_important_feature = None
+
+        for feat in all_features - set(ranked_features):
             feature_redundancy = calc_higher_order(feat)
             feature_relation = calc_higher_order(feat, False)
             feature_relevance = relevance_dict[feat]
-            importance = (
-                feature_relevance - alpha * feature_redundancy + beta * feature_relation
-            )
+            importance = feature_relevance - alpha * feature_redundancy + beta * feature_relation
 
-            if (importance > top_importance) or (ind == 0):
+            if importance > top_importance:
                 top_importance = importance
                 most_important_feature = feat
 
         ranked_features.append(most_important_feature)
-    return pd.DataFrame(
-        {
-            'Feature': ranked_features,
-            '3mr_ranking': list(range(1, len(ranked_features) + 1)),
-        },
-    )
+
+    return pd.DataFrame({'Feature': ranked_features, '3MR_Ranking': range(1, len(ranked_features) + 1)})
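
The 3MR ranking greedily adds the feature maximizing relevance - alpha * redundancy + beta * relation, aggregated (median by default) over already-ranked features. A minimal usage sketch with toy dictionaries, assuming the refactored module is installed and importable as shown:

```python
from outrank.algorithms.importance_estimator import rank_features_3MR

# Toy inputs: per-feature relevance plus pairwise redundancy/relation scores,
# keyed by (already_ranked_feature, candidate_feature) tuples.
relevance = {'f1': 0.9, 'f2': 0.5, 'f3': 0.4}
redundancy = {('f1', 'f2'): 0.3, ('f1', 'f3'): 0.1, ('f2', 'f3'): 0.2}
relation = {('f1', 'f2'): 0.2, ('f1', 'f3'): 0.4, ('f2', 'f3'): 0.1}

# Returns a DataFrame with columns 'Feature' and '3MR_Ranking'.
print(rank_features_3MR(relevance, redundancy, relation))
```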

def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
# TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
# TODO - this is to be executed directly on df - no need for parallel kernel(s)
pass


def initialize_classifier(surrogate_model: str):
if 'surrogate-LR' in surrogate_model:
return LogisticRegression(max_iter=100000)
@@ -230,5 +149,5 @@ def initialize_classifier(surrogate_model: str):
elif 'surrogate-SGD' in surrogate_model:
return SGDClassifier(max_iter=100000, loss='log_loss')
else:
-        logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
+        logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
return SGDClassifier(max_iter=100000, loss='log_loss')
30 changes: 15 additions & 15 deletions outrank/core_utils.py
@@ -23,22 +23,22 @@
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

pro_tips = [
-    'OutRank can construct subfeatures; features based on subspaces. Example command argument is: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e"',
-    'Heuristic MI-numba-randomized seems like the best of both worlds! (speed + performance).',
-    'Heuristic surrogate-lr performs cross-validation (internally), keep that in mind!',
-    'Consider running OutRank on a smaller data sample first, might be enough (--subsampling = a lot).',
-    'There are two types of combinations supported; unsupervised pairwise ranking (redundancies- --target_ranking_only=False), and supervised combinations - (--interaction_order > 1)',
-    'Visualization part also includes clustering - this might be very insightful!',
-    'By default OutRank includes feature cardinality and coverage in feature names (card; cov)',
-    'Intermediary checkpoints (tmp_checkpoint.tsv) might already give you insights during longer runs.',
-    'In theory, you can rank redundancies of combined features (--interaction_order AND --target_ranking_only=False).',
-    'Give it as many threads as physically possible (--num_threads).',
-    'You can speed up ranking by diminishing feature buffer size (--combination_number_upper_bound determines how many ranking computations per batch will be considered). This, and --subsampling are very powerful together.',
-    'Want to rank feature transformations, but not sure which ones to choose? --transformers=default should serve as a solid baseline (common DS transformations included).',
-    'Your target can be any feature! (explaining one feature with others)',
-    'OutRank uses HyperLogLog for cardinality estimation - this is also a potential usecase (understanding cardinalities across different data sets).',
+    'OutRank can construct subfeatures based on subspaces. Use: --subfeature_mapping "feature_a->feature_b;feature_c<->feature_d;feature_c<->feature_e".',
+    'Heuristic MI-numba-randomized offers a great balance of speed and performance.',
+    'Heuristic surrogate-lr performs internal cross-validation; keep that in mind.',
+    'Consider running OutRank on a smaller sample first; it might be sufficient (--subsampling = a lot).',
+    'OutRank supports two types of combinations: unsupervised pairwise ranking (--target_ranking_only=False) and supervised combinations (--interaction_order > 1).',
+    'The visualization includes clustering, which can be very insightful!',
+    'By default, OutRank includes feature cardinality and coverage in feature names (e.g., card; cov).',
+    'Intermediary checkpoints (tmp_checkpoint.tsv) can provide insights during longer runs.',
+    'You can rank redundancies of combined features using --interaction_order and --target_ranking_only=False.',
+    'Use as many threads as possible for better performance (--num_threads).',
+    'Speed up ranking by reducing the feature buffer size (--combination_number_upper_bound) and using --subsampling together.',
+    'Not sure which feature transformations to choose? Use --transformers=default for a solid baseline (includes common DS transformations).',
+    'Your target can be any feature, allowing you to explain one feature with others.',
+    'OutRank uses HyperLogLog for cardinality estimation, useful for understanding cardinalities across datasets.',
     'Each feature is named as featureName(cardinality, coverage in percents) in the final files.',
-    'You can generate candidate feature transformation ranges (fw) by using --task=feature_summary_transformers.',
+    'Generate candidate feature transformation ranges using --task=feature_summary_transformers.',
]
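
Several of these tips compose into a single run. A hypothetical invocation for illustration (the `outrank` entrypoint name and the `--heuristic` flag spelling are assumptions here; the remaining flags are quoted from the tips above):

```
outrank --heuristic MI-numba-randomized --num_threads 8 --subsampling <factor> --combination_number_upper_bound 1024 --transformers=default
```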


[Diffs for the remaining two changed files are not shown.]