import logging
import os
from collections import defaultdict
from typing import Any
from typing import List

import numpy as np
import pandas as pd

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

# Spellings accepted as "enabled" for flags that reach us as text.
_TRUTHY_FLAG_VALUES = frozenset({'1', 'true', 't', 'yes', 'y', 'on'})


def _flag_enabled(value: Any) -> bool:
    """Interpret a CLI-style flag value as a boolean.

    The ``--tldr`` option is declared with ``type=str`` and ``default='True'``,
    so it arrives as text. A plain truthiness test would treat the string
    ``'False'`` as enabled, hence the explicit comparison.
    """
    if isinstance(value, str):
        return value.strip().lower() in _TRUTHY_FLAG_VALUES
    return bool(value)


def read_and_sort_triplets(triplets_path: str) -> pd.DataFrame:
    """Read the tab-separated triplets file and sort it by 'Score', descending."""
    triplets = pd.read_csv(triplets_path, sep='\t')
    return triplets.sort_values(by='Score', ascending=False)


def generate_final_ranking(triplets: pd.DataFrame, label_column: str) -> list[list[Any]]:
    """Collect ``[feature, score]`` pairs for every triplet that involves the label.

    A triplet row is kept when the part of 'FeatureA' or 'FeatureB' before the
    first ``'-'`` equals ``label_column``; the *other* feature of the pair is
    recorded together with the row's 'Score'. 'FeatureA' is checked first, and
    rows that do not mention the label on either side are dropped.
    """
    final_ranking = []
    # Column-wise iteration avoids building a Series object per row.
    columns = (triplets['FeatureA'], triplets['FeatureB'], triplets['Score'])
    for feature_a, feature_b, score in zip(*columns):
        if label_column == feature_a.split('-')[0]:
            final_ranking.append([feature_b, score])
        elif label_column == feature_b.split('-')[0]:
            final_ranking.append([feature_a, score])
    return final_ranking


def create_final_dataframe(final_ranking: list[list[Any]], heuristic: str) -> pd.DataFrame:
    """Aggregate ``[feature, score]`` pairs into one median score per feature.

    The result has the columns 'Feature' and ``f'Score {heuristic}'`` and is
    sorted by score, descending. When ``heuristic`` contains ``'MI'`` the
    scores are min-max scaled to [0, 1]. If every score is identical the
    scaled value is set to 0.0 instead of dividing by zero (which would
    yield NaN).
    """
    score_column = f'Score {heuristic}'
    final_df = pd.DataFrame(final_ranking, columns=['Feature', score_column])
    final_df = (
        final_df.groupby('Feature')
        .median()
        .reset_index()
        .sort_values(by=score_column, ascending=False)
    )

    if 'MI' in heuristic:
        min_score = final_df[score_column].min()
        score_span = final_df[score_column].max() - min_score
        if score_span > 0:
            final_df[score_column] = (final_df[score_column] - min_score) / score_span
        elif score_span == 0:
            # Degenerate range (e.g. a single feature): avoid 0/0 -> NaN.
            final_df[score_column] = 0.0
        # A NaN span (no usable scores) leaves the column untouched.

    return final_df


def store_summary_files(final_df: pd.DataFrame, output_folder: str, heuristic: str, tldr: bool) -> None:
    """Write 'feature_singles.tsv' and optionally print the top 20 rows.

    ``heuristic`` is not used by this function; it is kept so the signature
    stays compatible with existing callers. ``tldr`` may be a real boolean or
    the string delivered by the ``--tldr`` command-line option.
    """
    logging.info('Storing summary files to %s', output_folder)
    # Global pandas display options: the printed preview is never truncated.
    pd.set_option('display.max_rows', None, 'display.max_columns', None)
    singles_path = os.path.join(output_folder, 'feature_singles.tsv')
    final_df.to_csv(singles_path, sep='\t', index=False)

    if _flag_enabled(tldr):
        print(final_df.head(20))


def handle_interaction_order(final_df: pd.DataFrame, output_folder: str, heuristic: str, interaction_order: int) -> None:
    """Write per-feature aggregated scores for interaction features.

    Does nothing unless ``interaction_order`` is greater than 1. Otherwise,
    every feature name containing ``'AND'`` is cut at its first ``'-'``, split
    on ``' AND '``, and the row's score is credited to each component. The
    median score per component is written to 'feature_singles_aggregated.tsv'.
    """
    if interaction_order <= 1:
        return

    score_column = f'Score {heuristic}'
    combined_column = f'Combined score (order: {interaction_order}, {heuristic})'

    feature_store = defaultdict(list)
    for fname, score in zip(final_df['Feature'], final_df[score_column]):
        if 'AND' in fname:
            for el in fname.split('-')[0].split(' AND '):
                feature_store[el].append(score)

    # Explicit columns keep the header in the file even when no interaction
    # features were found.
    final_aggregate_df = pd.DataFrame(
        [{'Feature': k, combined_column: np.median(v)} for k, v in feature_store.items()],
        columns=['Feature', combined_column],
    )
    final_aggregate_df.to_csv(
        os.path.join(output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False,
    )


def filter_transformers_only(final_df: pd.DataFrame, output_folder: str) -> None:
    """Write the rows whose 'Feature' contains '_tr_' to a separate TSV file."""
    transformers_only_path = os.path.join(output_folder, 'feature_singles_transformers_only_imp.tsv')
    final_df[final_df['Feature'].str.contains('_tr_')].to_csv(transformers_only_path, sep='\t', index=False)


def outrank_task_result_summary(args) -> None:
    """Generate the summary files for a finished ranking task.

    Reads 'pairwise_ranks.tsv' from ``args.output_folder``, builds the
    per-feature ranking against ``args.label_column`` and writes the summary
    TSV files back into the same folder. Uses ``args.heuristic``,
    ``args.interaction_order`` and ``args.tldr``.
    """
    triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv')
    triplets = read_and_sort_triplets(triplets_path)

    final_ranking = generate_final_ranking(triplets, args.label_column)
    final_df = create_final_dataframe(final_ranking, args.heuristic)

    # Fall back to the CLI default ('True') when the namespace was built
    # without the --tldr option.
    tldr = getattr(args, 'tldr', 'True')

    store_summary_files(final_df, args.output_folder, args.heuristic, tldr)
    handle_interaction_order(final_df, args.output_folder, args.heuristic, args.interaction_order)
    filter_transformers_only(final_df, args.output_folder)
aggfunc=np.mean, + aggfunc='mean', # Updated from np.mean to 'mean' ) pivot_table.fillna(0, inplace=True) @@ -59,7 +59,7 @@ def visualize_hierarchical_clusters( ) plt.title(f'Linkage function: {linkage_heuristic}') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}' plt.savefig(out_path, dpi=300) @@ -95,7 +95,7 @@ def visualize_hierarchical_clusters( dfx.columns = ['Silhouette', 'threshold', 'numClusters'] sns.lineplot(x='numClusters', y='Silhouette', data=dfx, color='black') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() out_path = f'{output_folder}/SilhouetteProfile.{image_format}' plt.savefig(out_path, dpi=300) @@ -113,7 +113,7 @@ def visualize_hierarchical_clusters( projected_data['ClusterID'] = top_clustering.astype(str) sns.scatterplot(x='Dim1', y='Dim2', hue='ClusterID', data=projected_data, palette='Set2') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() plt.savefig(f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300) plt.clf() @@ -130,7 +130,7 @@ def visualize_heatmap( sns.set(font_scale=2) fig, ax = plt.subplots() pivot_table = pd.pivot_table( - triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc=np.mean, + triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc='mean', # Updated from np.mean to 'mean' ) mask = np.zeros_like(pivot_table.values) mask[np.triu_indices_from(mask)] = True @@ -160,7 +160,7 @@ def visualize_heatmap( plt.xlabel('') plt.ylabel('') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() plt.savefig(f'{output_folder}/heatmap.{image_format}', 
dpi=500) plt.clf() @@ -245,7 +245,7 @@ def visualize_barplots( plt.xlabel(f'Feature importance (based on heuristic {heuristic})') plt.ylabel('') with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) + warnings.simplefilter('ignore', UserWarning) plt.tight_layout() plt.savefig(f'{output_folder}/barplot_top_{subset_range}.{image_format}', dpi=300) plt.clf() diff --git a/setup.py b/setup.py index 6e2fd4f..c004f3e 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def _read_description(): packages = [x for x in setuptools.find_packages() if x != 'test'] setuptools.setup( name='outrank', - version='0.97.3', + version='0.97.4', description='OutRank: Feature ranking for massive sparse data sets.', long_description=_read_description(), long_description_content_type='text/markdown',