From e9d80843f92c7db1dc9dc52890b0c91fc19d3d29 Mon Sep 17 00:00:00 2001 From: John Arevalo Date: Mon, 13 Nov 2023 09:23:07 -0500 Subject: [PATCH 1/4] aggregate observed values only for categorical columns --- src/copairs/map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/copairs/map.py b/src/copairs/map.py index ef5ab05..1a0d14d 100644 --- a/src/copairs/map.py +++ b/src/copairs/map.py @@ -62,7 +62,7 @@ def create_matcher(obs: pd.DataFrame, def aggregate(result: pd.DataFrame, sameby, threshold: float) -> pd.DataFrame: - agg_rs = result.groupby(sameby, as_index=False).agg({ + agg_rs = result.groupby(sameby, as_index=False, observed=True).agg({ 'average_precision': 'mean', 'p_value': From df71846a7608f7b734046c6f7a2c162510004628 Mon Sep 17 00:00:00 2001 From: John Arevalo Date: Mon, 13 Nov 2023 09:30:34 -0500 Subject: [PATCH 2/4] Add counts to average_precision dataframe --- src/copairs/map.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/copairs/map.py b/src/copairs/map.py index 1a0d14d..1d4fb31 100644 --- a/src/copairs/map.py +++ b/src/copairs/map.py @@ -138,6 +138,8 @@ def run_pipeline( logger.info('Creating result DataFrame...') meta['average_precision'] = ap_scores meta['p_value'] = p_values + meta["n_pos_pairs"] = null_confs[:, 0] + meta["n_total_pairs"] = null_confs[:, 1] logger.info('Finished.') return meta From d1a609f9173cebba24b66ff35efea8e481793205 Mon Sep 17 00:00:00 2001 From: John Arevalo Date: Mon, 13 Nov 2023 09:31:20 -0500 Subject: [PATCH 3/4] Raise error when metadata contains nan values --- src/copairs/map.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/copairs/map.py b/src/copairs/map.py index 1d4fb31..1f13438 100644 --- a/src/copairs/map.py +++ b/src/copairs/map.py @@ -102,6 +102,9 @@ def run_pipeline( seed=0 ) -> pd.DataFrame: # Critical!, otherwise the indexing wont work + columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby) + if meta[columns].isna().any(axis=None): + raise ValueError('metadata columns should not have null values.') meta = meta.reset_index(drop=True).copy() logger.info('Indexing metadata...') matcher = create_matcher(meta, pos_sameby, pos_diffby, neg_sameby, neg_diffby) From 3b5c727078f984291a9586dcf79e60709e21d6f7 Mon Sep 17 00:00:00 2001 From: John Arevalo Date: Mon, 13 Nov 2023 09:57:29 -0500 Subject: [PATCH 4/4] Fix #26 check meta and feats len --- src/copairs/map.py | 105 +++++++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/src/copairs/map.py b/src/copairs/map.py index 1f13438..fc69403 100644 --- a/src/copairs/map.py +++ b/src/copairs/map.py @@ -75,39 +75,50 @@ def aggregate(result: pd.DataFrame, sameby, threshold: float) -> pd.DataFrame: agg_rs.rename({'p_value': 'nlog10pvalue'}, axis=1, inplace=True) agg_rs['above_p_threshold'] = agg_rs['nlog10pvalue'] > -np.log10(threshold) agg_rs['above_q_threshold'] = agg_rs['nlog10qvalue'] > -np.log10(threshold) - agg_rs.rename(columns={'average_precision': 'mean_average_precision'}, inplace=True) + agg_rs.rename(columns={'average_precision': 'mean_average_precision'}, + inplace=True) return agg_rs def build_rank_lists(pos_pairs, neg_pairs, pos_dists, neg_dists): - labels = np.concatenate([np.ones(pos_pairs.size, dtype=np.int32), - np.zeros(neg_pairs.size, dtype=np.int32)]) + labels = np.concatenate([ + np.ones(pos_pairs.size, dtype=np.int32), + np.zeros(neg_pairs.size, dtype=np.int32) + ]) ix = np.concatenate([pos_pairs.ravel(), neg_pairs.ravel()]) - dist_all = np.concatenate([np.repeat(pos_dists, 2), np.repeat(neg_dists, 2)]) + dist_all = np.concatenate( + [np.repeat(pos_dists, 2), + np.repeat(neg_dists, 2)]) ix_sort = np.lexsort([1 - dist_all, ix]) rel_k_list = labels[ix_sort] _, counts = np.unique(ix, return_counts=True) return rel_k_list, counts -def run_pipeline( - meta, - feats, - pos_sameby, - pos_diffby, - neg_sameby, - neg_diffby, - null_size, - batch_size=20000, - seed=0 -) -> pd.DataFrame: - # Critical!, otherwise the indexing wont work - columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby) +def validate_pipeline_input(meta, feats, columns): if meta[columns].isna().any(axis=None): raise ValueError('metadata columns should not have null values.') + if len(meta) != len(feats): + raise ValueError('meta and feats have different number of rows') + + +def run_pipeline(meta, + feats, + pos_sameby, + pos_diffby, + neg_sameby, + neg_diffby, + null_size, + batch_size=20000, + seed=0) -> pd.DataFrame: + columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby) + validate_pipeline_input(meta, feats, columns) + + # Critical!, otherwise the indexing wont work meta = meta.reset_index(drop=True).copy() logger.info('Indexing metadata...') - matcher = create_matcher(meta, pos_sameby, pos_diffby, neg_sameby, neg_diffby) + matcher = create_matcher(meta, pos_sameby, pos_diffby, neg_sameby, + neg_diffby) logger.info('Finding positive pairs...') pos_pairs = matcher.get_all_pairs(sameby=pos_sameby, diffby=pos_diffby) @@ -130,13 +141,17 @@ def run_pipeline( neg_dists = compute.pairwise_cosine(feats, neg_pairs, batch_size) logger.info('Building rank lists...') - rel_k_list, counts = build_rank_lists(pos_pairs, neg_pairs, pos_dists, neg_dists) + rel_k_list, counts = build_rank_lists(pos_pairs, neg_pairs, pos_dists, + neg_dists) logger.info('Computing average precision...') ap_scores, null_confs = compute.compute_ap_contiguous(rel_k_list, counts) logger.info('Computing p-values...') - p_values = compute.compute_p_values(ap_scores, null_confs, null_size, seed=seed) + p_values = compute.compute_p_values(ap_scores, + null_confs, + null_size, + seed=seed) logger.info('Creating result DataFrame...') meta['average_precision'] = ap_scores @@ -166,10 +181,12 @@ def negs_for(query: np.ndarray): slices = compute.concat_ranges(start, end) batch_dists = neg_dists[slices] return batch_dists, sizes + return negs_for -def build_rank_lists_multi(pos_pairs, pos_dists, pos_counts, negs_for, null_size, seed): +def build_rank_lists_multi(pos_pairs, pos_dists, pos_counts, negs_for, + null_size, seed): ap_scores_list, p_values_list, ix_list = [], [], [] start = 0 @@ -180,16 +197,22 @@ def build_rank_lists_multi(pos_pairs, pos_dists, pos_counts, negs_for, null_size query = np.unique(mpos_pairs) neg_dists, neg_counts = negs_for(query) neg_ix = np.repeat(query, neg_counts) - labels = np.concatenate([np.ones(mpos_pairs.size, dtype=np.int32), - np.zeros(len(neg_dists), dtype=np.int32)]) + labels = np.concatenate([ + np.ones(mpos_pairs.size, dtype=np.int32), + np.zeros(len(neg_dists), dtype=np.int32) + ]) ix = np.concatenate([mpos_pairs.ravel(), neg_ix]) dist_all = np.concatenate([np.repeat(mpos_dists, 2), neg_dists]) ix_sort = np.lexsort([1 - dist_all, ix]) rel_k_list = labels[ix_sort] _, counts = np.unique(ix, return_counts=True) - ap_scores, null_confs = compute.compute_ap_contiguous(rel_k_list, counts) - p_values = compute.compute_p_values(ap_scores, null_confs, null_size, seed=seed) + ap_scores, null_confs = compute.compute_ap_contiguous( + rel_k_list, counts) + p_values = compute.compute_p_values(ap_scores, + null_confs, + null_size, + seed=seed) ap_scores_list.append(ap_scores) p_values_list.append(p_values) @@ -197,18 +220,18 @@ def build_rank_lists_multi(pos_pairs, pos_dists, pos_counts, negs_for, null_size return ap_scores_list, p_values_list, ix_list -def run_pipeline_multilabel( - meta, - feats, - pos_sameby, - pos_diffby, - neg_sameby, - neg_diffby, - null_size, - multilabel_col, - batch_size=20000, - seed=0 -) -> pd.DataFrame: +def run_pipeline_multilabel(meta, + feats, + pos_sameby, + pos_diffby, + neg_sameby, + neg_diffby, + null_size, + multilabel_col, + batch_size=20000, + seed=0) -> pd.DataFrame: + columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby) + validate_pipeline_input(meta, feats, columns) # Critical!, otherwise the indexing wont work meta = meta.reset_index(drop=True).copy() @@ -243,8 +266,7 @@ def run_pipeline_multilabel( logger.info('Computing mAP and p-values per label...') negs_for = create_neg_query_solver(neg_pairs, neg_dists) ap_scores_list, p_values_list, ix_list = build_rank_lists_multi( - pos_pairs, pos_dists, pos_counts, negs_for, null_size, seed - ) + pos_pairs, pos_dists, pos_counts, negs_for, null_size, seed) logger.info('Creating result DataFrame...') results = [] @@ -253,7 +275,7 @@ def run_pipeline_multilabel( 'average_precision': ap_scores_list[i], 'p_value': p_values_list[i], 'ix': ix_list[i], - }) + }) if isinstance(key, tuple): # Is a ComposedKey for k, v in zip(key._fields, key): @@ -263,6 +285,7 @@ def run_pipeline_multilabel( results.append(result) results = pd.concat(results).reset_index(drop=True) meta = meta.drop(multilabel_col, axis=1) - results = meta.merge(results, right_on='ix', left_index=True).drop('ix', axis=1) + results = meta.merge(results, right_on='ix', left_index=True).drop('ix', + axis=1) logger.info('Finished.') return results