hypothesis testing on mAP instead of AP
johnarevalo committed Dec 11, 2023
1 parent 91a282b commit 88cbe4f
Showing 3 changed files with 44 additions and 29 deletions.
pyproject.toml (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
 [project]
 name = "copairs"
-version = "0.3.3"
+version = "0.4.0-alpha"
 description = "Find pairs and compute metrics between them"
 readme = "README.md"
 requires-python = ">=3.8"
src/copairs/compute.py (9 additions, 4 deletions)
@@ -7,6 +7,7 @@
 import numpy as np
 from tqdm.autonotebook import tqdm
 
+
 def parallel_map(par_func, items):
     '''Execute par_func(i) for every i in items using ThreadPool and tqdm.'''
     num_items = len(items)
@@ -18,12 +19,14 @@ def parallel_map(par_func, items):
             pass
 
 
-def batch_processing(pairwise_op: Callable[[np.ndarray, np.ndarray],np.ndarray],):
+def batch_processing(pairwise_op: Callable[[np.ndarray, np.ndarray],
+                                           np.ndarray], ):
     '''Decorator adding the batch_size param to run the function with multithreading using a list of paired indices'''
 
     def batched_fn(feats: np.ndarray, pair_ix: np.ndarray, batch_size: int):
         num_pairs = len(pair_ix)
         result = np.empty(num_pairs, dtype=np.float32)
+
         def par_func(i):
             x_sample = feats[pair_ix[i:i + batch_size, 0]]
             y_sample = feats[pair_ix[i:i + batch_size, 1]]
@@ -32,6 +35,7 @@ def par_func(i):
         parallel_map(par_func, np.arange(0, num_pairs, batch_size))
 
         return result
+
     return batched_fn
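
For context, batch_processing wraps a pairwise operation so it is evaluated over batches of index pairs on a thread pool. A minimal usage sketch, assuming a cosine-similarity op; the op and the toy data below are illustrative, not part of this commit:

import numpy as np

from copairs import compute

# Hypothetical pairwise op: row-wise cosine similarity between two
# equally-shaped batches of feature rows.
@compute.batch_processing
def pairwise_cosine(x_sample: np.ndarray, y_sample: np.ndarray) -> np.ndarray:
    x = x_sample / np.linalg.norm(x_sample, axis=1, keepdims=True)
    y = y_sample / np.linalg.norm(y_sample, axis=1, keepdims=True)
    return (x * y).sum(axis=1)

feats = np.random.default_rng(0).normal(size=(100, 16)).astype(np.float32)
pair_ix = np.array([[0, 1], [2, 3], [4, 5]])  # each row is an (i, j) pair

# The decorator adds the batch_size parameter and threads over batches.
sims = pairwise_cosine(feats, pair_ix, batch_size=2)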


@@ -112,7 +116,6 @@ def random_ap(num_perm: int, num_pos: int, total: int, seed) -> np.ndarray:
     rng = np.random.default_rng(seed)
     rel_k = random_binary_matrix(num_perm, total, num_pos, rng)
     null_dist = compute_ap(rel_k)
-    null_dist.sort()
     return null_dist


@@ -137,10 +140,12 @@ def get_null_dists(confs, null_size, seed):
     seeds = rng.integers(8096, size=num_confs)
 
     null_dists = np.empty([len(confs), null_size], dtype=np.float32)
+
     def par_func(i):
         num_pos, total = confs[i]
-        null_dists[i] = null_dist_cached(num_pos, total, seeds[i],
-                                         null_size, cache_dir)
+        null_dists[i] = null_dist_cached(num_pos, total, seeds[i], null_size,
+                                         cache_dir)
 
     parallel_map(par_func, np.arange(num_confs))
     return null_dists
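
The null distributions themselves are unchanged by this commit: for a configuration (num_pos, total), sample random binary relevance vectors with num_pos positives and score each with average precision. A self-contained sketch of the idea, independent of the library's vectorized implementation (the function name is illustrative):

import numpy as np

def null_ap_dist(num_pos: int, total: int, null_size: int, seed=0) -> np.ndarray:
    '''AP scores of null_size random rankings with num_pos positives.'''
    rng = np.random.default_rng(seed)
    rel = np.zeros(total, dtype=bool)
    rel[:num_pos] = True
    null = np.empty(null_size, dtype=np.float32)
    for i in range(null_size):
        rng.shuffle(rel)  # place the positives at random ranks
        tp = np.cumsum(rel)  # true positives seen up to each rank
        precision = tp / np.arange(1, total + 1)  # precision@k
        null[i] = (precision * rel).sum() / num_pos  # average precision
    return null

# e.g. the null for 5 positive pairs among 20 candidates:
null = null_ap_dist(num_pos=5, total=20, null_size=10000)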

src/copairs/map.py (34 additions, 24 deletions)
@@ -8,6 +8,7 @@

 from copairs import compute
 from copairs.matching import Matcher, MatcherMultilabel
+from tqdm.contrib.concurrent import thread_map
 
 logger = logging.getLogger('copairs')

@@ -61,23 +62,40 @@ def create_matcher(obs: pd.DataFrame,
     return Matcher(obs, columns, seed=0)
 
 
-def aggregate(result: pd.DataFrame, sameby, threshold: float) -> pd.DataFrame:
-    agg_rs = result.groupby(sameby, as_index=False, observed=True).agg({
-        'average_precision':
-        'mean',
-        'p_value':
-        lambda p_values: -np.log10(p_values).mean(),
-    })
+def aggregate(ap_scores: pd.DataFrame, sameby, null_size: int,
+              threshold: float, seed: int) -> pd.DataFrame:
+    ap_scores = ap_scores.reset_index()
+
+    logger.info('Computing null_dist...')
+    null_confs = ap_scores[['n_pos_pairs', 'n_total_pairs']].values
+    null_confs, rev_ix = np.unique(null_confs, axis=0, return_inverse=True)
+    null_dists = compute.get_null_dists(null_confs, null_size, seed=seed)
+    ap_scores['null_ix'] = rev_ix
+
+    def get_p_value(params):
+        map_score, indices = params
+        null_dist = null_dists[rev_ix[indices]].mean(axis=0)
+        num = (null_dist > map_score).sum()
+        p_value = (num + 1) / (null_size + 1)
+        return p_value
+
+    logger.info('Computing p-values...')
+
+    def g(df):
+        return df['average_precision'].agg(['mean', lambda x: list(x.index)])
+
+    map_scores = ap_scores.groupby(sameby).agg(g)
+    map_scores.columns = ['mean_average_precision', 'indices']
+
+    params = map_scores[['mean_average_precision', 'indices']]
+    map_scores['p_value'] = thread_map(get_p_value, params.values)
     reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(
-        10**-agg_rs['p_value'], method='fdr_bh')
-    agg_rs['q_value'] = pvals_corrected
-    agg_rs['nlog10qvalue'] = (-np.log10(agg_rs['q_value']))
-    agg_rs.rename({'p_value': 'nlog10pvalue'}, axis=1, inplace=True)
-    agg_rs['above_p_threshold'] = agg_rs['nlog10pvalue'] > -np.log10(threshold)
-    agg_rs['above_q_threshold'] = agg_rs['nlog10qvalue'] > -np.log10(threshold)
-    agg_rs.rename(columns={'average_precision': 'mean_average_precision'},
-                  inplace=True)
-    return agg_rs
+        map_scores['p_value'], method='fdr_bh')
+    map_scores['corrected_p_value'] = pvals_corrected
+    map_scores['below_p'] = map_scores['p_value'] < threshold
+    map_scores['below_corrected_p'] = map_scores[
+        'corrected_p_value'] < threshold
+    return map_scores


 def build_rank_lists(pos_pairs, neg_pairs, pos_dists, neg_dists):
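
This is the core of the change: the permutation test now targets the group-level mean AP rather than each row's AP. Each member of a group contributes the null distribution of its own (n_pos_pairs, n_total_pairs) configuration; averaging those element-wise yields a null distribution of mAP values, and the right-tailed p-value uses the standard +1 smoothing. A worked sketch of just that step, with toy numbers standing in for the library's null distributions:

import numpy as np

null_size = 10000
rng = np.random.default_rng(0)

# Stand-in for compute.get_null_dists: one null AP distribution per
# member configuration of a 3-profile group.
null_dists = rng.beta(2, 5, size=(3, null_size)).astype(np.float32)

# Element-wise mean across members: a null distribution of mAP values.
null_map = null_dists.mean(axis=0)

map_score = 0.62  # observed mean average precision for the group
num = (null_map > map_score).sum()
p_value = (num + 1) / (null_size + 1)  # smoothed right-tailed p-value

Averaging the member nulls element-wise is what makes this a test of the mean: each permuted replicate is itself a mAP over the same pair configurations as the observed group.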
@@ -108,7 +126,6 @@ def run_pipeline(meta,
                  pos_diffby,
                  neg_sameby,
                  neg_diffby,
-                 null_size,
                  batch_size=20000,
                  seed=0) -> pd.DataFrame:
     columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby)
@@ -147,15 +164,8 @@ def run_pipeline(meta,
     logger.info('Computing average precision...')
     ap_scores, null_confs = compute.compute_ap_contiguous(rel_k_list, counts)
 
-    logger.info('Computing p-values...')
-    p_values = compute.compute_p_values(ap_scores,
-                                        null_confs,
-                                        null_size,
-                                        seed=seed)
-
     logger.info('Creating result DataFrame...')
     meta.loc[paired_ix, 'average_precision'] = ap_scores
-    meta.loc[paired_ix, 'p_value'] = p_values
     meta.loc[paired_ix, "n_pos_pairs"] = null_confs[:, 0]
     meta.loc[paired_ix, "n_total_pairs"] = null_confs[:, 1]
     logger.info('Finished.')
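
Taken together, run_pipeline now stops at per-profile AP scores plus their pair counts, and the hypothesis test moves into aggregate, which takes null_size and seed. A hedged sketch of the resulting two-step call; the column names and toy data are made up, and the full run_pipeline signature (including the feats argument) is assumed from the parts abbreviated in the hunks above:

import numpy as np
import pandas as pd

from copairs.map import aggregate, run_pipeline

# Toy input: 12 profiles of 4 compounds across 3 plates, 8 features each.
rng = np.random.default_rng(0)
meta = pd.DataFrame({'compound': np.repeat(['a', 'b', 'c', 'd'], 3),
                     'plate': list('xyz') * 4})
feats = rng.normal(size=(len(meta), 8))

# Step 1: per-profile AP scores; run_pipeline no longer takes null_size.
ap_scores = run_pipeline(meta, feats,
                         pos_sameby=['compound'], pos_diffby=['plate'],
                         neg_sameby=[], neg_diffby=['compound'],
                         batch_size=20000, seed=0)

# Step 2: mAP per compound, with the permutation test on the group mean.
map_scores = aggregate(ap_scores, sameby='compound',
                       null_size=1000, threshold=0.05, seed=0)

Splitting the two steps lets the AP scores be computed once and re-aggregated under different groupings or null sizes.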
