Skip to content

Commit

Permalink
Merge pull request #47 from cytomining/minor_fixes
Browse files Browse the repository at this point in the history
Minor fixes
  • Loading branch information
johnarevalo authored Nov 13, 2023
2 parents 00055c2 + 3b5c727 commit de0c599
Showing 1 changed file with 68 additions and 40 deletions.
108 changes: 68 additions & 40 deletions src/copairs/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def create_matcher(obs: pd.DataFrame,


def aggregate(result: pd.DataFrame, sameby, threshold: float) -> pd.DataFrame:
agg_rs = result.groupby(sameby, as_index=False).agg({
agg_rs = result.groupby(sameby, as_index=False, observed=True).agg({
'average_precision':
'mean',
'p_value':
Expand All @@ -75,36 +75,50 @@ def aggregate(result: pd.DataFrame, sameby, threshold: float) -> pd.DataFrame:
agg_rs.rename({'p_value': 'nlog10pvalue'}, axis=1, inplace=True)
agg_rs['above_p_threshold'] = agg_rs['nlog10pvalue'] > -np.log10(threshold)
agg_rs['above_q_threshold'] = agg_rs['nlog10qvalue'] > -np.log10(threshold)
agg_rs.rename(columns={'average_precision': 'mean_average_precision'}, inplace=True)
agg_rs.rename(columns={'average_precision': 'mean_average_precision'},
inplace=True)
return agg_rs


def build_rank_lists(pos_pairs, neg_pairs, pos_dists, neg_dists):
labels = np.concatenate([np.ones(pos_pairs.size, dtype=np.int32),
np.zeros(neg_pairs.size, dtype=np.int32)])
labels = np.concatenate([
np.ones(pos_pairs.size, dtype=np.int32),
np.zeros(neg_pairs.size, dtype=np.int32)
])
ix = np.concatenate([pos_pairs.ravel(), neg_pairs.ravel()])
dist_all = np.concatenate([np.repeat(pos_dists, 2), np.repeat(neg_dists, 2)])
dist_all = np.concatenate(
[np.repeat(pos_dists, 2),
np.repeat(neg_dists, 2)])
ix_sort = np.lexsort([1 - dist_all, ix])
rel_k_list = labels[ix_sort]
_, counts = np.unique(ix, return_counts=True)
return rel_k_list, counts


def run_pipeline(
meta,
feats,
pos_sameby,
pos_diffby,
neg_sameby,
neg_diffby,
null_size,
batch_size=20000,
seed=0
) -> pd.DataFrame:
def validate_pipeline_input(meta, feats, columns):
if meta[columns].isna().any(axis=None):
raise ValueError('metadata columns should not have null values.')
if len(meta) != len(feats):
raise ValueError('meta and feats have different number of rows')


def run_pipeline(meta,
feats,
pos_sameby,
pos_diffby,
neg_sameby,
neg_diffby,
null_size,
batch_size=20000,
seed=0) -> pd.DataFrame:
columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby)
validate_pipeline_input(meta, feats, columns)

# Critical!, otherwise the indexing wont work
meta = meta.reset_index(drop=True).copy()
logger.info('Indexing metadata...')
matcher = create_matcher(meta, pos_sameby, pos_diffby, neg_sameby, neg_diffby)
matcher = create_matcher(meta, pos_sameby, pos_diffby, neg_sameby,
neg_diffby)

logger.info('Finding positive pairs...')
pos_pairs = matcher.get_all_pairs(sameby=pos_sameby, diffby=pos_diffby)
Expand All @@ -127,17 +141,23 @@ def run_pipeline(
neg_dists = compute.pairwise_cosine(feats, neg_pairs, batch_size)

logger.info('Building rank lists...')
rel_k_list, counts = build_rank_lists(pos_pairs, neg_pairs, pos_dists, neg_dists)
rel_k_list, counts = build_rank_lists(pos_pairs, neg_pairs, pos_dists,
neg_dists)

logger.info('Computing average precision...')
ap_scores, null_confs = compute.compute_ap_contiguous(rel_k_list, counts)

logger.info('Computing p-values...')
p_values = compute.compute_p_values(ap_scores, null_confs, null_size, seed=seed)
p_values = compute.compute_p_values(ap_scores,
null_confs,
null_size,
seed=seed)

logger.info('Creating result DataFrame...')
meta['average_precision'] = ap_scores
meta['p_value'] = p_values
meta["n_pos_pairs"] = null_confs[:, 0]
meta["n_total_pairs"] = null_confs[:, 1]
logger.info('Finished.')
return meta

Expand All @@ -161,10 +181,12 @@ def negs_for(query: np.ndarray):
slices = compute.concat_ranges(start, end)
batch_dists = neg_dists[slices]
return batch_dists, sizes

return negs_for


def build_rank_lists_multi(pos_pairs, pos_dists, pos_counts, negs_for, null_size, seed):
def build_rank_lists_multi(pos_pairs, pos_dists, pos_counts, negs_for,
null_size, seed):
ap_scores_list, p_values_list, ix_list = [], [], []

start = 0
Expand All @@ -175,35 +197,41 @@ def build_rank_lists_multi(pos_pairs, pos_dists, pos_counts, negs_for, null_size
query = np.unique(mpos_pairs)
neg_dists, neg_counts = negs_for(query)
neg_ix = np.repeat(query, neg_counts)
labels = np.concatenate([np.ones(mpos_pairs.size, dtype=np.int32),
np.zeros(len(neg_dists), dtype=np.int32)])
labels = np.concatenate([
np.ones(mpos_pairs.size, dtype=np.int32),
np.zeros(len(neg_dists), dtype=np.int32)
])

ix = np.concatenate([mpos_pairs.ravel(), neg_ix])
dist_all = np.concatenate([np.repeat(mpos_dists, 2), neg_dists])
ix_sort = np.lexsort([1 - dist_all, ix])
rel_k_list = labels[ix_sort]
_, counts = np.unique(ix, return_counts=True)
ap_scores, null_confs = compute.compute_ap_contiguous(rel_k_list, counts)
p_values = compute.compute_p_values(ap_scores, null_confs, null_size, seed=seed)
ap_scores, null_confs = compute.compute_ap_contiguous(
rel_k_list, counts)
p_values = compute.compute_p_values(ap_scores,
null_confs,
null_size,
seed=seed)

ap_scores_list.append(ap_scores)
p_values_list.append(p_values)
ix_list.append(query)
return ap_scores_list, p_values_list, ix_list


def run_pipeline_multilabel(
meta,
feats,
pos_sameby,
pos_diffby,
neg_sameby,
neg_diffby,
null_size,
multilabel_col,
batch_size=20000,
seed=0
) -> pd.DataFrame:
def run_pipeline_multilabel(meta,
feats,
pos_sameby,
pos_diffby,
neg_sameby,
neg_diffby,
null_size,
multilabel_col,
batch_size=20000,
seed=0) -> pd.DataFrame:
columns = flatten_str_list(pos_sameby, pos_diffby, neg_sameby, neg_diffby)
validate_pipeline_input(meta, feats, columns)
# Critical!, otherwise the indexing wont work
meta = meta.reset_index(drop=True).copy()

Expand Down Expand Up @@ -238,8 +266,7 @@ def run_pipeline_multilabel(
logger.info('Computing mAP and p-values per label...')
negs_for = create_neg_query_solver(neg_pairs, neg_dists)
ap_scores_list, p_values_list, ix_list = build_rank_lists_multi(
pos_pairs, pos_dists, pos_counts, negs_for, null_size, seed
)
pos_pairs, pos_dists, pos_counts, negs_for, null_size, seed)

logger.info('Creating result DataFrame...')
results = []
Expand All @@ -248,7 +275,7 @@ def run_pipeline_multilabel(
'average_precision': ap_scores_list[i],
'p_value': p_values_list[i],
'ix': ix_list[i],
})
})
if isinstance(key, tuple):
# Is a ComposedKey
for k, v in zip(key._fields, key):
Expand All @@ -258,6 +285,7 @@ def run_pipeline_multilabel(
results.append(result)
results = pd.concat(results).reset_index(drop=True)
meta = meta.drop(multilabel_col, axis=1)
results = meta.merge(results, right_on='ix', left_index=True).drop('ix', axis=1)
results = meta.merge(results, right_on='ix', left_index=True).drop('ix',
axis=1)
logger.info('Finished.')
return results

0 comments on commit de0c599

Please sign in to comment.