Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coverage alignment heuristic #76

Merged
merged 10 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion outrank/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def main():
args = parser.parse_args()

if args.task == 'selftest':
conduct_self_test()
conduct_self_test('MI-numba-randomized')
exit()

if args.data_path is None and args.task != 'data_generator':
Expand Down
28 changes: 28 additions & 0 deletions outrank/algorithms/feature_ranking/ranking_cov_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from __future__ import annotations

import numpy as np
import numpy.typing as npt

# NOTE(review): nothing in this module draws random numbers; this call reseeds
# NumPy's global RNG as an import-time side effect. Kept for backward
# compatibility -- confirm whether any caller relies on it.
np.random.seed(123)

# Number of hash buckets used by ``max_pair_coverage``. Per the review
# discussion, this bound is tied to batch sizes, for which one million is a
# very safe upper limit.
max_size = 10**6


def max_pair_coverage(array1: npt.NDArray[np.int32], array2: npt.NDArray[np.int32]) -> float:
    """Return the share of positions taken by the most frequent value pair.

    Each position ``i`` contributes the pair ``(array1[i], array2[i])``. Pairs
    are hashed into ``max_size`` buckets and the fullest bucket is divided by
    the number of positions. Distinct pairs may collide in one bucket, so the
    result is an upper-bound estimate of the true maximal pair frequency.

    Args:
        array1: Integer-coded values of the first feature.
        array2: Integer-coded values of the second feature; must contain as
            many elements as ``array1``.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when the inputs are empty.

    Raises:
        ValueError: If the two inputs differ in length.
    """

    def hash_pair(el1: npt.NDArray[np.int64], el2: npt.NDArray[np.int64]) -> npt.NDArray[np.int64]:
        """Map every pair onto a bucket index in ``[0, max_size)``."""
        # NumPy's ``%`` takes the sign of the divisor, so negative
        # intermediate values still yield non-negative bucket indices.
        return (el1 * 1471343 - el2) % max_size

    # Widen to int64 before hashing: the multiplication overflows int32 for
    # values of roughly 1460 and above.
    first = np.asarray(array1, dtype=np.int64).ravel()
    second = np.asarray(array2, dtype=np.int64).ravel()

    if first.size != second.size:
        raise ValueError(
            f'array1 and array2 must have the same length, got {first.size} and {second.size}',
        )

    tot_len = first.size
    if tot_len == 0:
        # No pairs at all; avoid the 0/0 division.
        return 0.0

    # Vectorised bucket counting replaces the per-element Python loop and the
    # fixed one-million-slot table allocated on every call.
    counts = np.bincount(hash_pair(first, second))
    return counts.max() / tot_len


if __name__ == '__main__':
    # Smoke check: the pair (1, 0) fills four of every eight positions, so
    # the expected coverage is exactly one half.
    repeats = 100000
    first_pattern = [1, 1, 2, 3, 1, 1, 1, 5]
    second_pattern = [0, 0, 5, 5, 3, 0, 0, 0]

    array1 = np.array(first_pattern * repeats)
    array2 = np.array(second_pattern * repeats)

    coverage = max_pair_coverage(array1, array2)
    assert coverage == 0.5
3 changes: 3 additions & 0 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

from outrank.algorithms.feature_ranking import ranking_cov_alignment
from outrank.core_utils import is_prior_heuristic

logger = logging.getLogger('syn-logger')
Expand Down Expand Up @@ -129,6 +130,8 @@ def get_importances_estimate_pairwise(combination, reference_model_features, arg
estimate_feature_importance = sklearn_surrogate(
vector_first, vector_second, X, args.heuristic,
)
elif 'max-value-coverage' in args.heuristic:
estimate_feature_importance = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)

elif 'MI-numba' in args.heuristic:
estimate_feature_importance = numba_mi(
Expand Down
10 changes: 6 additions & 4 deletions outrank/task_selftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
logger.setLevel(logging.DEBUG)


def conduct_self_test():
def conduct_self_test(heuristic='MI-numba-randomized'):
# Simulate full flow, ranking only
subprocess.run(
'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
)
subprocess.run(
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
f'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --heuristic {heuristic};',
shell=True,
)

Expand All @@ -39,8 +39,10 @@ def conduct_self_test():
logger.info(f'Removing {path} as part of cleanup ..')
shutil.rmtree(path)

logger.info('All tests passed, OutRank seems in shape \N{winking face}')
logger.info(f'All tests passed for heuristic: {heuristic} \N{rocket}')


if __name__ == '__main__':
conduct_self_test()
conduct_self_test('MI-numba-randomized')
conduct_self_test('max-value-coverage')
logger.info('OutRank seems in shape \N{winking face}')
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _read_description():
packages = [x for x in setuptools.find_packages() if x != 'test']
setuptools.setup(
name='outrank',
version='0.96.0',
version='0.96.1',
description='OutRank: Feature ranking for massive sparse data sets.',
long_description=_read_description(),
long_description_content_type='text/markdown',
Expand Down
56 changes: 56 additions & 0 deletions tests/cov_heu_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations

import sys
import unittest

import numpy as np

from outrank.algorithms.feature_ranking.ranking_cov_alignment import \
max_pair_coverage

np.random.seed(123)
sys.path.append('./outrank')


class TestMaxPairCoverage(unittest.TestCase):
    """Checks ``max_pair_coverage`` on small hand-made and large random inputs."""

    def test_basic_functionality(self):
        # (1, 4) and (2, 5) each occur twice among five positions.
        coverage = max_pair_coverage(
            np.array([1, 2, 3, 1, 2]),
            np.array([4, 5, 6, 4, 5]),
        )
        self.assertAlmostEqual(coverage, 2 / 5, places=5)

    def test_identical_elements(self):
        # Both inputs hold the same constant, so one pair covers everything.
        ones_a = np.array([1, 1, 1, 1])
        ones_b = np.array([1, 1, 1, 1])
        self.assertEqual(max_pair_coverage(ones_a, ones_b), 1.0)

    def test_large_arrays(self):
        # Random data: only the valid range of the score is checked.
        left = np.random.randint(0, 100, size=10000)
        right = np.random.randint(0, 100, size=10000)
        coverage = max_pair_coverage(left, right)
        self.assertTrue(0 <= coverage <= 1)

    def test_all_unique_pairs(self):
        # Five positions, five different pairs.
        left = np.array([1, 2, 3, 4, 5])
        right = np.array([6, 7, 8, 9, 10])
        self.assertEqual(max_pair_coverage(left, right), 1 / 5)

    def test_all_same_pairs(self):
        # The single pair (1, 2) is repeated at every position.
        left = np.array([1, 1, 1, 1, 1])
        right = np.array([2, 2, 2, 2, 2])
        self.assertEqual(max_pair_coverage(left, right), 1.0)

    def test_high_collision_potential(self):
        # One thousand repetitions of the same pair land in one bucket.
        left = np.array([1] * 1000)
        right = np.array([2] * 1000)
        self.assertEqual(max_pair_coverage(left, right), 1.0)

    def test_very_large_arrays(self):
        # One million random positions: again only the range is checked.
        left = np.random.randint(0, 1000, size=1000000)
        right = np.random.randint(0, 1000, size=1000000)
        coverage = max_pair_coverage(left, right)
        self.assertTrue(0 <= coverage <= 1)
Loading