fixed tests, fixed errors in generator.py (#75)
Errors found in the logical combination functions (_xor, _or, and _and all seeded their result with np.bitwise_xor instead of the respective operation), and an error in generate_labels when passing a distribution list (removed unnecessary prints, fixed percentile boundary selection). Also fixed the errors in the tests of the logical combinations.
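As a minimal sketch of the copy-paste bug described above (illustrative, not part of the commit): before this fix, _and and _or seeded their reductions with np.bitwise_xor, so the first two rows were always XORed regardless of the requested operation.

    import numpy as np

    a = np.array([1, 0, 1])
    b = np.array([0, 1, 1])

    # Pre-fix behaviour of _and: the first pair was XORed, not ANDed
    buggy = np.bitwise_xor(a, b)  # -> array([1, 1, 0])
    fixed = np.bitwise_and(a, b)  # -> array([0, 0, 1])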
98MM authored Jul 19, 2024
1 parent 39b13fd commit 92c1e2e
Showing 2 changed files with 78 additions and 57 deletions.
61 changes: 36 additions & 25 deletions outrank/algorithms/synthetic_data_generators/cc_generator.py
@@ -40,6 +40,7 @@ def generate_data(
random_values: bool | None = False,
low: int | None = 0,
high: int | None = 1000,
+ k: int | float = 10,
seed: int = 42,
) -> np.ndarray:

@@ -53,6 +54,7 @@ def generate_data(
:param random_values: flag, enables random (integer) feature values from set [low, high]
:param low: sets lower bound of random feature values
:param high: sets high bound of random feature values
+ :param k: scale constant for normal distribution, default 10, sets width of normal distribution, larger value -> narrower peak
:param seed: sets seed of numpy random
:return: X, 2D dataset
"""
@@ -69,7 +71,7 @@ def generate_data(
})

np.random.seed(seed)
- X = np.empty([n_features, n_samples])
+ X = np.empty([n_features, n_samples], dtype='int32')

# No specific structure parameter passed
if structure is None:
@@ -81,6 +83,7 @@ def generate_data(
random_values=random_values,
low=low,
high=high,
+ k=k,
)
X[i] = x
# Structure parameter passed, building based on structure
@@ -102,6 +105,7 @@ def generate_data(
random_values=random_values,
low=low,
high=high,
+ k=k,
)
X[ix] = x
ix += 1
@@ -132,6 +136,7 @@ def generate_data(
random_values=random_values,
low=low,
high=high,
+ k=k,
)
X[ix] = x
ix += 1
@@ -158,6 +163,7 @@ def generate_data(
random_values=random_values,
low=low,
high=high,
+ k=k,
)
X[i] = x

@@ -171,6 +177,7 @@ def _configure_generate_feature(
random_values: bool | None = False,
low: int | None = 0,
high: int | None = 1000,
+ k: int | float = 10,
) -> np.ndarray:

"""
@@ -181,6 +188,7 @@ def _configure_generate_feature(
:param random_values: randomly picked values for vec if true, otherwise values range from [low, cardinality] with by 1
:param low: lower bound of random feature vector values
:param high: upper bound of random feature vector values
+ :param k: scale constant for normal distribution, default 10, sets width of normal distribution, larger value -> narrower peak
:return: feature vector
"""

@@ -194,6 +202,7 @@ def _configure_generate_feature(
random_values=random_values,
low=low,
high=high,
+ k=k,
)
# feature_cardinality is a list of [value_domain, value_frequencies]
else:
@@ -212,6 +221,7 @@ def _configure_generate_feature(
n_samples,
vec=value_domain,
ensure_rep=ensure_rep,
+ k=k,
)

return x
@@ -226,6 +236,7 @@ def _generate_feature(
low: int | None = 0,
high: int | None = 1000,
p: list[float] | np.ndarray | None = None,
+ k: int | float = 10,
) -> np.ndarray:
"""
Generates feature vector of length size. Default probability density distribution is approximately normal, centred around a randomly picked value.
@@ -237,20 +248,23 @@ def _generate_feature(
:param low: lower bound of random feature vector values
:param high: upper bound of random feature vector values
:param p: list of probabilities of each value
+ :param k: scale constant for normal distribution, default 10, sets width of normal distribution, larger value -> narrower peak
:return: feature vector x
"""

if vec is None:
if random_values:
- vec = np.random.choice(range(low, high + 1), cardinality, replace=False)
+ vec = range(low, high + 1)
+ vec = np.random.choice(vec, size=cardinality, replace=False)
else:
vec = np.arange(low, low + cardinality, 1)
else:
vec = np.array(vec)

+ vec_len = len(vec)
if p is None:
v_shift = vec - vec[np.random.randint(len(vec))]
- p = norm.pdf(v_shift, scale=3)
+ p = norm.pdf(v_shift, scale=vec_len/k)
else:
p = np.array(p)

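An illustrative sketch (with an assumed synthetic domain, not code from this diff) of what the new k parameter changes: sampling probabilities come from a normal pdf over the shifted value domain, and the pdf's width now scales with the domain size instead of the previous fixed scale=3, so high-cardinality features no longer concentrate almost all mass on a narrow band of values.

    import numpy as np
    from scipy.stats import norm

    vec = np.arange(1000)                           # hypothetical value domain
    v_shift = vec - vec[np.random.randint(len(vec))]
    p_old = norm.pdf(v_shift, scale=3)              # fixed width: mass only near the peak
    p_new = norm.pdf(v_shift, scale=len(vec) / 10)  # k=10: width tracks cardinality
    p_new /= p_new.sum()                            # normalise before np.random.choice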
@@ -263,7 +277,7 @@ def _generate_feature(
sampled_values = np.random.choice(vec, size=size, p=p)

np.random.shuffle(sampled_values)
- return sampled_values
+ return sampled_values.astype('int32')

def generate_combinations(
self,
@@ -309,6 +323,7 @@ def _xor(self, arr: list[int] | ArrayLike) -> np.ndarray:
:param arr: features to perform XOR operation on
:return: bitwise XOR result
"""
+ arr = np.array(arr)
arrT = arr.T
arrT = arrT.astype(int)
out = np.bitwise_xor(arrT[0], arrT[1])
Expand All @@ -324,9 +339,10 @@ def _and(self, arr: list[int] | ArrayLike) -> np.ndarray:
:param arr: features to perform AND operation on
:return: bitwise AND result
"""
+ arr = np.array(arr)
arrT = arr.T
arrT = arrT.astype(int)
- out = np.bitwise_xor(arrT[0], arrT[1])
+ out = np.bitwise_and(arrT[0], arrT[1])
if len(arrT) > 2:
for i in range(2, len(arrT)):
out = np.bitwise_and(out, arrT[i])
Expand All @@ -339,9 +355,10 @@ def _or(self, arr: list[int] | ArrayLike) -> np.ndarray:
:param arr: features to perform OR operation on
:return: bitwise OR result
"""
+ arr = np.array(arr)
arrT = arr.T
arrT = arrT.astype(int)
- out = np.bitwise_xor(arrT[0], arrT[1])
+ out = np.bitwise_or(arrT[0], arrT[1])
if len(arrT) > 2:
for i in range(2, len(arrT)):
out = np.bitwise_or(out, arrT[i])
Expand Down Expand Up @@ -458,8 +475,7 @@ def generate_labels(
if isinstance(p, (list, np.ndarray)):
if sum(p) > 1: raise ValueError('sum of values in must be less than 1.0')
if len(p) > n: raise ValueError('length of p must equal n')
-
- if p > 1: raise ValueError('p must be less than 1.0')
+ elif p > 1.0: raise ValueError('p must be less than 1.0')

n_samples, n_features = X.shape

@@ -494,22 +510,20 @@

for i in range(1, len(percentiles) - 1):
percentiles[i] += percentiles[i - 1]
-
+ percentiles.insert(0, 0)
percentiles.pop()
- print(percentiles)

p_points = np.percentile(decision_boundary, percentiles)
- print(p_points)

y = np.zeros_like(decision_boundary, dtype=int)

for i in range(1, n):
p_point = p_points[i]
- for j in range(len(decision_boundary)):
- if decision_boundary[j] > p_point:
- y[j] += 1
+ y += np.where(decision_boundary > p_point, 1, 0)
else:
decision_boundary = decision_function(X)
+ if isinstance(p, (list, np.ndarray)):
+ p = p[0]
p_point = np.percentile(decision_boundary, p * 100)
y = np.where(decision_boundary > p_point, 1, 0)
else:
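A worked sketch of the fixed boundary selection (assuming percentiles starts as p scaled to percent, which lies outside this hunk): the cumulative proportions become percentile cut points, and each sample's label counts how many cut points its decision value exceeds.

    import numpy as np

    decision_boundary = np.random.rand(100)   # hypothetical decision values
    p = [0.2, 0.3, 0.5]                       # target class proportions
    percentiles = [q * 100 for q in p]        # [20, 30, 50]
    for i in range(1, len(percentiles) - 1):
        percentiles[i] += percentiles[i - 1]  # cumulative: [20, 50, 50]
    percentiles.insert(0, 0)                  # [0, 20, 50, 50]
    percentiles.pop()                         # cut points at [0, 20, 50]
    p_points = np.percentile(decision_boundary, percentiles)

    y = np.zeros_like(decision_boundary, dtype=int)
    for i in range(1, len(p)):
        y += np.where(decision_boundary > p_points[i], 1, 0)
    # roughly 20% of samples get label 0, 30% label 1, 50% label 2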
@@ -729,7 +743,7 @@ def downsample_dataset(
self,
X: ArrayLike,
y: list[int] | ArrayLike,
- N: int | None = None,
+ n: int | None = None,
seed: int = 42,
reshuffle: bool = False,
) -> tuple[np.ndarray, np.ndarray]:
@@ -738,7 +752,7 @@
Downsamples dataset X according to N or the number of samples in minority class, resulting in a balanced dataset.
:param X: Dataset to downsample
:param y: Labels corresponding to X
- :param N: Optional number of samples per class to downsample to
+ :param n: Optional number of samples per class to downsample to
:param seed: Seed for random state of resample function
:param reshuffle: Reshuffle the dataset after downsampling
:return: Balanced X and y after downsampling
@@ -747,10 +761,10 @@
original_shape = X.shape

values, counts = np.unique(y, return_counts=True)
- if N is None:
- N = min(counts)
+ if n is None:
+ n = min(counts)

- if N > min(counts):
+ if n > min(counts):
raise ValueError('N must be equal to or less than the number of samples in minority class')

X_arrays_list = []
@@ -760,11 +774,11 @@
X_label_downsample = resample(
X_label,
replace=True,
- n_samples=N,
+ n_samples=n,
random_state=seed,
)
X_arrays_list.append(X_label_downsample)
- ys = [label] * N
+ ys = [label] * n
y_downsampled = np.concatenate((y_downsampled, ys), axis=0)

X_downsampled = np.concatenate(X_arrays_list, axis=0)
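A hypothetical usage sketch of the renamed parameter (the instantiation is assumed, the method names are taken from this file): leaving n unset downsamples every class to the minority-class count.

    # assuming: cc = CategoricalClassification()
    X = cc.generate_data(n_features=5, n_samples=100)
    y = cc.generate_labels(X)
    X_bal, y_bal = cc.downsample_dataset(X, y, reshuffle=True)  # n=None -> minority size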
@@ -810,7 +824,4 @@ def print_dataset(
print(f'], Label: {y[n]}')
n += 1

"""
def summarize(self):
# TODO: Logging function
"""
# TODO: Logging function
74 changes: 42 additions & 32 deletions tests/cc_generator_test.py
@@ -12,8 +12,17 @@ class TestCategoricalClassification(unittest.TestCase):
def setUp(self):
self.cc_instance = CategoricalClassification()

- # def test_init(self):
- # self.assertEqual(self.cc_instance.dataset_info, '')
+ def test_init(self):
+ self.assertIsInstance(self.cc_instance, CategoricalClassification)
+ dict = {
+ 'general': {},
+ 'combinations': [],
+ 'correlations': [],
+ 'duplicates': [],
+ 'labels': {},
+ 'noise': [],
+ }
+ self.assertEqual(self.cc_instance.dataset_info, dict)

def test_generate_data_shape_and_type(self):
X = self.cc_instance.generate_data(n_features=5, n_samples=100)
@@ -105,43 +114,44 @@ def test_generate_duplicates_duplication(self):
Xt = X.T
self.assertTrue((Xt[0] == Xt[-1]).all())

- # def test_xor_operation(self):
- # a = np.array([1, 0, 1])
- # b = np.array([0, 1, 1])
- # arr = [a, b]
- # result = self.cc_instance._xor(arr)
- # expected = np.array([1, 1, 0])
- # self.assertTrue(np.array_equal(result, expected), 'XOR operation did not produce expected result')
-
- # def test_and_operation(self):
- # a = np.array([1, 0, 1])
- # b = np.array([0, 1, 1])
- # arr = np.array([a, b])
- # result = self.cc_instance._and(arr)
- # expected = np.array([0, 0, 1])
- # self.assertTrue(np.array_equal(result, expected), 'AND operation did not produce expected result')
-
- # def test_or_operation(self):
- # a = np.array([1, 0, 1])
- # b = np.array([0, 1, 1])
- # arr = [a, b]
- # result = self.cc_instance._or(arr)
- # expected = np.array([1, 1, 1])
- # self.assertTrue(np.array_equal(result, expected), 'OR operation did not produce expected result')
+ def test_xor_operation(self):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = np.array([a, b])
+ result = self.cc_instance._xor(arr)
+ expected = np.array([0, 0])
+ self.assertTrue(np.array_equal(result, expected), 'XOR operation did not produce expected result')
+
+ def test_and_operation(self):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = np.array([a, b])
+ result = self.cc_instance._and(arr)
+ expected = np.array([0, 0])
+ self.assertTrue(np.array_equal(result, expected), 'AND operation did not produce expected result')
+
+ def test_or_operation(self):
+ a = np.array([1, 0, 1])
+ b = np.array([0, 1, 1])
+ arr = np.array([a, b])
+ result = self.cc_instance._or(arr)
+ expected = np.array([1, 1])
+ self.assertTrue(np.array_equal(result, expected), 'OR operation did not produce expected result')
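A note on the expected values, worked through here for clarity (not part of the commit): arr has shape (2, 3), so arr.T has three rows of length 2 that the operators fold together. For XOR, [1,0] ^ [0,1] ^ [1,1] = [1,1] ^ [1,1] = [0,0]; for AND, [1,0] & [0,1] & [1,1] = [0,0]; for OR, [1,0] | [0,1] | [1,1] = [1,1].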

def test_generate_labels_shape_and_type(self):
X = self.cc_instance.generate_data(n_features=5, n_samples=100)
labels = self.cc_instance.generate_labels(X)
self.assertIsInstance(labels, np.ndarray, 'Output should be a numpy array')
self.assertEqual(labels.shape, (100,), 'Shape should be (n_samples,)')

- # def test_generate_labels_distribution(self):
- # X = self.cc_instance.generate_data(n_features=5, n_samples=100)
- # labels = self.cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5])
- # unique, counts = np.unique(labels, return_counts=True)
- # distribution = counts / 100
- # expected_distribution = np.array([0.2, 0.3, 0.5])
- # self.assertTrue(np.allclose(distribution, expected_distribution, atol=0.1), 'Label distribution does not match expected distribution')
+ def test_generate_labels_distribution(self):
+ X = self.cc_instance.generate_data(n_features=5, n_samples=100)
+ labels = self.cc_instance.generate_labels(X, n=3, p=[0.2, 0.3, 0.5])
+ unique, counts = np.unique(labels, return_counts=True)
+ distribution = counts / 100
+ # distribution = [round(d, 1) for d in distribution]
+ expected_distribution = np.array([0.2, 0.3, 0.5])
+ self.assertTrue(np.allclose(distribution, expected_distribution, rtol=0.1, atol=0.1), 'Label distribution does not match expected distribution')
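With only 100 samples, each observed proportion can easily drift a few hundredths from its target, and np.allclose accepts |observed - expected| <= atol + rtol * expected (e.g. up to 0.13 for the 0.3 class), which gives the re-enabled test room for sampling noise.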

def test_generate_labels_class_relation_linear(self):
X = self.cc_instance.generate_data(n_features=5, n_samples=100)