Skip to content

Commit

Permalink
fix: Count unique texts, data leaks in calculate metrics (#1438)
Browse files Browse the repository at this point in the history
* add more stat

* add more stat

* update statistics
  • Loading branch information
Samoed authored Nov 14, 2024
1 parent 3a1a470 commit dd5d226
Show file tree
Hide file tree
Showing 68 changed files with 47,767 additions and 4,820 deletions.
12 changes: 6 additions & 6 deletions mteb/abstasks/AbsTask.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,11 @@ def calculate_metadata_metrics(

descriptive_stats = {}
hf_subset_stat = "hf_subset_descriptive_stats"
pbar_split = tqdm.tqdm(self.metadata.eval_splits, desc="Processing Splits...")
eval_splits = self.metadata.eval_splits
if self.metadata.type in ["Classification", "MultilabelClassification"]:
eval_splits += ["train"]

pbar_split = tqdm.tqdm(eval_splits, desc="Processing Splits...")
for split in pbar_split:
pbar_split.set_postfix_str(f"Split: {split}")
logger.info(f"Processing metadata for split {split}")
Expand All @@ -215,12 +219,8 @@ def calculate_metadata_metrics(
if isinstance(self.metadata.eval_langs, dict)
else self.metadata.eval_langs
)
if self.metadata.type == "Classification":
eval_langs += ["train"]

pbar_subsets = tqdm.tqdm(
self.metadata.eval_langs, desc="Processing Languages..."
)
pbar_subsets = tqdm.tqdm(eval_langs, desc="Processing Languages...")
for hf_subset in pbar_subsets:
pbar_subsets.set_postfix_str(f"Language: {hf_subset}")
logger.info(f"Processing metadata for language {hf_subset}")
Expand Down
39 changes: 34 additions & 5 deletions mteb/abstasks/AbsTaskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,31 @@ class BitextDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
unique_pairs: Number of unique (sentence1, sentence2) pairs
min_sentence1_length: Minimum length of sentence1
average_sentence1_length: Average length of sentence1
max_sentence1_length: Maximum length of sentence1
unique_sentence1: Number of unique sentences in sentence1
min_sentence2_length: Minimum length of sentence2
average_sentence2_length: Average length of sentence2
max_sentence2_length: Maximum length of sentence2
unique_sentence2: Number of unique sentences in sentence2
"""

num_samples: int
number_of_characters: int
unique_pairs: int

min_sentence1_length: int
average_sentence1_length: float
max_sentence1_length: int
unique_sentence1: int

min_sentence2_length: int
average_sentence2_length: float
max_sentence2_length: int
unique_sentence2: int


class AbsTaskBitextMining(AbsTask):
Expand Down Expand Up @@ -153,12 +170,24 @@ def _calculate_metrics_from_split(
sent_1, sent_2 = pairs_cols[0]
sentence1 = self.dataset[split][sent_1]
sentence2 = self.dataset[split][sent_2]
total_s1_len = sum([len(s1) for s1 in sentence1])
total_s2_len = sum([len(s2) for s2 in sentence2])

s1_len = [len(s1) for s1 in sentence1]
s2_len = [len(s2) for s2 in sentence2]
total_s1_len = sum(s1_len)
total_s2_len = sum(s2_len)

unique_pairs = len(set(zip(sentence1, sentence2)))
unique_sentence1 = len(set(sentence1))
unique_sentence2 = len(set(sentence2))
return BitextDescriptiveStatistics(
average_sentence1_length=total_s1_len / len(sentence1),
average_sentence2_length=total_s2_len / len(sentence2),
num_samples=len(sentence1),
number_of_characters=total_s1_len + total_s2_len,
unique_pairs=unique_pairs,
min_sentence1_length=min(s1_len),
average_sentence1_length=sum(s1_len) / len(sentence1),
max_sentence1_length=max(s1_len),
unique_sentence1=unique_sentence1,
min_sentence2_length=min(s2_len),
average_sentence2_length=total_s2_len / len(sentence2),
max_sentence2_length=max(s2_len),
unique_sentence2=unique_sentence2,
)
29 changes: 28 additions & 1 deletion mteb/abstasks/AbsTaskClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,26 @@ class ClassificationDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
num_texts_in_train: Number of evaluation-split texts that also appear in the train split (data-leak count); None when the split itself is "train"
min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_text: Number of unique texts
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int
num_texts_in_train: int | None

min_text_length: int
average_text_length: float
max_text_length: int
unique_text: int

unique_labels: int
labels: dict[str, dict[str, int]]

Expand Down Expand Up @@ -205,25 +217,40 @@ def _undersample_data(self, X, y, samples_per_label: int, idxs=None):
def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> ClassificationDescriptiveStatistics:
train_text = []
if hf_subset:
text = self.dataset[hf_subset][split]["text"]
label = self.dataset[hf_subset][split]["label"]
if split != "train":
train_text = self.dataset[hf_subset]["train"]["text"]
elif compute_overall:
text = []
label = []
for hf_subset in self.metadata.eval_langs:
text.extend(self.dataset[hf_subset][split]["text"])
label.extend(self.dataset[hf_subset][split]["label"])
if split != "train":
train_text.extend(self.dataset[hf_subset]["train"]["text"])
else:
text = self.dataset[split]["text"]
label = self.dataset[split]["label"]
if split != "train":
train_text = self.dataset["train"]["text"]

total_text_len = sum([len(t) for t in text])
text_len = [len(t) for t in text]
total_text_len = sum(text_len)
label_count = Counter(label)
num_texts_in_train = (
len(set(text) & set(train_text)) if split != "train" else None
)
return ClassificationDescriptiveStatistics(
num_samples=len(text),
number_of_characters=total_text_len,
num_texts_in_train=num_texts_in_train,
min_text_length=min(text_len),
average_text_length=total_text_len / len(text),
max_text_length=max(text_len),
unique_text=len(set(text)),
unique_labels=len(label_count),
labels={
str(label): {"count": count} for label, count in label_count.items()
Expand Down
26 changes: 25 additions & 1 deletion mteb/abstasks/AbsTaskClustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,31 @@ class ClusteringDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_texts: Number of unique texts
min_labels_per_text: Minimum number of labels per text
average_labels_per_text: Average number of labels per text
max_labels_per_text: Maximum number of labels per text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int

min_text_length: int
average_text_length: float
max_text_length: int
unique_texts: int

min_labels_per_text: int
average_labels_per_text: float
max_labels_per_text: int

unique_labels: int
labels: dict[str, dict[str, int]]

Expand Down Expand Up @@ -96,7 +111,11 @@ def _calculate_metrics_from_split(
sentences = self.dataset[split]["sentences"]
labels = self.dataset[split]["labels"]

total_text_len = sum([len(t) for t in sentences])
text_len = [len(t) for t in sentences]
all_sentences = []
for s in sentences:
all_sentences.extend(s)
total_text_len = sum(text_len)
total_labels = []
for label in labels:
if isinstance(label, list):
Expand All @@ -107,8 +126,13 @@ def _calculate_metrics_from_split(
return ClusteringDescriptiveStatistics(
num_samples=len(sentences),
number_of_characters=total_text_len,
min_text_length=min(text_len),
average_text_length=total_text_len / len(sentences),
max_text_length=max(text_len),
unique_texts=len(set(all_sentences)),
min_labels_per_text=min(label_counter.values()),
average_labels_per_text=len(total_labels) / len(sentences),
max_labels_per_text=max(label_counter.values()),
unique_labels=len(label_counter),
labels={
str(label): {
Expand Down
21 changes: 20 additions & 1 deletion mteb/abstasks/AbsTaskClusteringFast.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,30 @@ class ClusteringFastDescriptiveStatistics(DescriptiveStatistics):
Attributes:
num_samples: number of samples in the dataset.
number_of_characters: Total number of symbols in the dataset.
min_text_length: Minimum length of text
average_text_length: Average length of text
max_text_length: Maximum length of text
unique_texts: Number of unique texts
min_labels_per_text: Minimum number of labels per text
average_labels_per_text: Average number of labels per text
max_labels_per_text: Maximum number of labels per text
unique_labels: Number of unique labels
labels: dict of label frequencies
"""

num_samples: int
number_of_characters: int

min_text_length: int
average_text_length: float
max_text_length: int
unique_texts: int

min_labels_per_text: int
average_labels_per_text: float
max_labels_per_text: int
unique_labels: int
labels: dict[str, dict[str, int]]

Expand Down Expand Up @@ -226,7 +240,8 @@ def _calculate_metrics_from_split(
sentences = self.dataset[split]["sentences"]
labels = self.dataset[split]["labels"]

total_text_len = sum([len(t) for t in sentences])
text_len = [len(t) for t in sentences]
total_text_len = sum(text_len)
total_labels = []
for label in labels:
if isinstance(label, list):
Expand All @@ -237,8 +252,12 @@ def _calculate_metrics_from_split(
return ClusteringFastDescriptiveStatistics(
num_samples=len(sentences),
number_of_characters=total_text_len,
min_text_length=min(text_len),
average_text_length=total_text_len / len(sentences),
max_text_length=max(text_len),
min_labels_per_text=min(label_counter.values()),
average_labels_per_text=len(total_labels) / len(sentences),
max_labels_per_text=max(label_counter.values()),
unique_labels=len(label_counter),
labels={
str(label): {
Expand Down
Loading

0 comments on commit dd5d226

Please sign in to comment.