Refactor evaluation to allow span based metrics #71

Open · wants to merge 16 commits into base: feature/new-datagen-and-eval
9 changes: 9 additions & 0 deletions presidio_evaluator/data_objects.py
@@ -4,6 +4,7 @@
from collections import Counter

import pandas as pd
import numpy as np
import spacy
from spacy import Language
from spacy.tokens import Doc, DocBin
@@ -73,6 +74,14 @@ def intersect(self, other, ignore_entity_type: bool):
return min(self.end_position, other.end_position) - max(
self.start_position, other.start_position
)

def get_overlap_ratio(self, other):
Collaborator

Suggested change
def get_overlap_ratio(self, other):
def get_overlap_ratio(self, other: Span):

Contributor

Suggested change
def get_overlap_ratio(self, other):
def get_overlap_ratio(self, other: "Span") -> float:

I know we don't have type hints across the entire codebase, but let's at least add them to the methods we introduce, to modernize the codebase over time.

"""
Calculates the ratio as: ratio = 2.0*M / T , where M = matches , T = total number of elements in both sequences
"""
nb_matches = self.intersect(other, ignore_entity_type = True)
Collaborator

Will we always want to ignore the entity type? Perhaps we should pass it as an argument to the function?

total_characters = (self.end_position - self.start_position) + (other.end_position - other.start_position)
return np.round((2*nb_matches/total_characters), 2)
Collaborator

Is there any theoretical chance that total_characters will be equal to 0?
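For illustration, folding the review suggestions above into the method (the Span type hint, an explicit ignore_entity_type argument, and a guard against a zero-length total) could look roughly like the sketch below. It is only a sketch, assuming the Span.intersect signature shown earlier in this diff, not the change proposed in this PR:

def get_overlap_ratio(self, other: "Span", ignore_entity_type: bool = True) -> float:
    """
    Calculates the ratio as: ratio = 2.0 * M / T, where M = number of
    overlapping characters and T = total number of characters in both spans.
    """
    nb_matches = self.intersect(other, ignore_entity_type=ignore_entity_type)
    total_characters = (self.end_position - self.start_position) + (
        other.end_position - other.start_position
    )
    if total_characters == 0:
        # Both spans are empty; define the overlap as 0 to avoid dividing by zero.
        return 0.0
    return float(np.round(2 * nb_matches / total_characters, 2))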


@classmethod
def from_faker_span(cls, faker_span: FakerSpan) -> "Span":
5 changes: 3 additions & 2 deletions presidio_evaluator/evaluation/__init__.py
@@ -1,5 +1,6 @@
from .model_error import ModelError
from .evaluator_objects import SpanOutput, TokenOutput, ModelPrediction
from .sample_error import SampleError
from .evaluation_result import EvaluationResult
from .evaluator import Evaluator

__all__ = ["ModelError", "EvaluationResult", "Evaluator"]
__all__ = ["SpanOutput", "TokenOutput", "ModelPrediction", "SampleError", "EvaluationResult", "Evaluator"]
219 changes: 122 additions & 97 deletions presidio_evaluator/evaluation/evaluation_result.py
@@ -2,110 +2,135 @@
from collections import Counter
from typing import List, Optional, Dict, Tuple

from presidio_evaluator.evaluation import ModelError
from presidio_evaluator.evaluation import SampleError


class EvaluationResult:
def __init__(
self,
results: Counter,
model_errors: Optional[List[ModelError]] = None,
text: Optional[str] = None,
pii_recall: Optional[float] = None,
pii_precision: Optional[float] = None,
pii_f: Optional[float] = None,
n: Optional[int] = None,
entity_recall_dict: Optional[Dict[str, float]] = None,
entity_precision_dict: Optional[Dict[str, float]] = None,
n_dict: Optional[Dict[str, int]] = None,
sample_errors: List[SampleError],
token_confusion_matrix: Counter,
token_model_metrics: Dict[str, Dict[str, float]],
span_model_metrics: Dict[str, Dict[str, float]]

):
"""
Holds the output of a comparison between ground truth and predicted
:param results: List of objects of type Counter
Holds the output of token and span evaluation for a given dataset
:param model_errors: List of token and span errors for further inspection
:param token_confusion_matrix: List of objects of type Counter
with structure {(actual, predicted) : count}
:param model_errors: List of specific model errors for further inspection
:param text: sample's full text (if used for one sample)
:param pii_recall: Recall for all entities (PII or not)
:param pii_precision: Precision for all entities (PII or not)
:param pii_f: F measure for all entities (PII or not)
:param n: Number of total entity tokens
:param entity_recall_dict: Recall per entity
:param entity_precision_dict: Precision per entity
:param n_dict: Number of tokens per entity
:param token_model_metrics: metrics calculated based on token results
:param span_model_metrics: metrics calculated based on span results
"""

self.results = results
self.model_errors = model_errors
self.text = text

self.pii_recall = pii_recall
self.pii_precision = pii_precision
self.pii_f = pii_f
self.n = n
self.entity_recall_dict = entity_recall_dict
self.entity_precision_dict = entity_precision_dict
self.n_dict = n_dict

def __str__(self):
return_str = ""
if not self.entity_precision_dict or not self.entity_recall_dict:
return json.dumps(self.results)

entities = self.n_dict.keys()

row_format = "{:>20}{:>20.2%}{:>20.2%}{:>20}"
header_format = "{:>20}" * 4
return_str += str(
header_format.format(
*("Entity", "Precision", "Recall", "Number of samples")
)
)
for entity in entities:
return_str += "\n" + row_format.format(
entity,
self.entity_precision_dict[entity],
self.entity_recall_dict[entity],
self.n_dict[entity],
)

# add PII values
return_str += "\n" + row_format.format(
"PII",
self.pii_precision,
self.pii_recall,
self.n,
)

return_str += f"\nPII F measure: {self.pii_f:.2%}"
return return_str

def __repr__(self):
return f"stats={self.results}"

def to_log(self):
metrics_dict = {
"pii_f": self.pii_f,
}
if self.entity_precision_dict:
metrics_dict.update(
{
f"{ent}_precision": v
for (ent, v) in self.entity_precision_dict.items()
}
)
if self.entity_recall_dict:
metrics_dict.update(
{f"{ent}_recall": v for (ent, v) in self.entity_recall_dict.items()}
)
if self.n:
metrics_dict.update(self.n_dict)
return metrics_dict

def to_confusion_matrix(self) -> Tuple[List[str], List[List[int]]]:
entities = sorted(list(set(self.n_dict.keys()).union("O")))
confusion_matrix = [[0] * len(entities) for _ in range(len(entities))]
for i, actual in enumerate(entities):
for j, predicted in enumerate(entities):
confusion_matrix[i][j] = self.results[(actual, predicted)]

return entities, confusion_matrix
self.sample_errors = sample_errors
self.token_confusion_matrix = token_confusion_matrix
self.token_model_metrics = token_model_metrics
self.span_model_metrics = span_model_metrics
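For reference, constructing the new result object would look something like the sketch below; the entity names and metric values are invented purely for illustration:

from collections import Counter
from presidio_evaluator.evaluation import EvaluationResult

result = EvaluationResult(
    sample_errors=[],  # List[SampleError] collected during evaluation
    token_confusion_matrix=Counter(
        {("O", "O"): 150, ("PERSON", "PERSON"): 47, ("PERSON", "O"): 3}
    ),
    token_model_metrics={"PERSON": {"precision": 0.94, "recall": 0.92}},
    span_model_metrics={"PERSON": {"precision": 0.90, "recall": 0.88}},
)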


# TODO: Review and refactor the method in old EvaluationResult to new one
# class EvaluationResult:
# def __init__(
# self,
# results: Counter,
# model_errors: Optional[List[ModelError]] = None,
# text: Optional[str] = None,
# pii_recall: Optional[float] = None,
# pii_precision: Optional[float] = None,
# pii_f: Optional[float] = None,
# n: Optional[int] = None,
# entity_recall_dict: Optional[Dict[str, float]] = None,
# entity_precision_dict: Optional[Dict[str, float]] = None,
# n_dict: Optional[Dict[str, int]] = None,
# ):
# """
# Holds the output of a comparison between ground truth and predicted
# :param results: List of objects of type Counter
# with structure {(actual, predicted) : count}
# :param model_errors: List of specific model errors for further inspection
# :param text: sample's full text (if used for one sample)
# :param pii_recall: Recall for all entities (PII or not)
# :param pii_precision: Precision for all entities (PII or not)
# :param pii_f: F measure for all entities (PII or not)
# :param n: Number of total entity tokens
# :param entity_recall_dict: Recall per entity
# :param entity_precision_dict: Precision per entity
# :param n_dict: Number of tokens per entity
# """

# self.results = results
# self.model_errors = model_errors
# self.text = text

# self.pii_recall = pii_recall
# self.pii_precision = pii_precision
# self.pii_f = pii_f
# self.n = n
# self.entity_recall_dict = entity_recall_dict
# self.entity_precision_dict = entity_precision_dict
# self.n_dict = n_dict

# def __str__(self):
# return_str = ""
# if not self.entity_precision_dict or not self.entity_recall_dict:
# return json.dumps(self.results)

# entities = self.n_dict.keys()

# row_format = "{:>20}{:>20.2%}{:>20.2%}{:>20}"
# header_format = "{:>20}" * 4
# return_str += str(
# header_format.format(
# *("Entity", "Precision", "Recall", "Number of samples")
# )
# )
# for entity in entities:
# return_str += "\n" + row_format.format(
# entity,
# self.entity_precision_dict[entity],
# self.entity_recall_dict[entity],
# self.n_dict[entity],
# )

# # add PII values
# return_str += "\n" + row_format.format(
# "PII",
# self.pii_precision,
# self.pii_recall,
# self.n,
# )

# return_str += f"\nPII F measure: {self.pii_f:.2%}"
# return return_str

# def __repr__(self):
# return f"stats={self.results}"

# def to_log(self):
# metrics_dict = {
# "pii_f": self.pii_f,
# }
# if self.entity_precision_dict:
# metrics_dict.update(
# {
# f"{ent}_precision": v
# for (ent, v) in self.entity_precision_dict.items()
# }
# )
# if self.entity_recall_dict:
# metrics_dict.update(
# {f"{ent}_recall": v for (ent, v) in self.entity_recall_dict.items()}
# )
# if self.n:
# metrics_dict.update(self.n_dict)
# return metrics_dict

# def to_confusion_matrix(self) -> Tuple[List[str], List[List[int]]]:
# entities = sorted(list(set(self.n_dict.keys()).union("O")))
# confusion_matrix = [[0] * len(entities) for _ in range(len(entities))]
# for i, actual in enumerate(entities):
# for j, predicted in enumerate(entities):
# confusion_matrix[i][j] = self.results[(actual, predicted)]

# return entities, confusion_matrix
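As a starting point for the TODO above, the old to_confusion_matrix could be ported onto the new fields roughly as follows. This is a sketch only; it assumes the label set can be recovered from the (actual, predicted) keys of token_confusion_matrix, since the new class no longer stores n_dict:

def to_confusion_matrix(self) -> Tuple[List[str], List[List[int]]]:
    # Derive the label set from the confusion-matrix keys, always including "O".
    entities = sorted(
        {label for pair in self.token_confusion_matrix for label in pair} | {"O"}
    )
    # Counter returns 0 for any missing (actual, predicted) pair.
    matrix = [
        [self.token_confusion_matrix[(actual, predicted)] for predicted in entities]
        for actual in entities
    ]
    return entities, matrix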