From 9018e6c9059893628d032a94ac905e085430f3f3 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 13:31:48 +0100 Subject: [PATCH 01/16] Initialize SampleError class --- presidio_evaluator/evaluation/model_error.py | 175 ------------------ presidio_evaluator/evaluation/sample_error.py | 44 +++++ 2 files changed, 44 insertions(+), 175 deletions(-) delete mode 100644 presidio_evaluator/evaluation/model_error.py create mode 100644 presidio_evaluator/evaluation/sample_error.py diff --git a/presidio_evaluator/evaluation/model_error.py b/presidio_evaluator/evaluation/model_error.py deleted file mode 100644 index cbbe241..0000000 --- a/presidio_evaluator/evaluation/model_error.py +++ /dev/null @@ -1,175 +0,0 @@ -from typing import Dict, List - -import pandas as pd -from spacy.tokens import Token - - -class ModelError: - def __init__( - self, - error_type: str, - annotation: str, - prediction: str, - token: Token, - full_text: str, - metadata: Dict, - ): - """ - Holds information about an error a model made for analysis purposes - :param error_type: str, e.g. FP, FN, Person->Address etc. - :param annotation: ground truth value - :param prediction: predicted value - :param token: token in question - :param full_text: full input text - :param metadata: metadata on text from InputSample - """ - - self.error_type = error_type - self.annotation = annotation - self.prediction = prediction - self.token = token - self.full_text = full_text - self.metadata = metadata - - def __str__(self): - return ( - "type: {}, " - "Annotation = {}, " - "prediction = {}, " - "Token = {}, " - "Full text = {}, " - "Metadata = {}".format( - self.error_type, - self.annotation, - self.prediction, - self.token, - self.full_text, - self.metadata, - ) - ) - - def __repr__(self): - return f" Date: Mon, 16 Jan 2023 15:47:48 +0100 Subject: [PATCH 02/16] Initilize TokenOutput class --- presidio_evaluator/evaluation/token_output.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 presidio_evaluator/evaluation/token_output.py diff --git a/presidio_evaluator/evaluation/token_output.py b/presidio_evaluator/evaluation/token_output.py new file mode 100644 index 0000000..d8e518a --- /dev/null +++ b/presidio_evaluator/evaluation/token_output.py @@ -0,0 +1,56 @@ +from typing import Optional, List +from spacy.tokens import Token + +from presidio_evaluator import Span, InputSample + + +class TokenOutput: + def __init__( + self, + error_type: str, + annotated_tag: str, + predicted_tag: str, + token: Token, + ): + """ + Holds information about a token error a model made for analysis purposes + :param error_type: str, e.g. FP, FN, Person->Address etc. + :param annotated_tag: str, actual label, e.g. Person + :param predicted_tag: str, predicted label, e.g. Address + :param token: str, token in question + """ + + self.error_type = error_type + self.annotated_tag = annotated_tag + self.predicted_tag = predicted_tag + self.token = token + + def __str__(self): + return ( + "type: {}, " + "Annotated tag = {}, " + "Predicted tag = {}, " + "Token = {}".format( + self.error_type, + self.annotated_tag, + self.predicted_tag, + self.token + ) + ) + + def __repr__(self): + return f" List["TokenOutput"]: + """ + Print the n most common tokens by error type + :param error_type: str, token error type, e.g. FP, FN + :param errors: List of token error in TokenOutput format. + :param n: int, top n most common fp to filter. 
+ :param entity: str, List of entities to filter, e.g. Person, Address + """ + return List["TokenOutput"] From f6c38403f025e3f240ec9c20403cb262af3b8f93 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:15:31 +0100 Subject: [PATCH 03/16] Initialize SpanOutput. Rename method in TokenOuput --- presidio_evaluator/evaluation/span_output.py | 47 +++++++++++++++++++ presidio_evaluator/evaluation/token_output.py | 12 ++--- 2 files changed, 52 insertions(+), 7 deletions(-) create mode 100644 presidio_evaluator/evaluation/span_output.py diff --git a/presidio_evaluator/evaluation/span_output.py b/presidio_evaluator/evaluation/span_output.py new file mode 100644 index 0000000..3e7d4f0 --- /dev/null +++ b/presidio_evaluator/evaluation/span_output.py @@ -0,0 +1,47 @@ +from typing import Optional, List + +from presidio_evaluator import Span + + +class SpanOutput: + def __init__( + self, + output_type: str, + overlap_score: float, + annotated_span: Optional[Span] = None, + predicted_span: Optional[Span] = None + ): + """ + Holds information about span prediction output for analysis purposes + :param error_type: str, e.g. strict, exact, partial, incorrect, miss, spurious. + :param overlap_score: float, overlapping ratio between annotated_span and predicted_span + :param annotated_span: str, actual span which comes from the annotated file, e.g. Address + :param predicted_span: str, predicted span of a given model + """ + self.output_type = output_type + self.overlap_score = overlap_score + self.annotated_span = annotated_span + self.predicted_span = predicted_span + + def __repr__(self): + return ( + f"Output type: {self.output_type}\n" + f"Overlap score: {self.overlap_score}\n" + f"Annotated span: {self.annotated_span}\n" + f"Predicted span: {self.predicted_span}\n" + ) + + @staticmethod + def get_span_output_by_type(outputs=List["SpanOutput"], + error_type=str, + n: Optional[int]=None, + entity=None) -> List["SpanOutput"]: + """ + Print the n most common tokens by error type + :param outputs: List of span errors in SpanOutput format. + :param error_type: str, span error type, e.g. strict, exact, partial, incorrect, miss, spurious + :param n: int, top n most common output to filter. If n is None, all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. If entity is None, all entities are returned. + """ + return List["SpanOutput"] + \ No newline at end of file diff --git a/presidio_evaluator/evaluation/token_output.py b/presidio_evaluator/evaluation/token_output.py index d8e518a..e9a1152 100644 --- a/presidio_evaluator/evaluation/token_output.py +++ b/presidio_evaluator/evaluation/token_output.py @@ -1,8 +1,6 @@ from typing import Optional, List from spacy.tokens import Token -from presidio_evaluator import Span, InputSample - class TokenOutput: def __init__( @@ -42,15 +40,15 @@ def __repr__(self): return f" List["TokenOutput"]: """ Print the n most common tokens by error type - :param error_type: str, token error type, e.g. FP, FN :param errors: List of token error in TokenOutput format. - :param n: int, top n most common fp to filter. - :param entity: str, List of entities to filter, e.g. Person, Address + :param error_type: str, token error type, e.g. FP, FN + :param n: int, top n most common error to filter. If n is None, all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. 
If entity is None, all entities are returned. """ return List["TokenOutput"] From c6fb0e4e2abd05d3929d02c44af58e22de553b82 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:33:34 +0100 Subject: [PATCH 04/16] Initialize Evaluator class --- presidio_evaluator/evaluation/__init__.py | 5 +- presidio_evaluator/evaluation/evaluator.py | 673 ++++++++++-------- .../evaluation/evaluator_objects.py | 115 +++ presidio_evaluator/evaluation/span_output.py | 47 -- presidio_evaluator/evaluation/token_output.py | 54 -- presidio_evaluator/evaluation_helpers.py | 0 6 files changed, 485 insertions(+), 409 deletions(-) create mode 100644 presidio_evaluator/evaluation/evaluator_objects.py delete mode 100644 presidio_evaluator/evaluation/span_output.py delete mode 100644 presidio_evaluator/evaluation/token_output.py create mode 100644 presidio_evaluator/evaluation_helpers.py diff --git a/presidio_evaluator/evaluation/__init__.py b/presidio_evaluator/evaluation/__init__.py index f2cc9cd..1ee63f2 100644 --- a/presidio_evaluator/evaluation/__init__.py +++ b/presidio_evaluator/evaluation/__init__.py @@ -1,5 +1,6 @@ -from .model_error import ModelError +from .evaluator_objects import SpanOutput, TokenOutput, ModelPrediction +from .sample_error import SampleError from .evaluation_result import EvaluationResult from .evaluator import Evaluator -__all__ = ["ModelError", "EvaluationResult", "Evaluator"] +__all__ = ["SpanOutput", "TokenOutput", "ModelPrediction", "SampleError", "EvaluationResult", "Evaluator"] \ No newline at end of file diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 6532657..2ed020b 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Tuple from pathlib import Path import numpy as np @@ -9,346 +9,407 @@ import pandas as pd from presidio_evaluator import InputSample -from presidio_evaluator.evaluation import EvaluationResult, ModelError -from presidio_evaluator.models import BaseModel +from presidio_evaluator.evaluation import (TokenOutput, + SpanOutput, + ModelPrediction, + EvaluationResult, + SampleError) class Evaluator: def __init__( self, - model: BaseModel, verbose: bool = False, compare_by_io=True, entities_to_keep: Optional[List[str]] = None, + span_overlap_threshold: float = 0.5 ): """ Evaluate a PII detection model or a Presidio analyzer / recognizer - - :param model: Instance of a fitted model (of base type BaseModel) :param compare_by_io: True if comparison should be done on the entity level and not the sub-entity level :param entities_to_keep: List of entity names to focus the evaluator on (and ignore the rest). Default is None = all entities. If the provided model has a list of entities to keep, this list would be used for evaluation. 
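+        :param span_overlap_threshold: float, overlap threshold used when matching predicted
+        spans to annotated spans during span evaluation (default 0.5)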
""" - self.model = model self.verbose = verbose self.compare_by_io = compare_by_io self.entities_to_keep = entities_to_keep - if self.entities_to_keep is None and self.model.entities: - self.entities_to_keep = self.model.entities + self.span_overlap_threshold = span_overlap_threshold - def compare(self, input_sample: InputSample, prediction: List[str]): - - """ - Compares ground truth tags (annotation) and predicted (prediction) - :param input_sample: input sample containing list of tags with scheme - :param prediction: predicted value for each token - self.labeling_scheme + def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOutput], Counter]: """ - annotation = input_sample.tags - tokens = input_sample.tokens - - if len(annotation) != len(prediction): - print( - "Annotation and prediction do not have the" - "same length. Sample={}".format(input_sample) - ) - return Counter(), [] - - results = Counter() - mistakes = [] - - new_annotation = annotation.copy() - - if self.compare_by_io: - new_annotation = self._to_io(new_annotation) - prediction = self._to_io(prediction) - - # Ignore annotations that aren't in the list of - # requested entities. - if self.entities_to_keep: - prediction = self._adjust_per_entities(prediction) - new_annotation = self._adjust_per_entities(new_annotation) - for i in range(0, len(new_annotation)): - results[(new_annotation[i], prediction[i])] += 1 - - if self.verbose: - print("Annotation:", new_annotation[i]) - print("Prediction:", prediction[i]) - print(results) - - # check if there was an error - is_error = new_annotation[i] != prediction[i] - if is_error: - if prediction[i] == "O": - mistakes.append( - ModelError( - error_type="FN", - annotation=new_annotation[i], - prediction=prediction[i], - token=tokens[i], - full_text=input_sample.full_text, - metadata=input_sample.metadata, - ) - ) - elif new_annotation[i] == "O": - mistakes.append( - ModelError( - error_type="FP", - annotation=new_annotation[i], - prediction=prediction[i], - token=tokens[i], - full_text=input_sample.full_text, - metadata=input_sample.metadata, - ) - ) - else: - mistakes.append( - ModelError( - error_type="Wrong entity", - annotation=new_annotation[i], - prediction=prediction[i], - token=tokens[i], - full_text=input_sample.full_text, - metadata=input_sample.metadata, - ) - ) - - return results, mistakes - - def _adjust_per_entities(self, tags): - if self.entities_to_keep: - return [tag if tag in self.entities_to_keep else "O" for tag in tags] - else: - return tags - - @staticmethod - def _to_io(tags): - """ - Translates BILUO/BIO/IOB to IO - only In or Out of entity. - ['B-PERSON','I-PERSON','L-PERSON'] is translated into - ['PERSON','PERSON','PERSON'] - :param tags: the input tags in BILUO/IOB/BIO format - :return: a new list of IO tags + Compares ground truth tags (annotation) and predicted (prediction) at token level. 
+ Return a list of TokenOutput and a list of objects of type Counter with structure {(actual, predicted) : count} + :param model_prediction: model_prediction containing an InputSample and a list of predicted tags and tokens """ - return [tag[2:] if "-" in tag else tag for tag in tags] - - def evaluate_sample( - self, sample: InputSample, prediction: List[str] - ) -> EvaluationResult: - if self.verbose: - print("Input sentence: {}".format(sample.full_text)) - - results, mistakes = self.compare(input_sample=sample, prediction=prediction) - return EvaluationResult(results, mistakes, sample.full_text) - - def evaluate_all(self, dataset: List[InputSample]) -> List[EvaluationResult]: - evaluation_results = [] - if self.model.entity_mapping: - print( - f"Mapping entity values using this dictionary: {self.model.entity_mapping}" - ) - for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"): - - # Align tag values to the ones expected by the model - self.model.align_entity_types(sample) - - # Predict - prediction = self.model.predict(sample) - - # Remove entities not requested - prediction = self.model.filter_tags_in_supported_entities(prediction) - - # Switch to requested labeling scheme (IO/BIO/BILUO) - prediction = self.model.to_scheme(prediction) - - evaluation_result = self.evaluate_sample( - sample=sample, prediction=prediction - ) - evaluation_results.append(evaluation_result) - return evaluation_results + return List[TokenOutput], Counter - @staticmethod - def align_entity_types( - input_samples: List[InputSample], - entities_mapping: Dict[str, str] = None, - allow_missing_mappings: bool = False, - ) -> List[InputSample]: + def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: """ - Change input samples to conform with Presidio's entities - :return: new list of InputSample + Compares ground truth tags (annotation) and predicted (prediction) at span level. 
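+        The possible span output types are the ones documented on SpanOutput
+        (e.g. strict, exact, partial, incorrect, miss, spurious).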
+ :param model_prediction: model_prediction containing an InputSample and a list of predicted tags and tokens + Returns: + List[SpanOutput]: a list of SpanOutput + dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} """ - new_input_samples = input_samples.copy() - - # A list that will contain updated input samples, - new_list = [] - - for input_sample in new_input_samples: - contains_field_in_mapping = False - new_spans = [] - # Update spans to match the entity types in the values of entities_mapping - for span in input_sample.spans: - if span.entity_type in entities_mapping.keys(): - new_name = entities_mapping.get(span.entity_type) - span.entity_type = new_name - contains_field_in_mapping = True - - new_spans.append(span) - else: - if not allow_missing_mappings: - raise ValueError( - f"Key {span.entity_type} cannot be found in the provided entities_mapping" - ) - input_sample.spans = new_spans - - # Update tags in case this sample has relevant entities for evaluation - if contains_field_in_mapping: - for i, tag in enumerate(input_sample.tags): - has_prefix = "-" in tag - if has_prefix: - prefix = tag[:2] - clean = tag[2:] - else: - prefix = "" - clean = tag - - if clean in entities_mapping.keys(): - new_name = entities_mapping.get(clean) - input_sample.tags[i] = "{}{}".format(prefix, new_name) - else: - input_sample.tags[i] = "O" - - new_list.append(input_sample) - - return new_list - # Iterate on all samples - - def calculate_score( - self, - evaluation_results: List[EvaluationResult], - entities: Optional[List[str]] = None, - beta: float = 2.5, - ) -> EvaluationResult: - """ - Returns the pii_precision, pii_recall, f_measure either and number of records for each entity - or for all entities (ignore_entity_type = True) - :param evaluation_results: List of EvaluationResult - :param entities: List of entities to calculate score to. 
Default is None: all entities - :param beta: F measure beta value - between different entity types, or to treat these as misclassifications - :return: EvaluationResult with precision, recall and f measures - """ - - # aggregate results - all_results = sum([er.results for er in evaluation_results], Counter()) - - # compute pii_recall per entity - entity_recall = {} - entity_precision = {} - n = {} - if not entities: - entities = list(set([x[0] for x in all_results.keys() if x[0] != "O"])) - - for entity in entities: - # all annotation of given type - annotated = sum([all_results[x] for x in all_results if x[0] == entity]) - predicted = sum([all_results[x] for x in all_results if x[1] == entity]) - n[entity] = annotated - tp = all_results[(entity, entity)] - - if annotated > 0: - entity_recall[entity] = tp / annotated - else: - entity_recall[entity] = np.NaN - - if predicted > 0: - per_entity_tp = all_results[(entity, entity)] - entity_precision[entity] = per_entity_tp / predicted - else: - entity_precision[entity] = np.NaN - - # compute pii_precision and pii_recall - annotated_all = sum([all_results[x] for x in all_results if x[0] != "O"]) - predicted_all = sum([all_results[x] for x in all_results if x[1] != "O"]) - if annotated_all > 0: - pii_recall = ( - sum( - [ - all_results[x] - for x in all_results - if (x[0] != "O" and x[1] != "O") - ] - ) - / annotated_all - ) - else: - pii_recall = np.NaN - if predicted_all > 0: - pii_precision = ( - sum( - [ - all_results[x] - for x in all_results - if (x[0] != "O" and x[1] != "O") - ] - ) - / predicted_all - ) - else: - pii_precision = np.NaN - # compute pii_f_beta-score - pii_f_beta = self.f_beta(pii_precision, pii_recall, beta) - - # aggregate errors - errors = [] - for res in evaluation_results: - if res.model_errors: - errors.extend(res.model_errors) - - evaluation_result = EvaluationResult( - results=all_results, - model_errors=errors, - pii_precision=pii_precision, - pii_recall=pii_recall, - entity_recall_dict=entity_recall, - entity_precision_dict=entity_precision, - n_dict=n, - pii_f=pii_f_beta, - n=sum(n.values()), - ) - - return evaluation_result - - @staticmethod - def precision(tp: int, fp: int) -> float: - return tp / (tp + fp + 1e-100) + return List[SpanOutput], dict[dict] - @staticmethod - def recall(tp: int, fn: int) -> float: - return tp / (tp + fn + 1e-100) - - @staticmethod - def f_beta(precision: float, recall: float, beta: float) -> float: + def evaluate_all(self, model_predictions: List[ModelPrediction]) -> EvaluationResult: """ - Returns the F score for precision, recall and a beta parameter - :param precision: a float with the precision value - :param recall: a float with the recall value - :param beta: a float with the beta parameter of the F measure, - which gives more or less weight to precision - vs. recall - :return: a float value of the f(beta) measure. + Evaluate the PII performance at token and span levels. 
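+        Aggregates the per-sample token and span outputs into the confusion matrix and
+        metrics held by EvaluationResult.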
+ :param model_predictions: list of ModelPrediction + Returns: + EvaluationResult: the evaluation outcomes in EvaluationResult format """ - if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0): - return np.nan - return ((1 + beta ** 2) * precision * recall) / ( - ((beta ** 2) * precision) + recall + return EvaluationResult( + sample_errors = None, + token_confusion_matrix = None, + token_model_metrics = None, + span_model_metrics = None ) +# TODO: Old class, will be replace by new Evaluator class +# class Evaluator: +# def __init__( +# self, +# model: BaseModel, +# verbose: bool = False, +# compare_by_io=True, +# entities_to_keep: Optional[List[str]] = None, +# ): +# """ +# Evaluate a PII detection model or a Presidio analyzer / recognizer + +# :param model: Instance of a fitted model (of base type BaseModel) +# :param compare_by_io: True if comparison should be done on the entity +# level and not the sub-entity level +# :param entities_to_keep: List of entity names to focus the evaluator on (and ignore the rest). +# Default is None = all entities. If the provided model has a list of entities to keep, +# this list would be used for evaluation. +# """ +# self.model = model +# self.verbose = verbose +# self.compare_by_io = compare_by_io +# self.entities_to_keep = entities_to_keep +# if self.entities_to_keep is None and self.model.entities: +# self.entities_to_keep = self.model.entities + +# def compare(self, input_sample: InputSample, prediction: List[str]): + +# """ +# Compares ground truth tags (annotation) and predicted (prediction) +# :param input_sample: input sample containing list of tags with scheme +# :param prediction: predicted value for each token +# self.labeling_scheme + +# """ +# annotation = input_sample.tags +# tokens = input_sample.tokens + +# if len(annotation) != len(prediction): +# print( +# "Annotation and prediction do not have the" +# "same length. Sample={}".format(input_sample) +# ) +# return Counter(), [] + +# results = Counter() +# mistakes = [] + +# new_annotation = annotation.copy() + +# if self.compare_by_io: +# new_annotation = self._to_io(new_annotation) +# prediction = self._to_io(prediction) + +# # Ignore annotations that aren't in the list of +# # requested entities. 
+# if self.entities_to_keep: +# prediction = self._adjust_per_entities(prediction) +# new_annotation = self._adjust_per_entities(new_annotation) +# for i in range(0, len(new_annotation)): +# results[(new_annotation[i], prediction[i])] += 1 + +# if self.verbose: +# print("Annotation:", new_annotation[i]) +# print("Prediction:", prediction[i]) +# print(results) + +# # check if there was an error +# is_error = new_annotation[i] != prediction[i] +# if is_error: +# if prediction[i] == "O": +# mistakes.append( +# ModelError( +# error_type="FN", +# annotation=new_annotation[i], +# prediction=prediction[i], +# token=tokens[i], +# full_text=input_sample.full_text, +# metadata=input_sample.metadata, +# ) +# ) +# elif new_annotation[i] == "O": +# mistakes.append( +# ModelError( +# error_type="FP", +# annotation=new_annotation[i], +# prediction=prediction[i], +# token=tokens[i], +# full_text=input_sample.full_text, +# metadata=input_sample.metadata, +# ) +# ) +# else: +# mistakes.append( +# ModelError( +# error_type="Wrong entity", +# annotation=new_annotation[i], +# prediction=prediction[i], +# token=tokens[i], +# full_text=input_sample.full_text, +# metadata=input_sample.metadata, +# ) +# ) + +# return results, mistakes + +# def _adjust_per_entities(self, tags): +# if self.entities_to_keep: +# return [tag if tag in self.entities_to_keep else "O" for tag in tags] +# else: +# return tags + +# @staticmethod +# def _to_io(tags): +# """ +# Translates BILUO/BIO/IOB to IO - only In or Out of entity. +# ['B-PERSON','I-PERSON','L-PERSON'] is translated into +# ['PERSON','PERSON','PERSON'] +# :param tags: the input tags in BILUO/IOB/BIO format +# :return: a new list of IO tags +# """ +# return [tag[2:] if "-" in tag else tag for tag in tags] + +# def evaluate_sample( +# self, sample: InputSample, prediction: List[str] +# ) -> EvaluationResult: +# if self.verbose: +# print("Input sentence: {}".format(sample.full_text)) + +# results, mistakes = self.compare(input_sample=sample, prediction=prediction) +# return EvaluationResult(results, mistakes, sample.full_text) + +# def evaluate_all(self, dataset: List[InputSample]) -> List[EvaluationResult]: +# evaluation_results = [] +# if self.model.entity_mapping: +# print( +# f"Mapping entity values using this dictionary: {self.model.entity_mapping}" +# ) +# for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"): + +# # Align tag values to the ones expected by the model +# self.model.align_entity_types(sample) + +# # Predict +# prediction = self.model.predict(sample) + +# # Remove entities not requested +# prediction = self.model.filter_tags_in_supported_entities(prediction) + +# # Switch to requested labeling scheme (IO/BIO/BILUO) +# prediction = self.model.to_scheme(prediction) + +# evaluation_result = self.evaluate_sample( +# sample=sample, prediction=prediction +# ) +# evaluation_results.append(evaluation_result) + +# return evaluation_results + +# @staticmethod +# def align_entity_types( +# input_samples: List[InputSample], +# entities_mapping: Dict[str, str] = None, +# allow_missing_mappings: bool = False, +# ) -> List[InputSample]: +# """ +# Change input samples to conform with Presidio's entities +# :return: new list of InputSample +# """ + +# new_input_samples = input_samples.copy() + +# # A list that will contain updated input samples, +# new_list = [] + +# for input_sample in new_input_samples: +# contains_field_in_mapping = False +# new_spans = [] +# # Update spans to match the entity types in the values of entities_mapping +# for span in 
input_sample.spans: +# if span.entity_type in entities_mapping.keys(): +# new_name = entities_mapping.get(span.entity_type) +# span.entity_type = new_name +# contains_field_in_mapping = True + +# new_spans.append(span) +# else: +# if not allow_missing_mappings: +# raise ValueError( +# f"Key {span.entity_type} cannot be found in the provided entities_mapping" +# ) +# input_sample.spans = new_spans + +# # Update tags in case this sample has relevant entities for evaluation +# if contains_field_in_mapping: +# for i, tag in enumerate(input_sample.tags): +# has_prefix = "-" in tag +# if has_prefix: +# prefix = tag[:2] +# clean = tag[2:] +# else: +# prefix = "" +# clean = tag + +# if clean in entities_mapping.keys(): +# new_name = entities_mapping.get(clean) +# input_sample.tags[i] = "{}{}".format(prefix, new_name) +# else: +# input_sample.tags[i] = "O" + +# new_list.append(input_sample) + +# return new_list +# # Iterate on all samples + +# def calculate_score( +# self, +# evaluation_results: List[EvaluationResult], +# entities: Optional[List[str]] = None, +# beta: float = 2.5, +# ) -> EvaluationResult: +# """ +# Returns the pii_precision, pii_recall, f_measure either and number of records for each entity +# or for all entities (ignore_entity_type = True) +# :param evaluation_results: List of EvaluationResult +# :param entities: List of entities to calculate score to. Default is None: all entities +# :param beta: F measure beta value +# between different entity types, or to treat these as misclassifications +# :return: EvaluationResult with precision, recall and f measures +# """ + +# # aggregate results +# all_results = sum([er.results for er in evaluation_results], Counter()) + +# # compute pii_recall per entity +# entity_recall = {} +# entity_precision = {} +# n = {} +# if not entities: +# entities = list(set([x[0] for x in all_results.keys() if x[0] != "O"])) + +# for entity in entities: +# # all annotation of given type +# annotated = sum([all_results[x] for x in all_results if x[0] == entity]) +# predicted = sum([all_results[x] for x in all_results if x[1] == entity]) +# n[entity] = annotated +# tp = all_results[(entity, entity)] + +# if annotated > 0: +# entity_recall[entity] = tp / annotated +# else: +# entity_recall[entity] = np.NaN + +# if predicted > 0: +# per_entity_tp = all_results[(entity, entity)] +# entity_precision[entity] = per_entity_tp / predicted +# else: +# entity_precision[entity] = np.NaN + +# # compute pii_precision and pii_recall +# annotated_all = sum([all_results[x] for x in all_results if x[0] != "O"]) +# predicted_all = sum([all_results[x] for x in all_results if x[1] != "O"]) +# if annotated_all > 0: +# pii_recall = ( +# sum( +# [ +# all_results[x] +# for x in all_results +# if (x[0] != "O" and x[1] != "O") +# ] +# ) +# / annotated_all +# ) +# else: +# pii_recall = np.NaN +# if predicted_all > 0: +# pii_precision = ( +# sum( +# [ +# all_results[x] +# for x in all_results +# if (x[0] != "O" and x[1] != "O") +# ] +# ) +# / predicted_all +# ) +# else: +# pii_precision = np.NaN +# # compute pii_f_beta-score +# pii_f_beta = self.f_beta(pii_precision, pii_recall, beta) + +# # aggregate errors +# errors = [] +# for res in evaluation_results: +# if res.model_errors: +# errors.extend(res.model_errors) + +# evaluation_result = EvaluationResult( +# results=all_results, +# model_errors=errors, +# pii_precision=pii_precision, +# pii_recall=pii_recall, +# entity_recall_dict=entity_recall, +# entity_precision_dict=entity_precision, +# n_dict=n, +# pii_f=pii_f_beta, +# 
n=sum(n.values()), +# ) + +# return evaluation_result + +# @staticmethod +# def precision(tp: int, fp: int) -> float: +# return tp / (tp + fp + 1e-100) + +# @staticmethod +# def recall(tp: int, fn: int) -> float: +# return tp / (tp + fn + 1e-100) + +# @staticmethod +# def f_beta(precision: float, recall: float, beta: float) -> float: +# """ +# Returns the F score for precision, recall and a beta parameter +# :param precision: a float with the precision value +# :param recall: a float with the recall value +# :param beta: a float with the beta parameter of the F measure, +# which gives more or less weight to precision +# vs. recall +# :return: a float value of the f(beta) measure. +# """ +# if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0): +# return np.nan + +# return ((1 + beta ** 2) * precision * recall) / ( +# ((beta ** 2) * precision) + recall +# ) + class Plotter: """ Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives) diff --git a/presidio_evaluator/evaluation/evaluator_objects.py b/presidio_evaluator/evaluation/evaluator_objects.py new file mode 100644 index 0000000..0bf2bb3 --- /dev/null +++ b/presidio_evaluator/evaluation/evaluator_objects.py @@ -0,0 +1,115 @@ +from typing import Optional, List +from spacy.tokens import Token + +from presidio_evaluator import Span, InputSample + + +class TokenOutput: + def __init__( + self, + error_type: str, + annotated_tag: str, + predicted_tag: str, + token: Token, + ): + """ + Holds information about a token error a model made for analysis purposes + :param error_type: str, e.g. FP, FN, Person->Address etc. + :param annotated_tag: str, actual label, e.g. Person + :param predicted_tag: str, predicted label, e.g. Address + :param token: str, token in question + """ + + self.error_type = error_type + self.annotated_tag = annotated_tag + self.predicted_tag = predicted_tag + self.token = token + + def __str__(self): + return ( + "type: {}, " + "Annotated tag = {}, " + "Predicted tag = {}, " + "Token = {}".format( + self.error_type, + self.annotated_tag, + self.predicted_tag, + self.token + ) + ) + + def __repr__(self): + return f" List["TokenOutput"]: + """ + Print the n most common tokens by error type + :param errors: List of token error in TokenOutput format. + :param error_type: str, token error type, e.g. FP, FN + :param n: int, top n most common error to filter. Default is None = all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. Default is None = all entities + """ + return List["TokenOutput"] + + +class SpanOutput: + def __init__( + self, + output_type: str, + overlap_score: float, + annotated_span: Optional[Span] = None, + predicted_span: Optional[Span] = None + ): + """ + Holds information about span prediction output for analysis purposes + :param error_type: str, e.g. strict, exact, partial, incorrect, miss, spurious. + :param overlap_score: float, overlapping ratio between annotated_span and predicted_span + :param annotated_span: str, actual span which comes from the annotated file, e.g. 
Address + :param predicted_span: str, predicted span of a given model + """ + self.output_type = output_type + self.overlap_score = overlap_score + self.annotated_span = annotated_span + self.predicted_span = predicted_span + + def __repr__(self): + return ( + f"Output type: {self.output_type}\n" + f"Overlap score: {self.overlap_score}\n" + f"Annotated span: {self.annotated_span}\n" + f"Predicted span: {self.predicted_span}\n" + ) + + @staticmethod + def get_span_output_by_type(outputs=List["SpanOutput"], + error_type=str, + n: Optional[int]=None, + entity=None) -> List["SpanOutput"]: + """ + Print the n most common tokens by error type + :param outputs: List of span errors in SpanOutput format. + :param error_type: str, span error type, e.g. strict, exact, partial, incorrect, miss, spurious + :param n: int, top n most common output to filter. Default is None = all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. Default is None = all entities. + """ + return List["SpanOutput"] + + +class ModelPrediction: + def __init__( + self, + input_sample: InputSample, + predicted_tags: Optional[List[str]], + predicted_spans: Optional[List[Span]] + ): + """ + Holds information about model prediction in both span and token level + :params + """ + self.input_sample = input_sample + self.predicted_tags = predicted_tags + self.predicted_spans = predicted_spans \ No newline at end of file diff --git a/presidio_evaluator/evaluation/span_output.py b/presidio_evaluator/evaluation/span_output.py deleted file mode 100644 index 3e7d4f0..0000000 --- a/presidio_evaluator/evaluation/span_output.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import Optional, List - -from presidio_evaluator import Span - - -class SpanOutput: - def __init__( - self, - output_type: str, - overlap_score: float, - annotated_span: Optional[Span] = None, - predicted_span: Optional[Span] = None - ): - """ - Holds information about span prediction output for analysis purposes - :param error_type: str, e.g. strict, exact, partial, incorrect, miss, spurious. - :param overlap_score: float, overlapping ratio between annotated_span and predicted_span - :param annotated_span: str, actual span which comes from the annotated file, e.g. Address - :param predicted_span: str, predicted span of a given model - """ - self.output_type = output_type - self.overlap_score = overlap_score - self.annotated_span = annotated_span - self.predicted_span = predicted_span - - def __repr__(self): - return ( - f"Output type: {self.output_type}\n" - f"Overlap score: {self.overlap_score}\n" - f"Annotated span: {self.annotated_span}\n" - f"Predicted span: {self.predicted_span}\n" - ) - - @staticmethod - def get_span_output_by_type(outputs=List["SpanOutput"], - error_type=str, - n: Optional[int]=None, - entity=None) -> List["SpanOutput"]: - """ - Print the n most common tokens by error type - :param outputs: List of span errors in SpanOutput format. - :param error_type: str, span error type, e.g. strict, exact, partial, incorrect, miss, spurious - :param n: int, top n most common output to filter. If n is None, all token errors of error_type are returned. - :param entity: str, List of entities to filter, e.g. Person, Address. If entity is None, all entities are returned. 
- """ - return List["SpanOutput"] - \ No newline at end of file diff --git a/presidio_evaluator/evaluation/token_output.py b/presidio_evaluator/evaluation/token_output.py deleted file mode 100644 index e9a1152..0000000 --- a/presidio_evaluator/evaluation/token_output.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Optional, List -from spacy.tokens import Token - - -class TokenOutput: - def __init__( - self, - error_type: str, - annotated_tag: str, - predicted_tag: str, - token: Token, - ): - """ - Holds information about a token error a model made for analysis purposes - :param error_type: str, e.g. FP, FN, Person->Address etc. - :param annotated_tag: str, actual label, e.g. Person - :param predicted_tag: str, predicted label, e.g. Address - :param token: str, token in question - """ - - self.error_type = error_type - self.annotated_tag = annotated_tag - self.predicted_tag = predicted_tag - self.token = token - - def __str__(self): - return ( - "type: {}, " - "Annotated tag = {}, " - "Predicted tag = {}, " - "Token = {}".format( - self.error_type, - self.annotated_tag, - self.predicted_tag, - self.token - ) - ) - - def __repr__(self): - return f" List["TokenOutput"]: - """ - Print the n most common tokens by error type - :param errors: List of token error in TokenOutput format. - :param error_type: str, token error type, e.g. FP, FN - :param n: int, top n most common error to filter. If n is None, all token errors of error_type are returned. - :param entity: str, List of entities to filter, e.g. Person, Address. If entity is None, all entities are returned. - """ - return List["TokenOutput"] diff --git a/presidio_evaluator/evaluation_helpers.py b/presidio_evaluator/evaluation_helpers.py new file mode 100644 index 0000000..e69de29 From bcd142a97b012257ce92b696133fe5f24294c185 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:37:29 +0100 Subject: [PATCH 05/16] Initialize some utils function in helpers --- presidio_evaluator/evaluation_helpers.py | 134 +++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/presidio_evaluator/evaluation_helpers.py b/presidio_evaluator/evaluation_helpers.py index e69de29..286cc24 100644 --- a/presidio_evaluator/evaluation_helpers.py +++ b/presidio_evaluator/evaluation_helpers.py @@ -0,0 +1,134 @@ +import numpy as np +from typing import List, Dict +from collections import Counter + +from presidio_evaluator import Span +from presidio_evaluator.evaluation import SpanOutput + + +def get_matched_gold(predicted_span: Span, + annotated_span: List[Span], + overlap_threshold) -> SpanOutput: + """ + Given a predicted_span, get the best matchest annotated_span based on the overlap_threshold. 
+ Return a SpanOutput + :param sample: InputSample + :param pred_span: Span, Predicted span + :param gold_span: List[Span]: List of gold spans from the annotation input + """ + return SpanOutput(output_type="", + predicted_span=None, + annotated_span=None, + overlap_score=0 + ) + +def span_compute_actual_possible(results: dict) -> dict: + """ + Take the result dict and calculate the actual and possible spans + """ + strict = results["strict"] + exact = results["exact"] + incorrect = results["incorrect"] + partial = results["partial"] + missed = results["miss"] + spurious = results["spurious"] + # Possible: Number of annotations in the gold-standard which contribute to the final score + possible = strict + exact + incorrect + partial + missed + # Actual: Number of annotations produced by the PII detection system + actual = strict + exact + incorrect + partial + spurious + + results["actual"] = actual + results["possible"] = possible + + return results + +def span_compute_precision_recall(results: dict) -> dict: + """ + Take the result dict to calculate the strict and flexible precision/ recall + """ + metrics = {} + strict = results["strict"] + exact = results["exact"] + partial = results["partial"] + actual = results["actual"] + possible = results["possible"] + + # Calculate the strict performance + strict_precision = strict / actual if actual > 0 else 0 + strict_recall = strict / possible if possible > 0 else 0 + + # Calculate the flexible performance + flexible_precision = (strict + exact)/ actual if actual > 0 else 0 + flexible_recall = (strict + exact) / possible if possible > 0 else 0 + + # Calculate the partial performance + partial_precision = (strict + exact + 0.5 * partial) / actual if actual > 0 else 0 + partial_recall = (strict + exact + 0.5 * partial) / possible if possible > 0 else 0 + + + metrics["strict precision"] = strict_precision + metrics["strict recall"] = strict_recall + metrics["flexible precision"] = flexible_precision + metrics["flexible recall"] = flexible_recall + metrics["partial precision"] = partial_precision + metrics["partial recall"] = partial_recall + return metrics + +# TODO: Implement this function +def dict_merge(dict_1: dict, dict2: dict) -> dict: + """ + Examples: Sum up the value of two dictionaries by keys + >>> dict_1 = {'PII': { + 'correct': 2, + 'partial': 1 + }, + 'PERSON': { + 'correct': 2, + 'partial': 0, + } + } + >>> dict_2 = {'PII': { + 'correct': 3, + 'partial': 0 + }, + 'PERSON': { + 'correct': 1, + 'partial': 1, + } + } + >>> dict_merge(dict1, dict2) + {'PII': { + 'correct': 5, + 'partial': 1 + }, + 'PERSON': { + 'correct': 3, + 'partial': 1, + } + } + """ + results = {} + return results + +# TODO: Implement this function +def token_calulate_score(token_confusion_matrix: Counter) -> Dict: + """ + Calculate the token model metrics from token confusion matrix + Examples: Sum up the value of two dictionaries by keys + >>> token_confusion_matrix = Counter({('O', 'O'): X, ('O', 'DateTime'): X, ('DateTime', 'O'): X, ('DateTime', 'DateTime'): X}) + >>> token_calulate_score(token_confusion_matrix) + {'PII': { + 'recall': xxx, + 'precision': xxx, + 'F measure': xxx + }, + 'PERSON': { + 'recall': xxx, + 'precision': xxx, + } + } + """ + token_model_metrics = {} + return token_model_metrics + + \ No newline at end of file From d74aba1adc0159ce07f3ab274399964d4fac6831 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:51:20 +0100 Subject: [PATCH 06/16] Initialize EvaluationResult 
class --- .../evaluation/evaluation_result.py | 219 ++++++++++-------- 1 file changed, 122 insertions(+), 97 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluation_result.py b/presidio_evaluator/evaluation/evaluation_result.py index e3f88c5..8756efb 100644 --- a/presidio_evaluator/evaluation/evaluation_result.py +++ b/presidio_evaluator/evaluation/evaluation_result.py @@ -2,110 +2,135 @@ from collections import Counter from typing import List, Optional, Dict, Tuple -from presidio_evaluator.evaluation import ModelError +from presidio_evaluator.evaluation import SampleError class EvaluationResult: def __init__( self, - results: Counter, - model_errors: Optional[List[ModelError]] = None, - text: Optional[str] = None, - pii_recall: Optional[float] = None, - pii_precision: Optional[float] = None, - pii_f: Optional[float] = None, - n: Optional[int] = None, - entity_recall_dict: Optional[Dict[str, float]] = None, - entity_precision_dict: Optional[Dict[str, float]] = None, - n_dict: Optional[Dict[str, int]] = None, + sample_errors: List[SampleError], + token_confusion_matrix: Counter, + token_model_metrics: Dict[str, Dict[str, float]], + span_model_metrics: Dict[str, Dict[str, float]] + ): """ - Holds the output of a comparison between ground truth and predicted - :param results: List of objects of type Counter + Holds the output of token and span evaluation for a given dataset + :param model_errors: List of token and span errors for further inspection + :param token_confusion_matrix: List of objects of type Counter with structure {(actual, predicted) : count} - :param model_errors: List of specific model errors for further inspection - :param text: sample's full text (if used for one sample) - :param pii_recall: Recall for all entities (PII or not) - :param pii_precision: Precision for all entities (PII or not) - :param pii_f: F measure for all entities (PII or not) - :param n: Number of total entity tokens - :param entity_recall_dict: Recall per entity - :param entity_precision_dict: Precision per entity - :param n_dict: Number of tokens per entity + :param token_model_metrics: metrics calculated based on token results + :param span_model_metrics: metrics calculated based on span results """ - self.results = results - self.model_errors = model_errors - self.text = text - - self.pii_recall = pii_recall - self.pii_precision = pii_precision - self.pii_f = pii_f - self.n = n - self.entity_recall_dict = entity_recall_dict - self.entity_precision_dict = entity_precision_dict - self.n_dict = n_dict - - def __str__(self): - return_str = "" - if not self.entity_precision_dict or not self.entity_recall_dict: - return json.dumps(self.results) - - entities = self.n_dict.keys() - - row_format = "{:>20}{:>20.2%}{:>20.2%}{:>20}" - header_format = "{:>20}" * 4 - return_str += str( - header_format.format( - *("Entity", "Precision", "Recall", "Number of samples") - ) - ) - for entity in entities: - return_str += "\n" + row_format.format( - entity, - self.entity_precision_dict[entity], - self.entity_recall_dict[entity], - self.n_dict[entity], - ) - - # add PII values - return_str += "\n" + row_format.format( - "PII", - self.pii_precision, - self.pii_recall, - self.n, - ) - - return_str += f"\nPII F measure: {self.pii_f:.2%}" - return return_str - - def __repr__(self): - return f"stats={self.results}" - - def to_log(self): - metrics_dict = { - "pii_f": self.pii_f, - } - if self.entity_precision_dict: - metrics_dict.update( - { - f"{ent}_precision": v - for (ent, v) in self.entity_precision_dict.items() 
- } - ) - if self.entity_recall_dict: - metrics_dict.update( - {f"{ent}_recall": v for (ent, v) in self.entity_recall_dict.items()} - ) - if self.n: - metrics_dict.update(self.n_dict) - return metrics_dict - - def to_confusion_matrix(self) -> Tuple[List[str], List[List[int]]]: - entities = sorted(list(set(self.n_dict.keys()).union("O"))) - confusion_matrix = [[0] * len(entities) for _ in range(len(entities))] - for i, actual in enumerate(entities): - for j, predicted in enumerate(entities): - confusion_matrix[i][j] = self.results[(actual, predicted)] - - return entities, confusion_matrix + self.sample_errors = sample_errors + self.token_confusion_matrix = token_confusion_matrix + self.token_model_metrics = token_model_metrics + self.span_model_metrics = span_model_metrics + + +# TODO: Review and refactor the method in old EvaluationResult to new one +# class EvaluationResult: +# def __init__( +# self, +# results: Counter, +# model_errors: Optional[List[ModelError]] = None, +# text: Optional[str] = None, +# pii_recall: Optional[float] = None, +# pii_precision: Optional[float] = None, +# pii_f: Optional[float] = None, +# n: Optional[int] = None, +# entity_recall_dict: Optional[Dict[str, float]] = None, +# entity_precision_dict: Optional[Dict[str, float]] = None, +# n_dict: Optional[Dict[str, int]] = None, +# ): +# """ +# Holds the output of a comparison between ground truth and predicted +# :param results: List of objects of type Counter +# with structure {(actual, predicted) : count} +# :param model_errors: List of specific model errors for further inspection +# :param text: sample's full text (if used for one sample) +# :param pii_recall: Recall for all entities (PII or not) +# :param pii_precision: Precision for all entities (PII or not) +# :param pii_f: F measure for all entities (PII or not) +# :param n: Number of total entity tokens +# :param entity_recall_dict: Recall per entity +# :param entity_precision_dict: Precision per entity +# :param n_dict: Number of tokens per entity +# """ + +# self.results = results +# self.model_errors = model_errors +# self.text = text + +# self.pii_recall = pii_recall +# self.pii_precision = pii_precision +# self.pii_f = pii_f +# self.n = n +# self.entity_recall_dict = entity_recall_dict +# self.entity_precision_dict = entity_precision_dict +# self.n_dict = n_dict + +# def __str__(self): +# return_str = "" +# if not self.entity_precision_dict or not self.entity_recall_dict: +# return json.dumps(self.results) + +# entities = self.n_dict.keys() + +# row_format = "{:>20}{:>20.2%}{:>20.2%}{:>20}" +# header_format = "{:>20}" * 4 +# return_str += str( +# header_format.format( +# *("Entity", "Precision", "Recall", "Number of samples") +# ) +# ) +# for entity in entities: +# return_str += "\n" + row_format.format( +# entity, +# self.entity_precision_dict[entity], +# self.entity_recall_dict[entity], +# self.n_dict[entity], +# ) + +# # add PII values +# return_str += "\n" + row_format.format( +# "PII", +# self.pii_precision, +# self.pii_recall, +# self.n, +# ) + +# return_str += f"\nPII F measure: {self.pii_f:.2%}" +# return return_str + +# def __repr__(self): +# return f"stats={self.results}" + +# def to_log(self): +# metrics_dict = { +# "pii_f": self.pii_f, +# } +# if self.entity_precision_dict: +# metrics_dict.update( +# { +# f"{ent}_precision": v +# for (ent, v) in self.entity_precision_dict.items() +# } +# ) +# if self.entity_recall_dict: +# metrics_dict.update( +# {f"{ent}_recall": v for (ent, v) in self.entity_recall_dict.items()} +# ) +# if self.n: +# 
metrics_dict.update(self.n_dict) +# return metrics_dict + +# def to_confusion_matrix(self) -> Tuple[List[str], List[List[int]]]: +# entities = sorted(list(set(self.n_dict.keys()).union("O"))) +# confusion_matrix = [[0] * len(entities) for _ in range(len(entities))] +# for i, actual in enumerate(entities): +# for j, predicted in enumerate(entities): +# confusion_matrix[i][j] = self.results[(actual, predicted)] + +# return entities, confusion_matrix From c64d2c226f1b390347c24d1e2ef4a33db2d1edd2 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Fri, 27 Jan 2023 16:25:50 +0100 Subject: [PATCH 07/16] Implement compare_span function --- presidio_evaluator/evaluation/evaluator.py | 194 ++++++++++++++++++++- 1 file changed, 191 insertions(+), 3 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 2ed020b..92c4acd 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -1,6 +1,8 @@ from collections import Counter from typing import List, Optional, Dict, Tuple from pathlib import Path +from copy import deepcopy +from difflib import SequenceMatcher import numpy as np from tqdm import tqdm @@ -14,6 +16,7 @@ ModelPrediction, EvaluationResult, SampleError) +import evaluation_helpers class Evaluator: @@ -21,7 +24,7 @@ def __init__( self, verbose: bool = False, compare_by_io=True, - entities_to_keep: Optional[List[str]] = None, + entities_to_keep=True, span_overlap_threshold: float = 0.5 ): """ @@ -37,6 +40,25 @@ def __init__( self.entities_to_keep = entities_to_keep self.span_overlap_threshold = span_overlap_threshold + # setup a dict for storing the span metrics + self.span_model_metrics = { + 'correct': 0, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'possible': 0, + 'actual': 0, + 'precision': 0, + 'recall': 0, + } + # Copy results dict to cover the four evaluation schemes. 
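+        # The four schemes follow the SemEval-2013 style span evaluation:
+        #   strict   - span boundaries and entity type must both match
+        #   exact    - span boundaries must match, entity type is ignored
+        #   ent_type - some boundary overlap and the same entity type
+        #   partial  - partial boundary overlap, regardless of entity type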
+ self.span_results = { + 'strict': deepcopy(self.span_model_metrics), + 'ent_type': deepcopy(self.span_model_metrics), + 'partial':deepcopy(self.span_model_metrics), + 'exact':deepcopy(self.span_model_metrics), + } def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOutput], Counter]: """ @@ -55,8 +77,174 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp List[SpanOutput]: a list of SpanOutput dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} """ - - return List[SpanOutput], dict[dict] + # get annotated and predicted span from ModelPrediction + annotated_spans = model_prediction.input_sample.spans + predicted_spans = model_prediction.predicted_spans + + eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0} + evaluation = { + 'strict': deepcopy(eval_metrics), + 'ent_type': deepcopy(eval_metrics), + 'partial': deepcopy(eval_metrics), + 'exact': deepcopy(eval_metrics) + } + # results by entity type + evaluation_agg_entities_type = {e: deepcopy(evaluation) for e in self.entities_to_keep} + + # keep track of entities that overlapped + true_which_overlapped_with_pred = [] + # keep track for the explainibility + span_outputs = [] + + # go through each predicted + for pred in predicted_spans: + found_overlap = False + # Scenario I: Exact match between true and pred + if pred in annotated_spans: + true_which_overlapped_with_pred.append(pred) + span_outputs.append(SpanOutput( + output_type = "STRICT", + gold_span = true, + annotated_span = pred, + overlap_score = 1 + )) + evaluation['strict']['correct'] += 1 + evaluation['ent_type']['correct'] += 1 + evaluation['exact']['correct'] += 1 + evaluation['partial']['correct'] += 1 + + # for the agg. 
by entity_type results + evaluation_agg_entities_type[pred.entity_type]['strict']['correct'] += 1 + evaluation_agg_entities_type[pred.entity_type]['ent_type']['correct'] += 1 + evaluation_agg_entities_type[pred.entity_type]['exact']['correct'] += 1 + evaluation_agg_entities_type[pred.entity_type]['partial']['correct'] += 1 + else: + # check for overlaps with eny of true entities + for true in annotated_spans: + pred_range = range(pred.start_position, pred.end_position) + true_range = range(true.start_position, true.end_position) + # Scenario IV: Offsets match, but entity type is wrong + if true.start_position == pred.start_position and true.end_position == pred.end_position \ + and true.entity_type != pred.entity_type: + span_outputs.append(SpanOutput( + output_type = "EXACT", + gold_span = true, + annotated_span = pred, + overlap_score = 1 + )) + # overall results + evaluation['strict']['incorrect'] += 1 + evaluation['ent_type']['incorrect'] += 1 + evaluation['partial']['correct'] += 1 + evaluation['exact']['correct'] += 1 + + # aggregated by entity type results + evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['correct'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['correct'] += 1 + + true_which_overlapped_with_pred.append(true) + found_overlap = True + break + # Check overlapping between true and pred + elif evaluation_helpers.find_overlap(true_range, pred_range): + overlap_ratio = SequenceMatcher(None, + pred.entity_value, + true.entity_value).ratio() + true_which_overlapped_with_pred.append(true) + # Scenario V: There is an overlap (but offsets do not match exactly), + # and the entity type is the same + if pred.entity_type == true.entity_type: + span_outputs.append(SpanOutput( + output_type = "ENT_TYPE", + gold_span = true, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # overall results + evaluation['strict']['incorrect'] += 1 + evaluation['ent_type']['correct'] += 1 + evaluation['partial']['partial'] += 1 + evaluation['exact']['incorrect'] += 1 + # aggregated by entity type results + evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['correct'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + found_overlap = True + break + # Offset overlap but entity type is different + else: + span_outputs.append(SpanOutput( + output_type = "PARTIAL", + gold_span = true, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # overall results + evaluation['strict']['incorrect'] += 1 + evaluation['ent_type']['incorrect'] += 1 + evaluation['partial']['partial'] += 1 + evaluation['exact']['incorrect'] += 1 + + # aggregated by entity type results + # Results against the true entity + + evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + found_overlap = True + break + if not found_overlap: + span_outputs.append(SpanOutput( + output_type = "SPURIOUS", + gold_span = None, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # Overal result + evaluation['strict']['spurious'] += 1 + 
evaluation['ent_type']['spurious'] += 1 + evaluation['partial']['spurious'] += 1 + evaluation['exact']['spurious'] += 1 + ## NOTE: when pred is not found in tags + # or when it simply does not appear in the test set, then it is + # spurious, but it is not clear where to assign it at the tag + # level. In this case, it is applied to all target_tags + # found in this example. This will mean that the sum of the + # evaluation_agg_entities will not equal evaluation. + for true in self.entities_to_keep: + evaluation_agg_entities_type[true]['strict']['spurious'] += 1 + evaluation_agg_entities_type[true]['ent_type']['spurious'] += 1 + evaluation_agg_entities_type[true]['partial']['spurious'] += 1 + evaluation_agg_entities_type[true]['exact']['spurious'] += 1 + + # Scenario III: Entity was misses entirely. + for true in annotated_spans: + if true in true_which_overlapped_with_pred: + continue + else: + span_outputs.append(SpanOutput( + output_type = "MISSED", + gold_span = true, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # overall results + evaluation['strict']['missed'] += 1 + evaluation['ent_type']['missed'] += 1 + evaluation['partial']['missed'] += 1 + evaluation['exact']['missed'] += 1 + + # for the agg. by e_type + evaluation_agg_entities_type[true.e_type]['strict']['missed'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['missed'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['missed'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['missed'] += 1 + + return span_outputs, evaluation, evaluation_agg_entities_type def evaluate_all(self, model_predictions: List[ModelPrediction]) -> EvaluationResult: """ From b0b7dcb6c2b52a9b18a27ed6380881163817d7ec Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Sun, 29 Jan 2023 07:28:09 +0100 Subject: [PATCH 08/16] Add __eq__ method for SpanOutput class --- presidio_evaluator/evaluation/evaluator_objects.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/presidio_evaluator/evaluation/evaluator_objects.py b/presidio_evaluator/evaluation/evaluator_objects.py index 0bf2bb3..34d2243 100644 --- a/presidio_evaluator/evaluation/evaluator_objects.py +++ b/presidio_evaluator/evaluation/evaluator_objects.py @@ -83,6 +83,14 @@ def __repr__(self): f"Annotated span: {self.annotated_span}\n" f"Predicted span: {self.predicted_span}\n" ) + + def __eq__(self, other): + return ( + self.output_type == other.output_type + and self.overlap_score == other.overlap_score + and self.annotated_span == other.annotated_span + and self.predicted_span == other.predicted_span + ) @staticmethod def get_span_output_by_type(outputs=List["SpanOutput"], From 9bf20e5d5648d2cd849df22baef4aa91434e234d Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Sun, 29 Jan 2023 07:29:28 +0100 Subject: [PATCH 09/16] Implement __eq__ for TokenOutput class --- presidio_evaluator/evaluation/evaluator_objects.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/presidio_evaluator/evaluation/evaluator_objects.py b/presidio_evaluator/evaluation/evaluator_objects.py index 34d2243..c79f111 100644 --- a/presidio_evaluator/evaluation/evaluator_objects.py +++ b/presidio_evaluator/evaluation/evaluator_objects.py @@ -40,6 +40,14 @@ def __str__(self): def __repr__(self): return f" Date: Mon, 30 Jan 2023 11:06:29 +0100 Subject: [PATCH 10/16] Add unittest + fix bugs --- presidio_evaluator/evaluation/evaluator.py | 100 ++- tests/test_evaluator.py | 981 
++++++++++++++------- 2 files changed, 715 insertions(+), 366 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 92c4acd..bb105fd 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -10,13 +10,13 @@ import plotly.express as px import pandas as pd -from presidio_evaluator import InputSample +from presidio_evaluator import InputSample, Span from presidio_evaluator.evaluation import (TokenOutput, SpanOutput, ModelPrediction, EvaluationResult, SampleError) -import evaluation_helpers +from presidio_evaluator import evaluation_helpers class Evaluator: @@ -69,17 +69,19 @@ def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOu return List[TokenOutput], Counter - def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: + # def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: + def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) -> Tuple[List[SpanOutput], dict[dict]]: """ Compares ground truth tags (annotation) and predicted (prediction) at span level. - :param model_prediction: model_prediction containing an InputSample and a list of predicted tags and tokens + :param annotated_spans: model_prediction containing an InputSample and a list of predicted tags and tokens + :param predicted_spans: Returns: List[SpanOutput]: a list of SpanOutput dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} """ # get annotated and predicted span from ModelPrediction - annotated_spans = model_prediction.input_sample.spans - predicted_spans = model_prediction.predicted_spans + # annotated_spans = model_prediction.input_sample.spans + # predicted_spans = model_prediction.predicted_spans eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0} evaluation = { @@ -104,7 +106,7 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp true_which_overlapped_with_pred.append(pred) span_outputs.append(SpanOutput( output_type = "STRICT", - gold_span = true, + predicted_span = pred, annotated_span = pred, overlap_score = 1 )) @@ -128,8 +130,8 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp and true.entity_type != pred.entity_type: span_outputs.append(SpanOutput( output_type = "EXACT", - gold_span = true, - annotated_span = pred, + predicted_span = pred, + annotated_span = true, overlap_score = 1 )) # overall results @@ -139,27 +141,29 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp evaluation['exact']['correct'] += 1 # aggregated by entity type results - evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['correct'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['correct'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['correct'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['correct'] += 1 true_which_overlapped_with_pred.append(true) found_overlap = True break # Check overlapping between true and pred elif 
evaluation_helpers.find_overlap(true_range, pred_range): - overlap_ratio = SequenceMatcher(None, - pred.entity_value, - true.entity_value).ratio() + # overlap_ratio = SequenceMatcher(None, + # pred.entity_value, + # true.entity_value).ratio() + overlap_ratio = pred.intersect(true) + print(overlap_ratio) true_which_overlapped_with_pred.append(true) # Scenario V: There is an overlap (but offsets do not match exactly), # and the entity type is the same if pred.entity_type == true.entity_type: span_outputs.append(SpanOutput( output_type = "ENT_TYPE", - gold_span = true, - annotated_span = pred, + predicted_span = pred, + annotated_span = true, overlap_score = overlap_ratio )) # overall results @@ -168,18 +172,18 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp evaluation['partial']['partial'] += 1 evaluation['exact']['incorrect'] += 1 # aggregated by entity type results - evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['correct'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['correct'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['incorrect'] += 1 found_overlap = True break # Offset overlap but entity type is different else: span_outputs.append(SpanOutput( output_type = "PARTIAL", - gold_span = true, - annotated_span = pred, + predicted_span = pred, + annotated_span = true, overlap_score = overlap_ratio )) # overall results @@ -191,18 +195,18 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp # aggregated by entity type results # Results against the true entity - evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['incorrect'] += 1 found_overlap = True break if not found_overlap: span_outputs.append(SpanOutput( output_type = "SPURIOUS", - gold_span = None, - annotated_span = pred, - overlap_score = overlap_ratio + predicted_span = pred, + annotated_span = None, + overlap_score = 0 )) # Overal result evaluation['strict']['spurious'] += 1 @@ -228,9 +232,9 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp else: span_outputs.append(SpanOutput( output_type = "MISSED", - gold_span = true, - annotated_span = pred, - overlap_score = overlap_ratio + predicted_span = None, + annotated_span = true, + overlap_score = 0 )) # overall results evaluation['strict']['missed'] += 1 @@ -239,10 +243,30 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp evaluation['exact']['missed'] += 1 # for the agg. 
by e_type - evaluation_agg_entities_type[true.e_type]['strict']['missed'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['missed'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['missed'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['missed'] += 1 + + # Compute 'possible', 'actual' according to SemEval-2013 Task 9.1 on the + # overall results, and use these to calculate precision and recall. + + for eval_type in evaluation: + evaluation[eval_type] = evaluation_helpers.span_compute_actual_possible(evaluation[eval_type]) + + # Compute 'possible', 'actual', and precision and recall on entity level + # results. Start by cycling through the accumulated results. + + for entity_type, entity_level in evaluation_agg_entities_type.items(): + + # Cycle through the evaluation types for each dict containing entity + # level results. + + for eval_type in entity_level: + + evaluation_agg_entities_type[entity_type][eval_type] = evaluation_helpers.span_compute_actual_possible( + entity_level[eval_type] + ) return span_outputs, evaluation, evaluation_agg_entities_type diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 8319e05..0305009 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -5,7 +5,9 @@ from presidio_evaluator import InputSample, Span -from presidio_evaluator.evaluation import EvaluationResult, Evaluator +from presidio_evaluator.evaluation import (Evaluator, + ModelPrediction, + SpanOutput) from tests.mocks import ( IdentityTokensMockModel, FiftyFiftyIdentityTokensMockModel, @@ -13,335 +15,658 @@ ) -def test_evaluator_simple(): - prediction = ["O", "O", "O", "U-ANIMAL"] - model = MockTokensModel(prediction=prediction, entities_to_keep=["ANIMAL"]) - - evaluator = Evaluator(model=model) - sample = InputSample( - full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - final_evaluation = evaluator.calculate_score([evaluated]) - - assert final_evaluation.pii_precision == 1 - assert final_evaluation.pii_recall == 1 - - -def test_evaluate_sample_wrong_entities_to_keep_correct_statistics(): - prediction = ["O", "O", "O", "U-ANIMAL"] - model = MockTokensModel(prediction=prediction) - - evaluator = Evaluator(model=model, entities_to_keep=["SPACESHIP"]) - - sample = InputSample( - full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - assert evaluated.results[("O", "O")] == 4 - - -def test_evaluate_same_entity_correct_statistics(): - prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluation_result = evaluator.evaluate_sample(sample, prediction) - assert evaluation_result.results[("O", "O")] == 
2 - assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 - assert evaluation_result.results[("O", "ANIMAL")] == 1 - - -def test_evaluate_multiple_entities_to_keep_correct_statistics(): - prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] - entities_to_keep = ["ANIMAL", "PLANT", "SPACESHIP"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=entities_to_keep) - - sample = InputSample( - full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluation_result = evaluator.evaluate_sample(sample, prediction) - assert evaluation_result.results[("O", "O")] == 2 - assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 - assert evaluation_result.results[("O", "ANIMAL")] == 1 - - -def test_evaluate_multiple_tokens_correct_statistics(): - prediction = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - "I am the walrus amaericanus magnifico", masked=None, spans=None - ) - sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] - sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - evaluation = evaluator.calculate_score([evaluated]) - - assert evaluation.pii_precision == 1 - assert evaluation.pii_recall == 1 - - -def test_evaluate_multiple_tokens_partial_match_correct_statistics(): - prediction = ["O", "O", "O", "B-ANIMAL", "L-ANIMAL", "O"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - "I am the walrus amaericanus magnifico", masked=None, spans=None - ) - sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] - sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - evaluation = evaluator.calculate_score([evaluated]) - - assert evaluation.pii_precision == 1 - assert evaluation.pii_recall == 4 / 6 - - -def test_evaluate_multiple_tokens_no_match_match_correct_statistics(): - prediction = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - "I am the walrus amaericanus magnifico", masked=None, spans=None - ) - sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] - sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - evaluation = evaluator.calculate_score([evaluated]) - - assert np.isnan(evaluation.pii_precision) - assert evaluation.pii_recall == 0 - - -def test_evaluate_multiple_examples_correct_statistics(): - prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) - input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) - input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] - input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] - - evaluated = evaluator.evaluate_all( - [input_sample, input_sample, input_sample, input_sample] - ) - scores = evaluator.calculate_score(evaluated) - assert scores.pii_precision == 0.5 
- assert scores.pii_recall == 0.5 - - -def test_evaluate_multiple_examples_ignore_entity_correct_statistics(): - prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"] - model = MockTokensModel(prediction=prediction) - - evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "TENNIS_PLAYER"]) - input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) - input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] - input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] - - evaluated = evaluator.evaluate_all( - [input_sample, input_sample, input_sample, input_sample] - ) - scores = evaluator.calculate_score(evaluated) - assert scores.pii_precision == 1 - assert scores.pii_recall == 1 - - -def test_confusion_matrix_correct_metrics(): - from collections import Counter - - evaluated = [ - EvaluationResult( - results=Counter( - { - ("O", "O"): 150, - ("O", "PERSON"): 30, - ("O", "COMPANY"): 30, - ("PERSON", "PERSON"): 40, - ("COMPANY", "COMPANY"): 40, - ("PERSON", "COMPANY"): 10, - ("COMPANY", "PERSON"): 10, - ("PERSON", "O"): 30, - ("COMPANY", "O"): 30, - } - ), - model_errors=None, - text=None, - ) - ] - - model = MockTokensModel(prediction=None) - evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "COMPANY"]) - scores = evaluator.calculate_score(evaluated, beta=2.5) - - assert scores.pii_precision == 0.625 - assert scores.pii_recall == 0.625 - assert scores.entity_recall_dict["PERSON"] == 0.5 - assert scores.entity_precision_dict["PERSON"] == 0.5 - assert scores.entity_recall_dict["COMPANY"] == 0.5 - assert scores.entity_precision_dict["COMPANY"] == 0.5 - - -def test_confusion_matrix_2_correct_metrics(): - from collections import Counter - - evaluated = [ - EvaluationResult( - results=Counter( - { - ("O", "O"): 65467, - ("O", "ORG"): 4189, - ("GPE", "O"): 3370, - ("PERSON", "PERSON"): 2024, - ("GPE", "PERSON"): 1488, - ("GPE", "GPE"): 1033, - ("O", "GPE"): 964, - ("ORG", "ORG"): 914, - ("O", "PERSON"): 834, - ("GPE", "ORG"): 401, - ("PERSON", "ORG"): 35, - ("PERSON", "O"): 33, - ("ORG", "O"): 8, - ("PERSON", "GPE"): 5, - ("ORG", "PERSON"): 1, +def test_compare_span_simple_case_1(): + annotated_spans =[Span(entity_type = "PER", entity_value = "", start_position = 59, end_position=69), + Span(entity_type = "LOC", entity_value = "", start_position = 127, end_position=134), + Span(entity_type = "LOC", entity_value = "", start_position = 164, end_position=174), + Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + Span(entity_type = "MISC", entity_value = "", start_position = 230, end_position=240)] + predicted_spans = [Span(entity_type = "PER", entity_value = "", start_position = 24, end_position=30), + Span(entity_type = "LOC", entity_value = "", start_position = 124, end_position=134), + Span(entity_type = "PER", entity_value = "", start_position = 164, end_position=174), + Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + Span(entity_type = "LOC", entity_value = "", start_position = 225, end_position=243)] + + evaluator = Evaluator(entities_to_keep=['PER', 'LOC', 'MISC']) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = {'strict': {'correct': 2, + 'incorrect': 3, + 'partial': 0, + 
'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6}, + 'ent_type': {'correct': 3, + 'incorrect': 2, + 'partial': 0, + 'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6}, + 'partial': {'correct': 3, + 'incorrect': 0, + 'partial': 2, + 'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6}, + 'exact': {'correct': 3, + 'incorrect': 2, + 'partial': 0, + 'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6} } - ), - model_errors=None, - text=None, - ) - ] - - model = MockTokensModel(prediction=None) - evaluator = Evaluator(model=model) - scores = evaluator.calculate_score(evaluated, beta=2.5) - - pii_tp = ( - evaluated[0].results[("PERSON", "PERSON")] - + evaluated[0].results[("ORG", "ORG")] - + evaluated[0].results[("GPE", "GPE")] - + evaluated[0].results[("ORG", "GPE")] - + evaluated[0].results[("ORG", "PERSON")] - + evaluated[0].results[("GPE", "ORG")] - + evaluated[0].results[("GPE", "PERSON")] - + evaluated[0].results[("PERSON", "GPE")] - + evaluated[0].results[("PERSON", "ORG")] - ) - - pii_fp = ( - evaluated[0].results[("O", "PERSON")] - + evaluated[0].results[("O", "GPE")] - + evaluated[0].results[("O", "ORG")] - ) - - pii_fn = ( - evaluated[0].results[("PERSON", "O")] - + evaluated[0].results[("GPE", "O")] - + evaluated[0].results[("ORG", "O")] - ) - - assert scores.pii_precision == pii_tp / (pii_tp + pii_fp) - assert scores.pii_recall == pii_tp / (pii_tp + pii_fn) - - -def test_dataset_to_metric_identity_model(): - import os - - dir_path = os.path.dirname(os.path.realpath(__file__)) - input_samples = InputSample.read_dataset_json( - "{}/data/generated_small.json".format(dir_path), length=10 - ) - - model = IdentityTokensMockModel() - evaluator = Evaluator(model=model) - evaluation_results = evaluator.evaluate_all(input_samples) - metrics = evaluator.calculate_score(evaluation_results) - - assert metrics.pii_precision == 1 - assert metrics.pii_recall == 1 - - -def test_dataset_to_metric_50_50_model(): - import os - - dir_path = os.path.dirname(os.path.realpath(__file__)) - input_samples = InputSample.read_dataset_json( - "{}/data/generated_small.json".format(dir_path), length=100 - ) - - # Replace 50% of the predictions with a list of "O" - model = FiftyFiftyIdentityTokensMockModel() - evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) - evaluation_results = evaluator.evaluate_all(input_samples) - metrics = evaluator.calculate_score(evaluation_results) - - print(metrics.pii_precision) - print(metrics.pii_recall) - print(metrics.pii_f) - - assert metrics.pii_precision == 1 - assert metrics.pii_recall < 0.75 - assert metrics.pii_recall > 0.25 - - -def test_align_entity_types_correct_output(): - - sample1 = InputSample( - "I live in ABC", - spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], - create_tags_from_span=False, - ) - sample2 = InputSample( - "I live in ABC", - spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("C", "c", 100, 101)], - create_tags_from_span=False, - ) - samples = [sample1, sample2] - mapping = { - "A": "1", - "B": "2", - "C": "1", + print(span_outputs) + print(expected_evaluation) + assert evaluation == expected_evaluation + +def test_compare_span_strict(): + annotated_spans =[Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, 
evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'exact': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } + } + expected_span_outputs = [SpanOutput( + output_type = "STRICT", + predicted_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 1)] + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + + +def test_compare_span_ent_type(): + annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans =[Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 0, + 'incorrect': 0, + 'partial': 1, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': + 1, + 'possible': 1 + }, + 'exact': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } } - new_samples = Evaluator.align_entity_types(samples, mapping) - - count_per_entity = Counter() - for sample in new_samples: - for span in sample.spans: - count_per_entity[span.entity_type] += 1 - - assert count_per_entity["1"] == 5 - assert count_per_entity["2"] == 1 - - -def test_align_entity_types_wrong_mapping_exception(): - - sample1 = InputSample( - "I live in ABC", - spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], - create_tags_from_span=False, - ) + expected_span_outputs = [SpanOutput( + output_type = "ENT_TYPE", + predicted_span = Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 0.72)] + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) 
for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + +def test_compare_span_exact(): + annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'exact': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } + } - entities_mapping = {"Z": "z"} + expected_span_outputs = [SpanOutput( + output_type = "EXACT", + predicted_span = Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 1)] + print(span_outputs) + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + +def test_compare_span_partial(): + annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 0, + 'incorrect': 0, + 'partial': 1, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': + 1, + 'possible': 1 + }, + 'exact': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } + } - with pytest.raises(ValueError): - Evaluator.align_entity_types( - input_samples=[sample1], entities_mapping=entities_mapping - ) + expected_span_outputs = [SpanOutput( + 
output_type = "PARTIAL", + predicted_span = Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 0.72)] + print(span_outputs) + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + +# TODO: refactor those functions +# def test_evaluator_simple(): +# prediction = ["O", "O", "O", "U-ANIMAL"] +# model = MockTokensModel(prediction=prediction, entities_to_keep=["ANIMAL"]) + +# evaluator = Evaluator(model=model) +# sample = InputSample( +# full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# final_evaluation = evaluator.calculate_score([evaluated]) + +# assert final_evaluation.pii_precision == 1 +# assert final_evaluation.pii_recall == 1 + + +# def test_evaluate_sample_wrong_entities_to_keep_correct_statistics(): +# prediction = ["O", "O", "O", "U-ANIMAL"] +# model = MockTokensModel(prediction=prediction) + +# evaluator = Evaluator(model=model, entities_to_keep=["SPACESHIP"]) + +# sample = InputSample( +# full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# assert evaluated.results[("O", "O")] == 4 + + +# def test_evaluate_same_entity_correct_statistics(): +# prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluation_result = evaluator.evaluate_sample(sample, prediction) +# assert evaluation_result.results[("O", "O")] == 2 +# assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 +# assert evaluation_result.results[("O", "ANIMAL")] == 1 + + +# def test_evaluate_multiple_entities_to_keep_correct_statistics(): +# prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] +# entities_to_keep = ["ANIMAL", "PLANT", "SPACESHIP"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=entities_to_keep) + +# sample = InputSample( +# full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluation_result = evaluator.evaluate_sample(sample, prediction) +# assert evaluation_result.results[("O", "O")] == 2 +# assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 +# assert evaluation_result.results[("O", "ANIMAL")] == 1 + + +# def test_evaluate_multiple_tokens_correct_statistics(): +# prediction = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, 
entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# "I am the walrus amaericanus magnifico", masked=None, spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] +# sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# evaluation = evaluator.calculate_score([evaluated]) + +# assert evaluation.pii_precision == 1 +# assert evaluation.pii_recall == 1 + + +# def test_evaluate_multiple_tokens_partial_match_correct_statistics(): +# prediction = ["O", "O", "O", "B-ANIMAL", "L-ANIMAL", "O"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# "I am the walrus amaericanus magnifico", masked=None, spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] +# sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# evaluation = evaluator.calculate_score([evaluated]) + +# assert evaluation.pii_precision == 1 +# assert evaluation.pii_recall == 4 / 6 + + +# def test_evaluate_multiple_tokens_no_match_match_correct_statistics(): +# prediction = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# "I am the walrus amaericanus magnifico", masked=None, spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] +# sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# evaluation = evaluator.calculate_score([evaluated]) + +# assert np.isnan(evaluation.pii_precision) +# assert evaluation.pii_recall == 0 + + +# def test_evaluate_multiple_examples_correct_statistics(): +# prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) +# input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) +# input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] +# input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] + +# evaluated = evaluator.evaluate_all( +# [input_sample, input_sample, input_sample, input_sample] +# ) +# scores = evaluator.calculate_score(evaluated) +# assert scores.pii_precision == 0.5 +# assert scores.pii_recall == 0.5 + + +# def test_evaluate_multiple_examples_ignore_entity_correct_statistics(): +# prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"] +# model = MockTokensModel(prediction=prediction) + +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "TENNIS_PLAYER"]) +# input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) +# input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] +# input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] + +# evaluated = evaluator.evaluate_all( +# [input_sample, input_sample, input_sample, input_sample] +# ) +# scores = evaluator.calculate_score(evaluated) +# assert scores.pii_precision == 1 +# assert scores.pii_recall == 1 + + +# def test_confusion_matrix_correct_metrics(): +# from collections import Counter + +# evaluated = [ +# EvaluationResult( +# results=Counter( +# { +# ("O", "O"): 150, +# ("O", "PERSON"): 30, +# ("O", "COMPANY"): 30, +# ("PERSON", "PERSON"): 40, +# 
("COMPANY", "COMPANY"): 40, +# ("PERSON", "COMPANY"): 10, +# ("COMPANY", "PERSON"): 10, +# ("PERSON", "O"): 30, +# ("COMPANY", "O"): 30, +# } +# ), +# model_errors=None, +# text=None, +# ) +# ] + +# model = MockTokensModel(prediction=None) +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "COMPANY"]) +# scores = evaluator.calculate_score(evaluated, beta=2.5) + +# assert scores.pii_precision == 0.625 +# assert scores.pii_recall == 0.625 +# assert scores.entity_recall_dict["PERSON"] == 0.5 +# assert scores.entity_precision_dict["PERSON"] == 0.5 +# assert scores.entity_recall_dict["COMPANY"] == 0.5 +# assert scores.entity_precision_dict["COMPANY"] == 0.5 + + +# def test_confusion_matrix_2_correct_metrics(): +# from collections import Counter + +# evaluated = [ +# EvaluationResult( +# results=Counter( +# { +# ("O", "O"): 65467, +# ("O", "ORG"): 4189, +# ("GPE", "O"): 3370, +# ("PERSON", "PERSON"): 2024, +# ("GPE", "PERSON"): 1488, +# ("GPE", "GPE"): 1033, +# ("O", "GPE"): 964, +# ("ORG", "ORG"): 914, +# ("O", "PERSON"): 834, +# ("GPE", "ORG"): 401, +# ("PERSON", "ORG"): 35, +# ("PERSON", "O"): 33, +# ("ORG", "O"): 8, +# ("PERSON", "GPE"): 5, +# ("ORG", "PERSON"): 1, +# } +# ), +# model_errors=None, +# text=None, +# ) +# ] + +# model = MockTokensModel(prediction=None) +# evaluator = Evaluator(model=model) +# scores = evaluator.calculate_score(evaluated, beta=2.5) + +# pii_tp = ( +# evaluated[0].results[("PERSON", "PERSON")] +# + evaluated[0].results[("ORG", "ORG")] +# + evaluated[0].results[("GPE", "GPE")] +# + evaluated[0].results[("ORG", "GPE")] +# + evaluated[0].results[("ORG", "PERSON")] +# + evaluated[0].results[("GPE", "ORG")] +# + evaluated[0].results[("GPE", "PERSON")] +# + evaluated[0].results[("PERSON", "GPE")] +# + evaluated[0].results[("PERSON", "ORG")] +# ) + +# pii_fp = ( +# evaluated[0].results[("O", "PERSON")] +# + evaluated[0].results[("O", "GPE")] +# + evaluated[0].results[("O", "ORG")] +# ) + +# pii_fn = ( +# evaluated[0].results[("PERSON", "O")] +# + evaluated[0].results[("GPE", "O")] +# + evaluated[0].results[("ORG", "O")] +# ) + +# assert scores.pii_precision == pii_tp / (pii_tp + pii_fp) +# assert scores.pii_recall == pii_tp / (pii_tp + pii_fn) + + +# def test_dataset_to_metric_identity_model(): +# import os + +# dir_path = os.path.dirname(os.path.realpath(__file__)) +# input_samples = InputSample.read_dataset_json( +# "{}/data/generated_small.json".format(dir_path), length=10 +# ) + +# model = IdentityTokensMockModel() +# evaluator = Evaluator(model=model) +# evaluation_results = evaluator.evaluate_all(input_samples) +# metrics = evaluator.calculate_score(evaluation_results) + +# assert metrics.pii_precision == 1 +# assert metrics.pii_recall == 1 + + +# def test_dataset_to_metric_50_50_model(): +# import os + +# dir_path = os.path.dirname(os.path.realpath(__file__)) +# input_samples = InputSample.read_dataset_json( +# "{}/data/generated_small.json".format(dir_path), length=100 +# ) + +# # Replace 50% of the predictions with a list of "O" +# model = FiftyFiftyIdentityTokensMockModel() +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) +# evaluation_results = evaluator.evaluate_all(input_samples) +# metrics = evaluator.calculate_score(evaluation_results) + +# print(metrics.pii_precision) +# print(metrics.pii_recall) +# print(metrics.pii_f) + +# assert metrics.pii_precision == 1 +# assert metrics.pii_recall < 0.75 +# assert metrics.pii_recall > 0.25 + + +# def test_align_entity_types_correct_output(): + +# sample1 = InputSample( +# "I live in 
ABC", +# spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], +# create_tags_from_span=False, +# ) +# sample2 = InputSample( +# "I live in ABC", +# spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("C", "c", 100, 101)], +# create_tags_from_span=False, +# ) +# samples = [sample1, sample2] +# mapping = { +# "A": "1", +# "B": "2", +# "C": "1", +# } + +# new_samples = Evaluator.align_entity_types(samples, mapping) + +# count_per_entity = Counter() +# for sample in new_samples: +# for span in sample.spans: +# count_per_entity[span.entity_type] += 1 + +# assert count_per_entity["1"] == 5 +# assert count_per_entity["2"] == 1 + + +# def test_align_entity_types_wrong_mapping_exception(): + +# sample1 = InputSample( +# "I live in ABC", +# spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], +# create_tags_from_span=False, +# ) + +# entities_mapping = {"Z": "z"} + +# with pytest.raises(ValueError): +# Evaluator.align_entity_types( +# input_samples=[sample1], entities_mapping=entities_mapping +# ) From 106aeeb6f1cdf05b1e968ce66b014e3e3a0fb7dd Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:23:00 +0100 Subject: [PATCH 11/16] Add simple case unittest for compare_span function --- tests/test_evaluator.py | 297 ++++------------------------------------ 1 file changed, 29 insertions(+), 268 deletions(-) diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 0305009..0aff5ee 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -32,6 +32,35 @@ def test_compare_span_simple_case_1(): evaluator = Evaluator(entities_to_keep=['PER', 'LOC', 'MISC']) span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + expected_span_outputs = [SpanOutput(output_type="SPURIOUS", + annotated_span=None, + predicted_span=Span(entity_type = "PER", entity_value = "", start_position = 24, end_position=30), + overlap_score=0), + SpanOutput(output_type="ENT_TYPE", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 127, end_position=134), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 124, end_position=134), + overlap_score=0.82), + SpanOutput(output_type="EXACT", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 164, end_position=174), + predicted_span=Span(entity_type = "PER", entity_value = "", start_position = 164, end_position=174), + overlap_score=1), + SpanOutput(output_type="STRICT", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + overlap_score=1), + SpanOutput(output_type="STRICT", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + overlap_score=1), + SpanOutput(output_type="PARTIAL", + annotated_span=Span(entity_type = "MISC", entity_value = "", start_position = 230, end_position=240), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 225, end_position=243), + overlap_score=0.71), + SpanOutput(output_type="MISSED", + annotated_span=Span(entity_type = "PER", entity_value = "", start_position = 59, end_position=69), + predicted_span=None, + overlap_score=0)] + expected_evaluation = 
{'strict': {'correct': 2, 'incorrect': 3, 'partial': 0, @@ -61,138 +90,6 @@ def test_compare_span_simple_case_1(): 'possible': 6, 'actual': 6} } - print(span_outputs) - print(expected_evaluation) - assert evaluation == expected_evaluation - -def test_compare_span_strict(): - annotated_spans =[Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'exact': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - expected_span_outputs = [SpanOutput( - output_type = "STRICT", - predicted_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - overlap_score = 1)] - - assert len(span_outputs) == len(expected_span_outputs) - assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) - assert evaluation['strict'] == expected_evaluation['strict'] - assert evaluation['ent_type'] == expected_evaluation['ent_type'] - assert evaluation['partial'] == expected_evaluation['partial'] - assert evaluation['exact'] == expected_evaluation['exact'] - - -def test_compare_span_ent_type(): - annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans =[Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 0, - 'incorrect': 0, - 'partial': 1, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': - 1, - 'possible': 1 - }, - 'exact': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - - expected_span_outputs = [SpanOutput( - output_type = "ENT_TYPE", - predicted_span = Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, 
end_position=24), - overlap_score = 0.72)] - assert len(span_outputs) == len(expected_span_outputs) assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) assert evaluation['strict'] == expected_evaluation['strict'] @@ -200,142 +97,6 @@ def test_compare_span_ent_type(): assert evaluation['partial'] == expected_evaluation['partial'] assert evaluation['exact'] == expected_evaluation['exact'] -def test_compare_span_exact(): - annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'exact': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - - expected_span_outputs = [SpanOutput( - output_type = "EXACT", - predicted_span = Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - overlap_score = 1)] - print(span_outputs) - - assert len(span_outputs) == len(expected_span_outputs) - assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) - assert evaluation['strict'] == expected_evaluation['strict'] - assert evaluation['ent_type'] == expected_evaluation['ent_type'] - assert evaluation['partial'] == expected_evaluation['partial'] - assert evaluation['exact'] == expected_evaluation['exact'] - -def test_compare_span_partial(): - annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 0, - 'incorrect': 0, - 'partial': 1, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': - 1, - 'possible': 1 - }, - 'exact': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - - expected_span_outputs = [SpanOutput( - output_type = "PARTIAL", - predicted_span = 
Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - overlap_score = 0.72)] - print(span_outputs) - - assert len(span_outputs) == len(expected_span_outputs) - assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) - assert evaluation['strict'] == expected_evaluation['strict'] - assert evaluation['ent_type'] == expected_evaluation['ent_type'] - assert evaluation['partial'] == expected_evaluation['partial'] - assert evaluation['exact'] == expected_evaluation['exact'] # TODO: refactor those functions # def test_evaluator_simple(): From abc9d5da237c5ef33da4c8be221473da54ea4146 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:50:22 +0100 Subject: [PATCH 12/16] Add function and unittest for get overlap score --- presidio_evaluator/data_objects.py | 9 +++++++++ tests/test_data_objects.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py index 2c028bc..2f320ed 100644 --- a/presidio_evaluator/data_objects.py +++ b/presidio_evaluator/data_objects.py @@ -4,6 +4,7 @@ from collections import Counter import pandas as pd +import numpy as np import spacy from spacy import Language from spacy.tokens import Doc, DocBin @@ -73,6 +74,14 @@ def intersect(self, other, ignore_entity_type: bool): return min(self.end_position, other.end_position) - max( self.start_position, other.start_position ) + + def get_overlap_ratio(self, other): + """ + Calculates the ratio as: ratio = 2.0*M / T , where M = matches , T = total number of elements in both sequences + """ + nb_matches = self.intersect(other, ignore_entity_type = True) + total_characters = (self.end_position - self.start_position) + (other.end_position - other.start_position) + return np.round((2*nb_matches/total_characters), 2) @classmethod def from_faker_span(cls, faker_span: FakerSpan) -> "Span": diff --git a/tests/test_data_objects.py b/tests/test_data_objects.py index 97e2713..96963ab 100644 --- a/tests/test_data_objects.py +++ b/tests/test_data_objects.py @@ -181,3 +181,25 @@ def test_spans_intersection( intersection = span1.intersect(span2, ignore_entity_type=ignore_entity_type) assert intersection == intersection_length + +@pytest.mark.parametrize( + "start1, end1, start2, end2, expected_overlap_ratio", + [ + (150, 153, 160, 165, 0.0), + (150, 153, 150, 153, 1.0), + (150, 153, 152, 154, 0.4), + (150, 153, 100, 151, 0.04), + ], +) +def test_get_overlap_ratio( + start1, end1, start2, end2, expected_overlap_ratio +): + span1 = Span( + entity_type="A", entity_value="123", start_position=start1, end_position=end1 + ) + span2 = Span( + entity_type="B", entity_value="123", start_position=start2, end_position=end2 + ) + + overlap_ratio = span1.get_overlap_ratio(span2) + assert overlap_ratio == expected_overlap_ratio From e285ef4f8f59650beb9ce992e0ddd9cfa2d9d080 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:55:18 +0100 Subject: [PATCH 13/16] Fix bugs in compare_span function. 
--- presidio_evaluator/evaluation/evaluator.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index bb105fd..620352f 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -83,7 +83,7 @@ def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) # annotated_spans = model_prediction.input_sample.spans # predicted_spans = model_prediction.predicted_spans - eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0} + eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0} evaluation = { 'strict': deepcopy(eval_metrics), 'ent_type': deepcopy(eval_metrics), @@ -151,11 +151,7 @@ def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) break # Check overlapping between true and pred elif evaluation_helpers.find_overlap(true_range, pred_range): - # overlap_ratio = SequenceMatcher(None, - # pred.entity_value, - # true.entity_value).ratio() - overlap_ratio = pred.intersect(true) - print(overlap_ratio) + overlap_ratio = pred.get_overlap_ratio(true) true_which_overlapped_with_pred.append(true) # Scenario V: There is an overlap (but offsets do not match exactly), # and the entity type is the same From e3acf9c96ecbb5caea04b5767f817edb62a2a882 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:56:00 +0100 Subject: [PATCH 14/16] Add functions to helpers --- presidio_evaluator/evaluation_helpers.py | 114 ++++++++++++++--------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/presidio_evaluator/evaluation_helpers.py b/presidio_evaluator/evaluation_helpers.py index 286cc24..54d5b7c 100644 --- a/presidio_evaluator/evaluation_helpers.py +++ b/presidio_evaluator/evaluation_helpers.py @@ -22,57 +22,79 @@ def get_matched_gold(predicted_span: Span, overlap_score=0 ) +def find_overlap(true_range, pred_range): + """Find the overlap between two ranges + Find the overlap between two ranges. Return the overlapping values if + present, else return an empty set(). + Examples: + >>> find_overlap((1, 2), (2, 3)) + 2 + >>> find_overlap((1, 2), (3, 4)) + set() + """ + + true_set = set(true_range) + pred_set = set(pred_range) + + overlaps = true_set.intersection(pred_set) + + return overlaps + def span_compute_actual_possible(results: dict) -> dict: - """ - Take the result dict and calculate the actual and possible spans - """ - strict = results["strict"] - exact = results["exact"] - incorrect = results["incorrect"] - partial = results["partial"] - missed = results["miss"] - spurious = results["spurious"] - # Possible: Number of annotations in the gold-standard which contribute to the final score - possible = strict + exact + incorrect + partial + missed - # Actual: Number of annotations produced by the PII detection system - actual = strict + exact + incorrect + partial + spurious - - results["actual"] = actual - results["possible"] = possible - - return results - -def span_compute_precision_recall(results: dict) -> dict: """ - Take the result dict to calculate the strict and flexible precision/ recall + Takes a result dict that has been output by compute metrics. + Returns the results dict with actual, possible populated. 
+ When the results dicts is from partial or ent_type metrics, then + partial_or_type=True to ensure the right calculation is used for + calculating precision and recall. + """ + + correct = results['correct'] + incorrect = results['incorrect'] + partial = results['partial'] + missed = results['missed'] + spurious = results['spurious'] + + # Possible: number annotations in the gold-standard which contribute to the + # final score + + possible = correct + incorrect + partial + missed + + # Actual: number of annotations produced by the NER system + + actual = correct + incorrect + partial + spurious + + results["actual"] = actual + results["possible"] = possible + + return results + +def span_compute_precision_recall(results: dict, partial_or_type) -> dict: + """ + Takes a result dict that has been output by compute metrics. + Returns the results dict with precison and recall populated. + When the results dicts is from partial or ent_type metrics, then + partial_or_type=True to ensure the right calculation is used for + calculating precision and recall. """ - metrics = {} - strict = results["strict"] - exact = results["exact"] - partial = results["partial"] + actual = results["actual"] possible = results["possible"] - - # Calculate the strict performance - strict_precision = strict / actual if actual > 0 else 0 - strict_recall = strict / possible if possible > 0 else 0 - - # Calculate the flexible performance - flexible_precision = (strict + exact)/ actual if actual > 0 else 0 - flexible_recall = (strict + exact) / possible if possible > 0 else 0 - - # Calculate the partial performance - partial_precision = (strict + exact + 0.5 * partial) / actual if actual > 0 else 0 - partial_recall = (strict + exact + 0.5 * partial) / possible if possible > 0 else 0 - - - metrics["strict precision"] = strict_precision - metrics["strict recall"] = strict_recall - metrics["flexible precision"] = flexible_precision - metrics["flexible recall"] = flexible_recall - metrics["partial precision"] = partial_precision - metrics["partial recall"] = partial_recall - return metrics + partial = results['partial'] + correct = results['correct'] + + if partial_or_type: + precision = (correct + 0.5 * partial) / actual if actual > 0 else 0 + recall = (correct + 0.5 * partial) / possible if possible > 0 else 0 + + else: + precision = correct / actual if actual > 0 else 0 + recall = correct / possible if possible > 0 else 0 + + results["precision"] = precision + results["recall"] = recall + + return results # TODO: Implement this function def dict_merge(dict_1: dict, dict2: dict) -> dict: From c92b543b81f94442162f89336b2ff60bb4dfdc1c Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 16:30:35 +0100 Subject: [PATCH 15/16] Add test for span equal function --- tests/test_data_objects.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_data_objects.py b/tests/test_data_objects.py index 96963ab..084d9c9 100644 --- a/tests/test_data_objects.py +++ b/tests/test_data_objects.py @@ -203,3 +203,25 @@ def test_get_overlap_ratio( overlap_ratio = span1.get_overlap_ratio(span2) assert overlap_ratio == expected_overlap_ratio + +@pytest.mark.parametrize( + "start1, end1, entity_value1, entity_type1, start2, end2, entity_value2, entity_type2, expected_output", + [ + (150, 153, "123", "A", 150, 153, "123", "A", True), + (150, 153, "123", "B", 150, 153, "123", "A", False), + (150, 153, "123", "A", 150, 153, "345", "A", False), + (150, 153, 
"123", "A", 153, 156, "123", "A", False), + ], +) +def test_span_eq( + start1, end1, entity_value1, entity_type1, start2, end2, entity_value2, entity_type2, expected_output +): + span1 = Span( + entity_type=entity_type1, entity_value=entity_value1, start_position=start1, end_position=end1 + ) + span2 = Span( + entity_type=entity_type2, entity_value=entity_value2, start_position=start2, end_position=end2 + ) + + output = span1.__eq__(span2) + assert output == expected_output From d123ccadd9236ce2e54cda9c4dac9df38640e4c0 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Tue, 31 Jan 2023 08:12:52 +0100 Subject: [PATCH 16/16] Update docs --- presidio_evaluator/evaluation/evaluator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 620352f..634c1ba 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -70,14 +70,15 @@ def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOu return List[TokenOutput], Counter # def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: - def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) -> Tuple[List[SpanOutput], dict[dict]]: + def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) -> Tuple[List[SpanOutput], dict[dict], dict[dict]]: """ Compares ground truth tags (annotation) and predicted (prediction) at span level. - :param annotated_spans: model_prediction containing an InputSample and a list of predicted tags and tokens - :param predicted_spans: + :param annotated_spans: truth annotation from InputSample + :param predicted_spans: predicted span from PII model/system Returns: List[SpanOutput]: a list of SpanOutput - dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} + dict: a dictionary of global PII results with structure {eval_type : {}} + dict: a dictionary of PII results per entity with structure {entity_name: {eval_type : {}}} """ # get annotated and predicted span from ModelPrediction # annotated_spans = model_prediction.input_sample.spans