From 9018e6c9059893628d032a94ac905e085430f3f3 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 13:31:48 +0100 Subject: [PATCH 01/16] Initialize SampleError class --- presidio_evaluator/evaluation/model_error.py | 175 ------------------ presidio_evaluator/evaluation/sample_error.py | 44 +++++ 2 files changed, 44 insertions(+), 175 deletions(-) delete mode 100644 presidio_evaluator/evaluation/model_error.py create mode 100644 presidio_evaluator/evaluation/sample_error.py diff --git a/presidio_evaluator/evaluation/model_error.py b/presidio_evaluator/evaluation/model_error.py deleted file mode 100644 index cbbe241..0000000 --- a/presidio_evaluator/evaluation/model_error.py +++ /dev/null @@ -1,175 +0,0 @@ -from typing import Dict, List - -import pandas as pd -from spacy.tokens import Token - - -class ModelError: - def __init__( - self, - error_type: str, - annotation: str, - prediction: str, - token: Token, - full_text: str, - metadata: Dict, - ): - """ - Holds information about an error a model made for analysis purposes - :param error_type: str, e.g. FP, FN, Person->Address etc. - :param annotation: ground truth value - :param prediction: predicted value - :param token: token in question - :param full_text: full input text - :param metadata: metadata on text from InputSample - """ - - self.error_type = error_type - self.annotation = annotation - self.prediction = prediction - self.token = token - self.full_text = full_text - self.metadata = metadata - - def __str__(self): - return ( - "type: {}, " - "Annotation = {}, " - "prediction = {}, " - "Token = {}, " - "Full text = {}, " - "Metadata = {}".format( - self.error_type, - self.annotation, - self.prediction, - self.token, - self.full_text, - self.metadata, - ) - ) - - def __repr__(self): - return f" Date: Mon, 16 Jan 2023 15:47:48 +0100 Subject: [PATCH 02/16] Initilize TokenOutput class --- presidio_evaluator/evaluation/token_output.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 presidio_evaluator/evaluation/token_output.py diff --git a/presidio_evaluator/evaluation/token_output.py b/presidio_evaluator/evaluation/token_output.py new file mode 100644 index 0000000..d8e518a --- /dev/null +++ b/presidio_evaluator/evaluation/token_output.py @@ -0,0 +1,56 @@ +from typing import Optional, List +from spacy.tokens import Token + +from presidio_evaluator import Span, InputSample + + +class TokenOutput: + def __init__( + self, + error_type: str, + annotated_tag: str, + predicted_tag: str, + token: Token, + ): + """ + Holds information about a token error a model made for analysis purposes + :param error_type: str, e.g. FP, FN, Person->Address etc. + :param annotated_tag: str, actual label, e.g. Person + :param predicted_tag: str, predicted label, e.g. Address + :param token: str, token in question + """ + + self.error_type = error_type + self.annotated_tag = annotated_tag + self.predicted_tag = predicted_tag + self.token = token + + def __str__(self): + return ( + "type: {}, " + "Annotated tag = {}, " + "Predicted tag = {}, " + "Token = {}".format( + self.error_type, + self.annotated_tag, + self.predicted_tag, + self.token + ) + ) + + def __repr__(self): + return f" List["TokenOutput"]: + """ + Print the n most common tokens by error type + :param error_type: str, token error type, e.g. FP, FN + :param errors: List of token error in TokenOutput format. + :param n: int, top n most common fp to filter. 
+ :param entity: str, List of entities to filter, e.g. Person, Address + """ + return List["TokenOutput"] From f6c38403f025e3f240ec9c20403cb262af3b8f93 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:15:31 +0100 Subject: [PATCH 03/16] Initialize SpanOutput. Rename method in TokenOuput --- presidio_evaluator/evaluation/span_output.py | 47 +++++++++++++++++++ presidio_evaluator/evaluation/token_output.py | 12 ++--- 2 files changed, 52 insertions(+), 7 deletions(-) create mode 100644 presidio_evaluator/evaluation/span_output.py diff --git a/presidio_evaluator/evaluation/span_output.py b/presidio_evaluator/evaluation/span_output.py new file mode 100644 index 0000000..3e7d4f0 --- /dev/null +++ b/presidio_evaluator/evaluation/span_output.py @@ -0,0 +1,47 @@ +from typing import Optional, List + +from presidio_evaluator import Span + + +class SpanOutput: + def __init__( + self, + output_type: str, + overlap_score: float, + annotated_span: Optional[Span] = None, + predicted_span: Optional[Span] = None + ): + """ + Holds information about span prediction output for analysis purposes + :param error_type: str, e.g. strict, exact, partial, incorrect, miss, spurious. + :param overlap_score: float, overlapping ratio between annotated_span and predicted_span + :param annotated_span: str, actual span which comes from the annotated file, e.g. Address + :param predicted_span: str, predicted span of a given model + """ + self.output_type = output_type + self.overlap_score = overlap_score + self.annotated_span = annotated_span + self.predicted_span = predicted_span + + def __repr__(self): + return ( + f"Output type: {self.output_type}\n" + f"Overlap score: {self.overlap_score}\n" + f"Annotated span: {self.annotated_span}\n" + f"Predicted span: {self.predicted_span}\n" + ) + + @staticmethod + def get_span_output_by_type(outputs=List["SpanOutput"], + error_type=str, + n: Optional[int]=None, + entity=None) -> List["SpanOutput"]: + """ + Print the n most common tokens by error type + :param outputs: List of span errors in SpanOutput format. + :param error_type: str, span error type, e.g. strict, exact, partial, incorrect, miss, spurious + :param n: int, top n most common output to filter. If n is None, all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. If entity is None, all entities are returned. + """ + return List["SpanOutput"] + \ No newline at end of file diff --git a/presidio_evaluator/evaluation/token_output.py b/presidio_evaluator/evaluation/token_output.py index d8e518a..e9a1152 100644 --- a/presidio_evaluator/evaluation/token_output.py +++ b/presidio_evaluator/evaluation/token_output.py @@ -1,8 +1,6 @@ from typing import Optional, List from spacy.tokens import Token -from presidio_evaluator import Span, InputSample - class TokenOutput: def __init__( @@ -42,15 +40,15 @@ def __repr__(self): return f" List["TokenOutput"]: """ Print the n most common tokens by error type - :param error_type: str, token error type, e.g. FP, FN :param errors: List of token error in TokenOutput format. - :param n: int, top n most common fp to filter. - :param entity: str, List of entities to filter, e.g. Person, Address + :param error_type: str, token error type, e.g. FP, FN + :param n: int, top n most common error to filter. If n is None, all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. 
If entity is None, all entities are returned. """ return List["TokenOutput"] From c6fb0e4e2abd05d3929d02c44af58e22de553b82 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:33:34 +0100 Subject: [PATCH 04/16] Initialize Evaluator class --- presidio_evaluator/evaluation/__init__.py | 5 +- presidio_evaluator/evaluation/evaluator.py | 673 ++++++++++-------- .../evaluation/evaluator_objects.py | 115 +++ presidio_evaluator/evaluation/span_output.py | 47 -- presidio_evaluator/evaluation/token_output.py | 54 -- presidio_evaluator/evaluation_helpers.py | 0 6 files changed, 485 insertions(+), 409 deletions(-) create mode 100644 presidio_evaluator/evaluation/evaluator_objects.py delete mode 100644 presidio_evaluator/evaluation/span_output.py delete mode 100644 presidio_evaluator/evaluation/token_output.py create mode 100644 presidio_evaluator/evaluation_helpers.py diff --git a/presidio_evaluator/evaluation/__init__.py b/presidio_evaluator/evaluation/__init__.py index f2cc9cd..1ee63f2 100644 --- a/presidio_evaluator/evaluation/__init__.py +++ b/presidio_evaluator/evaluation/__init__.py @@ -1,5 +1,6 @@ -from .model_error import ModelError +from .evaluator_objects import SpanOutput, TokenOutput, ModelPrediction +from .sample_error import SampleError from .evaluation_result import EvaluationResult from .evaluator import Evaluator -__all__ = ["ModelError", "EvaluationResult", "Evaluator"] +__all__ = ["SpanOutput", "TokenOutput", "ModelPrediction", "SampleError", "EvaluationResult", "Evaluator"] \ No newline at end of file diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 6532657..2ed020b 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Tuple from pathlib import Path import numpy as np @@ -9,346 +9,407 @@ import pandas as pd from presidio_evaluator import InputSample -from presidio_evaluator.evaluation import EvaluationResult, ModelError -from presidio_evaluator.models import BaseModel +from presidio_evaluator.evaluation import (TokenOutput, + SpanOutput, + ModelPrediction, + EvaluationResult, + SampleError) class Evaluator: def __init__( self, - model: BaseModel, verbose: bool = False, compare_by_io=True, entities_to_keep: Optional[List[str]] = None, + span_overlap_threshold: float = 0.5 ): """ Evaluate a PII detection model or a Presidio analyzer / recognizer - - :param model: Instance of a fitted model (of base type BaseModel) :param compare_by_io: True if comparison should be done on the entity level and not the sub-entity level :param entities_to_keep: List of entity names to focus the evaluator on (and ignore the rest). Default is None = all entities. If the provided model has a list of entities to keep, this list would be used for evaluation. 
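+        :param span_overlap_threshold: float, overlap threshold used when matching predicted
+        spans to annotated spans during span evaluation (default 0.5)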
""" - self.model = model self.verbose = verbose self.compare_by_io = compare_by_io self.entities_to_keep = entities_to_keep - if self.entities_to_keep is None and self.model.entities: - self.entities_to_keep = self.model.entities + self.span_overlap_threshold = span_overlap_threshold - def compare(self, input_sample: InputSample, prediction: List[str]): - - """ - Compares ground truth tags (annotation) and predicted (prediction) - :param input_sample: input sample containing list of tags with scheme - :param prediction: predicted value for each token - self.labeling_scheme + def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOutput], Counter]: """ - annotation = input_sample.tags - tokens = input_sample.tokens - - if len(annotation) != len(prediction): - print( - "Annotation and prediction do not have the" - "same length. Sample={}".format(input_sample) - ) - return Counter(), [] - - results = Counter() - mistakes = [] - - new_annotation = annotation.copy() - - if self.compare_by_io: - new_annotation = self._to_io(new_annotation) - prediction = self._to_io(prediction) - - # Ignore annotations that aren't in the list of - # requested entities. - if self.entities_to_keep: - prediction = self._adjust_per_entities(prediction) - new_annotation = self._adjust_per_entities(new_annotation) - for i in range(0, len(new_annotation)): - results[(new_annotation[i], prediction[i])] += 1 - - if self.verbose: - print("Annotation:", new_annotation[i]) - print("Prediction:", prediction[i]) - print(results) - - # check if there was an error - is_error = new_annotation[i] != prediction[i] - if is_error: - if prediction[i] == "O": - mistakes.append( - ModelError( - error_type="FN", - annotation=new_annotation[i], - prediction=prediction[i], - token=tokens[i], - full_text=input_sample.full_text, - metadata=input_sample.metadata, - ) - ) - elif new_annotation[i] == "O": - mistakes.append( - ModelError( - error_type="FP", - annotation=new_annotation[i], - prediction=prediction[i], - token=tokens[i], - full_text=input_sample.full_text, - metadata=input_sample.metadata, - ) - ) - else: - mistakes.append( - ModelError( - error_type="Wrong entity", - annotation=new_annotation[i], - prediction=prediction[i], - token=tokens[i], - full_text=input_sample.full_text, - metadata=input_sample.metadata, - ) - ) - - return results, mistakes - - def _adjust_per_entities(self, tags): - if self.entities_to_keep: - return [tag if tag in self.entities_to_keep else "O" for tag in tags] - else: - return tags - - @staticmethod - def _to_io(tags): - """ - Translates BILUO/BIO/IOB to IO - only In or Out of entity. - ['B-PERSON','I-PERSON','L-PERSON'] is translated into - ['PERSON','PERSON','PERSON'] - :param tags: the input tags in BILUO/IOB/BIO format - :return: a new list of IO tags + Compares ground truth tags (annotation) and predicted (prediction) at token level. 
+ Return a list of TokenOutput and a list of objects of type Counter with structure {(actual, predicted) : count} + :param model_prediction: model_prediction containing an InputSample and a list of predicted tags and tokens """ - return [tag[2:] if "-" in tag else tag for tag in tags] - - def evaluate_sample( - self, sample: InputSample, prediction: List[str] - ) -> EvaluationResult: - if self.verbose: - print("Input sentence: {}".format(sample.full_text)) - - results, mistakes = self.compare(input_sample=sample, prediction=prediction) - return EvaluationResult(results, mistakes, sample.full_text) - - def evaluate_all(self, dataset: List[InputSample]) -> List[EvaluationResult]: - evaluation_results = [] - if self.model.entity_mapping: - print( - f"Mapping entity values using this dictionary: {self.model.entity_mapping}" - ) - for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"): - - # Align tag values to the ones expected by the model - self.model.align_entity_types(sample) - - # Predict - prediction = self.model.predict(sample) - - # Remove entities not requested - prediction = self.model.filter_tags_in_supported_entities(prediction) - - # Switch to requested labeling scheme (IO/BIO/BILUO) - prediction = self.model.to_scheme(prediction) - - evaluation_result = self.evaluate_sample( - sample=sample, prediction=prediction - ) - evaluation_results.append(evaluation_result) - return evaluation_results + return List[TokenOutput], Counter - @staticmethod - def align_entity_types( - input_samples: List[InputSample], - entities_mapping: Dict[str, str] = None, - allow_missing_mappings: bool = False, - ) -> List[InputSample]: + def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: """ - Change input samples to conform with Presidio's entities - :return: new list of InputSample + Compares ground truth tags (annotation) and predicted (prediction) at span level. 
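+        The possible span output types are the ones documented on SpanOutput
+        (e.g. strict, exact, partial, incorrect, miss, spurious).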
+ :param model_prediction: model_prediction containing an InputSample and a list of predicted tags and tokens + Returns: + List[SpanOutput]: a list of SpanOutput + dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} """ - new_input_samples = input_samples.copy() - - # A list that will contain updated input samples, - new_list = [] - - for input_sample in new_input_samples: - contains_field_in_mapping = False - new_spans = [] - # Update spans to match the entity types in the values of entities_mapping - for span in input_sample.spans: - if span.entity_type in entities_mapping.keys(): - new_name = entities_mapping.get(span.entity_type) - span.entity_type = new_name - contains_field_in_mapping = True - - new_spans.append(span) - else: - if not allow_missing_mappings: - raise ValueError( - f"Key {span.entity_type} cannot be found in the provided entities_mapping" - ) - input_sample.spans = new_spans - - # Update tags in case this sample has relevant entities for evaluation - if contains_field_in_mapping: - for i, tag in enumerate(input_sample.tags): - has_prefix = "-" in tag - if has_prefix: - prefix = tag[:2] - clean = tag[2:] - else: - prefix = "" - clean = tag - - if clean in entities_mapping.keys(): - new_name = entities_mapping.get(clean) - input_sample.tags[i] = "{}{}".format(prefix, new_name) - else: - input_sample.tags[i] = "O" - - new_list.append(input_sample) - - return new_list - # Iterate on all samples - - def calculate_score( - self, - evaluation_results: List[EvaluationResult], - entities: Optional[List[str]] = None, - beta: float = 2.5, - ) -> EvaluationResult: - """ - Returns the pii_precision, pii_recall, f_measure either and number of records for each entity - or for all entities (ignore_entity_type = True) - :param evaluation_results: List of EvaluationResult - :param entities: List of entities to calculate score to. 
Default is None: all entities - :param beta: F measure beta value - between different entity types, or to treat these as misclassifications - :return: EvaluationResult with precision, recall and f measures - """ - - # aggregate results - all_results = sum([er.results for er in evaluation_results], Counter()) - - # compute pii_recall per entity - entity_recall = {} - entity_precision = {} - n = {} - if not entities: - entities = list(set([x[0] for x in all_results.keys() if x[0] != "O"])) - - for entity in entities: - # all annotation of given type - annotated = sum([all_results[x] for x in all_results if x[0] == entity]) - predicted = sum([all_results[x] for x in all_results if x[1] == entity]) - n[entity] = annotated - tp = all_results[(entity, entity)] - - if annotated > 0: - entity_recall[entity] = tp / annotated - else: - entity_recall[entity] = np.NaN - - if predicted > 0: - per_entity_tp = all_results[(entity, entity)] - entity_precision[entity] = per_entity_tp / predicted - else: - entity_precision[entity] = np.NaN - - # compute pii_precision and pii_recall - annotated_all = sum([all_results[x] for x in all_results if x[0] != "O"]) - predicted_all = sum([all_results[x] for x in all_results if x[1] != "O"]) - if annotated_all > 0: - pii_recall = ( - sum( - [ - all_results[x] - for x in all_results - if (x[0] != "O" and x[1] != "O") - ] - ) - / annotated_all - ) - else: - pii_recall = np.NaN - if predicted_all > 0: - pii_precision = ( - sum( - [ - all_results[x] - for x in all_results - if (x[0] != "O" and x[1] != "O") - ] - ) - / predicted_all - ) - else: - pii_precision = np.NaN - # compute pii_f_beta-score - pii_f_beta = self.f_beta(pii_precision, pii_recall, beta) - - # aggregate errors - errors = [] - for res in evaluation_results: - if res.model_errors: - errors.extend(res.model_errors) - - evaluation_result = EvaluationResult( - results=all_results, - model_errors=errors, - pii_precision=pii_precision, - pii_recall=pii_recall, - entity_recall_dict=entity_recall, - entity_precision_dict=entity_precision, - n_dict=n, - pii_f=pii_f_beta, - n=sum(n.values()), - ) - - return evaluation_result - - @staticmethod - def precision(tp: int, fp: int) -> float: - return tp / (tp + fp + 1e-100) + return List[SpanOutput], dict[dict] - @staticmethod - def recall(tp: int, fn: int) -> float: - return tp / (tp + fn + 1e-100) - - @staticmethod - def f_beta(precision: float, recall: float, beta: float) -> float: + def evaluate_all(self, model_predictions: List[ModelPrediction]) -> EvaluationResult: """ - Returns the F score for precision, recall and a beta parameter - :param precision: a float with the precision value - :param recall: a float with the recall value - :param beta: a float with the beta parameter of the F measure, - which gives more or less weight to precision - vs. recall - :return: a float value of the f(beta) measure. + Evaluate the PII performance at token and span levels. 
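+        Aggregates the per-sample token and span outputs into the confusion matrix and
+        metrics held by EvaluationResult.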
+ :param model_predictions: list of ModelPrediction + Returns: + EvaluationResult: the evaluation outcomes in EvaluationResult format """ - if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0): - return np.nan - return ((1 + beta ** 2) * precision * recall) / ( - ((beta ** 2) * precision) + recall + return EvaluationResult( + sample_errors = None, + token_confusion_matrix = None, + token_model_metrics = None, + span_model_metrics = None ) +# TODO: Old class, will be replace by new Evaluator class +# class Evaluator: +# def __init__( +# self, +# model: BaseModel, +# verbose: bool = False, +# compare_by_io=True, +# entities_to_keep: Optional[List[str]] = None, +# ): +# """ +# Evaluate a PII detection model or a Presidio analyzer / recognizer + +# :param model: Instance of a fitted model (of base type BaseModel) +# :param compare_by_io: True if comparison should be done on the entity +# level and not the sub-entity level +# :param entities_to_keep: List of entity names to focus the evaluator on (and ignore the rest). +# Default is None = all entities. If the provided model has a list of entities to keep, +# this list would be used for evaluation. +# """ +# self.model = model +# self.verbose = verbose +# self.compare_by_io = compare_by_io +# self.entities_to_keep = entities_to_keep +# if self.entities_to_keep is None and self.model.entities: +# self.entities_to_keep = self.model.entities + +# def compare(self, input_sample: InputSample, prediction: List[str]): + +# """ +# Compares ground truth tags (annotation) and predicted (prediction) +# :param input_sample: input sample containing list of tags with scheme +# :param prediction: predicted value for each token +# self.labeling_scheme + +# """ +# annotation = input_sample.tags +# tokens = input_sample.tokens + +# if len(annotation) != len(prediction): +# print( +# "Annotation and prediction do not have the" +# "same length. Sample={}".format(input_sample) +# ) +# return Counter(), [] + +# results = Counter() +# mistakes = [] + +# new_annotation = annotation.copy() + +# if self.compare_by_io: +# new_annotation = self._to_io(new_annotation) +# prediction = self._to_io(prediction) + +# # Ignore annotations that aren't in the list of +# # requested entities. 
+# if self.entities_to_keep: +# prediction = self._adjust_per_entities(prediction) +# new_annotation = self._adjust_per_entities(new_annotation) +# for i in range(0, len(new_annotation)): +# results[(new_annotation[i], prediction[i])] += 1 + +# if self.verbose: +# print("Annotation:", new_annotation[i]) +# print("Prediction:", prediction[i]) +# print(results) + +# # check if there was an error +# is_error = new_annotation[i] != prediction[i] +# if is_error: +# if prediction[i] == "O": +# mistakes.append( +# ModelError( +# error_type="FN", +# annotation=new_annotation[i], +# prediction=prediction[i], +# token=tokens[i], +# full_text=input_sample.full_text, +# metadata=input_sample.metadata, +# ) +# ) +# elif new_annotation[i] == "O": +# mistakes.append( +# ModelError( +# error_type="FP", +# annotation=new_annotation[i], +# prediction=prediction[i], +# token=tokens[i], +# full_text=input_sample.full_text, +# metadata=input_sample.metadata, +# ) +# ) +# else: +# mistakes.append( +# ModelError( +# error_type="Wrong entity", +# annotation=new_annotation[i], +# prediction=prediction[i], +# token=tokens[i], +# full_text=input_sample.full_text, +# metadata=input_sample.metadata, +# ) +# ) + +# return results, mistakes + +# def _adjust_per_entities(self, tags): +# if self.entities_to_keep: +# return [tag if tag in self.entities_to_keep else "O" for tag in tags] +# else: +# return tags + +# @staticmethod +# def _to_io(tags): +# """ +# Translates BILUO/BIO/IOB to IO - only In or Out of entity. +# ['B-PERSON','I-PERSON','L-PERSON'] is translated into +# ['PERSON','PERSON','PERSON'] +# :param tags: the input tags in BILUO/IOB/BIO format +# :return: a new list of IO tags +# """ +# return [tag[2:] if "-" in tag else tag for tag in tags] + +# def evaluate_sample( +# self, sample: InputSample, prediction: List[str] +# ) -> EvaluationResult: +# if self.verbose: +# print("Input sentence: {}".format(sample.full_text)) + +# results, mistakes = self.compare(input_sample=sample, prediction=prediction) +# return EvaluationResult(results, mistakes, sample.full_text) + +# def evaluate_all(self, dataset: List[InputSample]) -> List[EvaluationResult]: +# evaluation_results = [] +# if self.model.entity_mapping: +# print( +# f"Mapping entity values using this dictionary: {self.model.entity_mapping}" +# ) +# for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"): + +# # Align tag values to the ones expected by the model +# self.model.align_entity_types(sample) + +# # Predict +# prediction = self.model.predict(sample) + +# # Remove entities not requested +# prediction = self.model.filter_tags_in_supported_entities(prediction) + +# # Switch to requested labeling scheme (IO/BIO/BILUO) +# prediction = self.model.to_scheme(prediction) + +# evaluation_result = self.evaluate_sample( +# sample=sample, prediction=prediction +# ) +# evaluation_results.append(evaluation_result) + +# return evaluation_results + +# @staticmethod +# def align_entity_types( +# input_samples: List[InputSample], +# entities_mapping: Dict[str, str] = None, +# allow_missing_mappings: bool = False, +# ) -> List[InputSample]: +# """ +# Change input samples to conform with Presidio's entities +# :return: new list of InputSample +# """ + +# new_input_samples = input_samples.copy() + +# # A list that will contain updated input samples, +# new_list = [] + +# for input_sample in new_input_samples: +# contains_field_in_mapping = False +# new_spans = [] +# # Update spans to match the entity types in the values of entities_mapping +# for span in 
input_sample.spans: +# if span.entity_type in entities_mapping.keys(): +# new_name = entities_mapping.get(span.entity_type) +# span.entity_type = new_name +# contains_field_in_mapping = True + +# new_spans.append(span) +# else: +# if not allow_missing_mappings: +# raise ValueError( +# f"Key {span.entity_type} cannot be found in the provided entities_mapping" +# ) +# input_sample.spans = new_spans + +# # Update tags in case this sample has relevant entities for evaluation +# if contains_field_in_mapping: +# for i, tag in enumerate(input_sample.tags): +# has_prefix = "-" in tag +# if has_prefix: +# prefix = tag[:2] +# clean = tag[2:] +# else: +# prefix = "" +# clean = tag + +# if clean in entities_mapping.keys(): +# new_name = entities_mapping.get(clean) +# input_sample.tags[i] = "{}{}".format(prefix, new_name) +# else: +# input_sample.tags[i] = "O" + +# new_list.append(input_sample) + +# return new_list +# # Iterate on all samples + +# def calculate_score( +# self, +# evaluation_results: List[EvaluationResult], +# entities: Optional[List[str]] = None, +# beta: float = 2.5, +# ) -> EvaluationResult: +# """ +# Returns the pii_precision, pii_recall, f_measure either and number of records for each entity +# or for all entities (ignore_entity_type = True) +# :param evaluation_results: List of EvaluationResult +# :param entities: List of entities to calculate score to. Default is None: all entities +# :param beta: F measure beta value +# between different entity types, or to treat these as misclassifications +# :return: EvaluationResult with precision, recall and f measures +# """ + +# # aggregate results +# all_results = sum([er.results for er in evaluation_results], Counter()) + +# # compute pii_recall per entity +# entity_recall = {} +# entity_precision = {} +# n = {} +# if not entities: +# entities = list(set([x[0] for x in all_results.keys() if x[0] != "O"])) + +# for entity in entities: +# # all annotation of given type +# annotated = sum([all_results[x] for x in all_results if x[0] == entity]) +# predicted = sum([all_results[x] for x in all_results if x[1] == entity]) +# n[entity] = annotated +# tp = all_results[(entity, entity)] + +# if annotated > 0: +# entity_recall[entity] = tp / annotated +# else: +# entity_recall[entity] = np.NaN + +# if predicted > 0: +# per_entity_tp = all_results[(entity, entity)] +# entity_precision[entity] = per_entity_tp / predicted +# else: +# entity_precision[entity] = np.NaN + +# # compute pii_precision and pii_recall +# annotated_all = sum([all_results[x] for x in all_results if x[0] != "O"]) +# predicted_all = sum([all_results[x] for x in all_results if x[1] != "O"]) +# if annotated_all > 0: +# pii_recall = ( +# sum( +# [ +# all_results[x] +# for x in all_results +# if (x[0] != "O" and x[1] != "O") +# ] +# ) +# / annotated_all +# ) +# else: +# pii_recall = np.NaN +# if predicted_all > 0: +# pii_precision = ( +# sum( +# [ +# all_results[x] +# for x in all_results +# if (x[0] != "O" and x[1] != "O") +# ] +# ) +# / predicted_all +# ) +# else: +# pii_precision = np.NaN +# # compute pii_f_beta-score +# pii_f_beta = self.f_beta(pii_precision, pii_recall, beta) + +# # aggregate errors +# errors = [] +# for res in evaluation_results: +# if res.model_errors: +# errors.extend(res.model_errors) + +# evaluation_result = EvaluationResult( +# results=all_results, +# model_errors=errors, +# pii_precision=pii_precision, +# pii_recall=pii_recall, +# entity_recall_dict=entity_recall, +# entity_precision_dict=entity_precision, +# n_dict=n, +# pii_f=pii_f_beta, +# 
n=sum(n.values()), +# ) + +# return evaluation_result + +# @staticmethod +# def precision(tp: int, fp: int) -> float: +# return tp / (tp + fp + 1e-100) + +# @staticmethod +# def recall(tp: int, fn: int) -> float: +# return tp / (tp + fn + 1e-100) + +# @staticmethod +# def f_beta(precision: float, recall: float, beta: float) -> float: +# """ +# Returns the F score for precision, recall and a beta parameter +# :param precision: a float with the precision value +# :param recall: a float with the recall value +# :param beta: a float with the beta parameter of the F measure, +# which gives more or less weight to precision +# vs. recall +# :return: a float value of the f(beta) measure. +# """ +# if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0): +# return np.nan + +# return ((1 + beta ** 2) * precision * recall) / ( +# ((beta ** 2) * precision) + recall +# ) + class Plotter: """ Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives) diff --git a/presidio_evaluator/evaluation/evaluator_objects.py b/presidio_evaluator/evaluation/evaluator_objects.py new file mode 100644 index 0000000..0bf2bb3 --- /dev/null +++ b/presidio_evaluator/evaluation/evaluator_objects.py @@ -0,0 +1,115 @@ +from typing import Optional, List +from spacy.tokens import Token + +from presidio_evaluator import Span, InputSample + + +class TokenOutput: + def __init__( + self, + error_type: str, + annotated_tag: str, + predicted_tag: str, + token: Token, + ): + """ + Holds information about a token error a model made for analysis purposes + :param error_type: str, e.g. FP, FN, Person->Address etc. + :param annotated_tag: str, actual label, e.g. Person + :param predicted_tag: str, predicted label, e.g. Address + :param token: str, token in question + """ + + self.error_type = error_type + self.annotated_tag = annotated_tag + self.predicted_tag = predicted_tag + self.token = token + + def __str__(self): + return ( + "type: {}, " + "Annotated tag = {}, " + "Predicted tag = {}, " + "Token = {}".format( + self.error_type, + self.annotated_tag, + self.predicted_tag, + self.token + ) + ) + + def __repr__(self): + return f" List["TokenOutput"]: + """ + Print the n most common tokens by error type + :param errors: List of token error in TokenOutput format. + :param error_type: str, token error type, e.g. FP, FN + :param n: int, top n most common error to filter. Default is None = all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. Default is None = all entities + """ + return List["TokenOutput"] + + +class SpanOutput: + def __init__( + self, + output_type: str, + overlap_score: float, + annotated_span: Optional[Span] = None, + predicted_span: Optional[Span] = None + ): + """ + Holds information about span prediction output for analysis purposes + :param error_type: str, e.g. strict, exact, partial, incorrect, miss, spurious. + :param overlap_score: float, overlapping ratio between annotated_span and predicted_span + :param annotated_span: str, actual span which comes from the annotated file, e.g. 
Address + :param predicted_span: str, predicted span of a given model + """ + self.output_type = output_type + self.overlap_score = overlap_score + self.annotated_span = annotated_span + self.predicted_span = predicted_span + + def __repr__(self): + return ( + f"Output type: {self.output_type}\n" + f"Overlap score: {self.overlap_score}\n" + f"Annotated span: {self.annotated_span}\n" + f"Predicted span: {self.predicted_span}\n" + ) + + @staticmethod + def get_span_output_by_type(outputs=List["SpanOutput"], + error_type=str, + n: Optional[int]=None, + entity=None) -> List["SpanOutput"]: + """ + Print the n most common tokens by error type + :param outputs: List of span errors in SpanOutput format. + :param error_type: str, span error type, e.g. strict, exact, partial, incorrect, miss, spurious + :param n: int, top n most common output to filter. Default is None = all token errors of error_type are returned. + :param entity: str, List of entities to filter, e.g. Person, Address. Default is None = all entities. + """ + return List["SpanOutput"] + + +class ModelPrediction: + def __init__( + self, + input_sample: InputSample, + predicted_tags: Optional[List[str]], + predicted_spans: Optional[List[Span]] + ): + """ + Holds information about model prediction in both span and token level + :params + """ + self.input_sample = input_sample + self.predicted_tags = predicted_tags + self.predicted_spans = predicted_spans \ No newline at end of file diff --git a/presidio_evaluator/evaluation/span_output.py b/presidio_evaluator/evaluation/span_output.py deleted file mode 100644 index 3e7d4f0..0000000 --- a/presidio_evaluator/evaluation/span_output.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import Optional, List - -from presidio_evaluator import Span - - -class SpanOutput: - def __init__( - self, - output_type: str, - overlap_score: float, - annotated_span: Optional[Span] = None, - predicted_span: Optional[Span] = None - ): - """ - Holds information about span prediction output for analysis purposes - :param error_type: str, e.g. strict, exact, partial, incorrect, miss, spurious. - :param overlap_score: float, overlapping ratio between annotated_span and predicted_span - :param annotated_span: str, actual span which comes from the annotated file, e.g. Address - :param predicted_span: str, predicted span of a given model - """ - self.output_type = output_type - self.overlap_score = overlap_score - self.annotated_span = annotated_span - self.predicted_span = predicted_span - - def __repr__(self): - return ( - f"Output type: {self.output_type}\n" - f"Overlap score: {self.overlap_score}\n" - f"Annotated span: {self.annotated_span}\n" - f"Predicted span: {self.predicted_span}\n" - ) - - @staticmethod - def get_span_output_by_type(outputs=List["SpanOutput"], - error_type=str, - n: Optional[int]=None, - entity=None) -> List["SpanOutput"]: - """ - Print the n most common tokens by error type - :param outputs: List of span errors in SpanOutput format. - :param error_type: str, span error type, e.g. strict, exact, partial, incorrect, miss, spurious - :param n: int, top n most common output to filter. If n is None, all token errors of error_type are returned. - :param entity: str, List of entities to filter, e.g. Person, Address. If entity is None, all entities are returned. 
- """ - return List["SpanOutput"] - \ No newline at end of file diff --git a/presidio_evaluator/evaluation/token_output.py b/presidio_evaluator/evaluation/token_output.py deleted file mode 100644 index e9a1152..0000000 --- a/presidio_evaluator/evaluation/token_output.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Optional, List -from spacy.tokens import Token - - -class TokenOutput: - def __init__( - self, - error_type: str, - annotated_tag: str, - predicted_tag: str, - token: Token, - ): - """ - Holds information about a token error a model made for analysis purposes - :param error_type: str, e.g. FP, FN, Person->Address etc. - :param annotated_tag: str, actual label, e.g. Person - :param predicted_tag: str, predicted label, e.g. Address - :param token: str, token in question - """ - - self.error_type = error_type - self.annotated_tag = annotated_tag - self.predicted_tag = predicted_tag - self.token = token - - def __str__(self): - return ( - "type: {}, " - "Annotated tag = {}, " - "Predicted tag = {}, " - "Token = {}".format( - self.error_type, - self.annotated_tag, - self.predicted_tag, - self.token - ) - ) - - def __repr__(self): - return f" List["TokenOutput"]: - """ - Print the n most common tokens by error type - :param errors: List of token error in TokenOutput format. - :param error_type: str, token error type, e.g. FP, FN - :param n: int, top n most common error to filter. If n is None, all token errors of error_type are returned. - :param entity: str, List of entities to filter, e.g. Person, Address. If entity is None, all entities are returned. - """ - return List["TokenOutput"] diff --git a/presidio_evaluator/evaluation_helpers.py b/presidio_evaluator/evaluation_helpers.py new file mode 100644 index 0000000..e69de29 From bcd142a97b012257ce92b696133fe5f24294c185 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:37:29 +0100 Subject: [PATCH 05/16] Initialize some utils function in helpers --- presidio_evaluator/evaluation_helpers.py | 134 +++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/presidio_evaluator/evaluation_helpers.py b/presidio_evaluator/evaluation_helpers.py index e69de29..286cc24 100644 --- a/presidio_evaluator/evaluation_helpers.py +++ b/presidio_evaluator/evaluation_helpers.py @@ -0,0 +1,134 @@ +import numpy as np +from typing import List, Dict +from collections import Counter + +from presidio_evaluator import Span +from presidio_evaluator.evaluation import SpanOutput + + +def get_matched_gold(predicted_span: Span, + annotated_span: List[Span], + overlap_threshold) -> SpanOutput: + """ + Given a predicted_span, get the best matchest annotated_span based on the overlap_threshold. 
+ Return a SpanOutput + :param sample: InputSample + :param pred_span: Span, Predicted span + :param gold_span: List[Span]: List of gold spans from the annotation input + """ + return SpanOutput(output_type="", + predicted_span=None, + annotated_span=None, + overlap_score=0 + ) + +def span_compute_actual_possible(results: dict) -> dict: + """ + Take the result dict and calculate the actual and possible spans + """ + strict = results["strict"] + exact = results["exact"] + incorrect = results["incorrect"] + partial = results["partial"] + missed = results["miss"] + spurious = results["spurious"] + # Possible: Number of annotations in the gold-standard which contribute to the final score + possible = strict + exact + incorrect + partial + missed + # Actual: Number of annotations produced by the PII detection system + actual = strict + exact + incorrect + partial + spurious + + results["actual"] = actual + results["possible"] = possible + + return results + +def span_compute_precision_recall(results: dict) -> dict: + """ + Take the result dict to calculate the strict and flexible precision/ recall + """ + metrics = {} + strict = results["strict"] + exact = results["exact"] + partial = results["partial"] + actual = results["actual"] + possible = results["possible"] + + # Calculate the strict performance + strict_precision = strict / actual if actual > 0 else 0 + strict_recall = strict / possible if possible > 0 else 0 + + # Calculate the flexible performance + flexible_precision = (strict + exact)/ actual if actual > 0 else 0 + flexible_recall = (strict + exact) / possible if possible > 0 else 0 + + # Calculate the partial performance + partial_precision = (strict + exact + 0.5 * partial) / actual if actual > 0 else 0 + partial_recall = (strict + exact + 0.5 * partial) / possible if possible > 0 else 0 + + + metrics["strict precision"] = strict_precision + metrics["strict recall"] = strict_recall + metrics["flexible precision"] = flexible_precision + metrics["flexible recall"] = flexible_recall + metrics["partial precision"] = partial_precision + metrics["partial recall"] = partial_recall + return metrics + +# TODO: Implement this function +def dict_merge(dict_1: dict, dict2: dict) -> dict: + """ + Examples: Sum up the value of two dictionaries by keys + >>> dict_1 = {'PII': { + 'correct': 2, + 'partial': 1 + }, + 'PERSON': { + 'correct': 2, + 'partial': 0, + } + } + >>> dict_2 = {'PII': { + 'correct': 3, + 'partial': 0 + }, + 'PERSON': { + 'correct': 1, + 'partial': 1, + } + } + >>> dict_merge(dict1, dict2) + {'PII': { + 'correct': 5, + 'partial': 1 + }, + 'PERSON': { + 'correct': 3, + 'partial': 1, + } + } + """ + results = {} + return results + +# TODO: Implement this function +def token_calulate_score(token_confusion_matrix: Counter) -> Dict: + """ + Calculate the token model metrics from token confusion matrix + Examples: Sum up the value of two dictionaries by keys + >>> token_confusion_matrix = Counter({('O', 'O'): X, ('O', 'DateTime'): X, ('DateTime', 'O'): X, ('DateTime', 'DateTime'): X}) + >>> token_calulate_score(token_confusion_matrix) + {'PII': { + 'recall': xxx, + 'precision': xxx, + 'F measure': xxx + }, + 'PERSON': { + 'recall': xxx, + 'precision': xxx, + } + } + """ + token_model_metrics = {} + return token_model_metrics + + \ No newline at end of file From d74aba1adc0159ce07f3ab274399964d4fac6831 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 16 Jan 2023 16:51:20 +0100 Subject: [PATCH 06/16] Initialize EvaluationResult 
class --- .../evaluation/evaluation_result.py | 219 ++++++++++-------- 1 file changed, 122 insertions(+), 97 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluation_result.py b/presidio_evaluator/evaluation/evaluation_result.py index e3f88c5..8756efb 100644 --- a/presidio_evaluator/evaluation/evaluation_result.py +++ b/presidio_evaluator/evaluation/evaluation_result.py @@ -2,110 +2,135 @@ from collections import Counter from typing import List, Optional, Dict, Tuple -from presidio_evaluator.evaluation import ModelError +from presidio_evaluator.evaluation import SampleError class EvaluationResult: def __init__( self, - results: Counter, - model_errors: Optional[List[ModelError]] = None, - text: Optional[str] = None, - pii_recall: Optional[float] = None, - pii_precision: Optional[float] = None, - pii_f: Optional[float] = None, - n: Optional[int] = None, - entity_recall_dict: Optional[Dict[str, float]] = None, - entity_precision_dict: Optional[Dict[str, float]] = None, - n_dict: Optional[Dict[str, int]] = None, + sample_errors: List[SampleError], + token_confusion_matrix: Counter, + token_model_metrics: Dict[str, Dict[str, float]], + span_model_metrics: Dict[str, Dict[str, float]] + ): """ - Holds the output of a comparison between ground truth and predicted - :param results: List of objects of type Counter + Holds the output of token and span evaluation for a given dataset + :param model_errors: List of token and span errors for further inspection + :param token_confusion_matrix: List of objects of type Counter with structure {(actual, predicted) : count} - :param model_errors: List of specific model errors for further inspection - :param text: sample's full text (if used for one sample) - :param pii_recall: Recall for all entities (PII or not) - :param pii_precision: Precision for all entities (PII or not) - :param pii_f: F measure for all entities (PII or not) - :param n: Number of total entity tokens - :param entity_recall_dict: Recall per entity - :param entity_precision_dict: Precision per entity - :param n_dict: Number of tokens per entity + :param token_model_metrics: metrics calculated based on token results + :param span_model_metrics: metrics calculated based on span results """ - self.results = results - self.model_errors = model_errors - self.text = text - - self.pii_recall = pii_recall - self.pii_precision = pii_precision - self.pii_f = pii_f - self.n = n - self.entity_recall_dict = entity_recall_dict - self.entity_precision_dict = entity_precision_dict - self.n_dict = n_dict - - def __str__(self): - return_str = "" - if not self.entity_precision_dict or not self.entity_recall_dict: - return json.dumps(self.results) - - entities = self.n_dict.keys() - - row_format = "{:>20}{:>20.2%}{:>20.2%}{:>20}" - header_format = "{:>20}" * 4 - return_str += str( - header_format.format( - *("Entity", "Precision", "Recall", "Number of samples") - ) - ) - for entity in entities: - return_str += "\n" + row_format.format( - entity, - self.entity_precision_dict[entity], - self.entity_recall_dict[entity], - self.n_dict[entity], - ) - - # add PII values - return_str += "\n" + row_format.format( - "PII", - self.pii_precision, - self.pii_recall, - self.n, - ) - - return_str += f"\nPII F measure: {self.pii_f:.2%}" - return return_str - - def __repr__(self): - return f"stats={self.results}" - - def to_log(self): - metrics_dict = { - "pii_f": self.pii_f, - } - if self.entity_precision_dict: - metrics_dict.update( - { - f"{ent}_precision": v - for (ent, v) in self.entity_precision_dict.items() 
- } - ) - if self.entity_recall_dict: - metrics_dict.update( - {f"{ent}_recall": v for (ent, v) in self.entity_recall_dict.items()} - ) - if self.n: - metrics_dict.update(self.n_dict) - return metrics_dict - - def to_confusion_matrix(self) -> Tuple[List[str], List[List[int]]]: - entities = sorted(list(set(self.n_dict.keys()).union("O"))) - confusion_matrix = [[0] * len(entities) for _ in range(len(entities))] - for i, actual in enumerate(entities): - for j, predicted in enumerate(entities): - confusion_matrix[i][j] = self.results[(actual, predicted)] - - return entities, confusion_matrix + self.sample_errors = sample_errors + self.token_confusion_matrix = token_confusion_matrix + self.token_model_metrics = token_model_metrics + self.span_model_metrics = span_model_metrics + + +# TODO: Review and refactor the method in old EvaluationResult to new one +# class EvaluationResult: +# def __init__( +# self, +# results: Counter, +# model_errors: Optional[List[ModelError]] = None, +# text: Optional[str] = None, +# pii_recall: Optional[float] = None, +# pii_precision: Optional[float] = None, +# pii_f: Optional[float] = None, +# n: Optional[int] = None, +# entity_recall_dict: Optional[Dict[str, float]] = None, +# entity_precision_dict: Optional[Dict[str, float]] = None, +# n_dict: Optional[Dict[str, int]] = None, +# ): +# """ +# Holds the output of a comparison between ground truth and predicted +# :param results: List of objects of type Counter +# with structure {(actual, predicted) : count} +# :param model_errors: List of specific model errors for further inspection +# :param text: sample's full text (if used for one sample) +# :param pii_recall: Recall for all entities (PII or not) +# :param pii_precision: Precision for all entities (PII or not) +# :param pii_f: F measure for all entities (PII or not) +# :param n: Number of total entity tokens +# :param entity_recall_dict: Recall per entity +# :param entity_precision_dict: Precision per entity +# :param n_dict: Number of tokens per entity +# """ + +# self.results = results +# self.model_errors = model_errors +# self.text = text + +# self.pii_recall = pii_recall +# self.pii_precision = pii_precision +# self.pii_f = pii_f +# self.n = n +# self.entity_recall_dict = entity_recall_dict +# self.entity_precision_dict = entity_precision_dict +# self.n_dict = n_dict + +# def __str__(self): +# return_str = "" +# if not self.entity_precision_dict or not self.entity_recall_dict: +# return json.dumps(self.results) + +# entities = self.n_dict.keys() + +# row_format = "{:>20}{:>20.2%}{:>20.2%}{:>20}" +# header_format = "{:>20}" * 4 +# return_str += str( +# header_format.format( +# *("Entity", "Precision", "Recall", "Number of samples") +# ) +# ) +# for entity in entities: +# return_str += "\n" + row_format.format( +# entity, +# self.entity_precision_dict[entity], +# self.entity_recall_dict[entity], +# self.n_dict[entity], +# ) + +# # add PII values +# return_str += "\n" + row_format.format( +# "PII", +# self.pii_precision, +# self.pii_recall, +# self.n, +# ) + +# return_str += f"\nPII F measure: {self.pii_f:.2%}" +# return return_str + +# def __repr__(self): +# return f"stats={self.results}" + +# def to_log(self): +# metrics_dict = { +# "pii_f": self.pii_f, +# } +# if self.entity_precision_dict: +# metrics_dict.update( +# { +# f"{ent}_precision": v +# for (ent, v) in self.entity_precision_dict.items() +# } +# ) +# if self.entity_recall_dict: +# metrics_dict.update( +# {f"{ent}_recall": v for (ent, v) in self.entity_recall_dict.items()} +# ) +# if self.n: +# 
metrics_dict.update(self.n_dict) +# return metrics_dict + +# def to_confusion_matrix(self) -> Tuple[List[str], List[List[int]]]: +# entities = sorted(list(set(self.n_dict.keys()).union("O"))) +# confusion_matrix = [[0] * len(entities) for _ in range(len(entities))] +# for i, actual in enumerate(entities): +# for j, predicted in enumerate(entities): +# confusion_matrix[i][j] = self.results[(actual, predicted)] + +# return entities, confusion_matrix From c64d2c226f1b390347c24d1e2ef4a33db2d1edd2 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Fri, 27 Jan 2023 16:25:50 +0100 Subject: [PATCH 07/16] Implement compare_span function --- presidio_evaluator/evaluation/evaluator.py | 194 ++++++++++++++++++++- 1 file changed, 191 insertions(+), 3 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 2ed020b..92c4acd 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -1,6 +1,8 @@ from collections import Counter from typing import List, Optional, Dict, Tuple from pathlib import Path +from copy import deepcopy +from difflib import SequenceMatcher import numpy as np from tqdm import tqdm @@ -14,6 +16,7 @@ ModelPrediction, EvaluationResult, SampleError) +import evaluation_helpers class Evaluator: @@ -21,7 +24,7 @@ def __init__( self, verbose: bool = False, compare_by_io=True, - entities_to_keep: Optional[List[str]] = None, + entities_to_keep=True, span_overlap_threshold: float = 0.5 ): """ @@ -37,6 +40,25 @@ def __init__( self.entities_to_keep = entities_to_keep self.span_overlap_threshold = span_overlap_threshold + # setup a dict for storing the span metrics + self.span_model_metrics = { + 'correct': 0, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'possible': 0, + 'actual': 0, + 'precision': 0, + 'recall': 0, + } + # Copy results dict to cover the four evaluation schemes. 
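+        # The four schemes follow the SemEval-2013 style span evaluation:
+        #   strict   - span boundaries and entity type must both match
+        #   exact    - span boundaries must match, entity type is ignored
+        #   ent_type - some boundary overlap and the same entity type
+        #   partial  - partial boundary overlap, regardless of entity type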
+ self.span_results = { + 'strict': deepcopy(self.span_model_metrics), + 'ent_type': deepcopy(self.span_model_metrics), + 'partial':deepcopy(self.span_model_metrics), + 'exact':deepcopy(self.span_model_metrics), + } def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOutput], Counter]: """ @@ -55,8 +77,174 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp List[SpanOutput]: a list of SpanOutput dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} """ - - return List[SpanOutput], dict[dict] + # get annotated and predicted span from ModelPrediction + annotated_spans = model_prediction.input_sample.spans + predicted_spans = model_prediction.predicted_spans + + eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0} + evaluation = { + 'strict': deepcopy(eval_metrics), + 'ent_type': deepcopy(eval_metrics), + 'partial': deepcopy(eval_metrics), + 'exact': deepcopy(eval_metrics) + } + # results by entity type + evaluation_agg_entities_type = {e: deepcopy(evaluation) for e in self.entities_to_keep} + + # keep track of entities that overlapped + true_which_overlapped_with_pred = [] + # keep track for the explainibility + span_outputs = [] + + # go through each predicted + for pred in predicted_spans: + found_overlap = False + # Scenario I: Exact match between true and pred + if pred in annotated_spans: + true_which_overlapped_with_pred.append(pred) + span_outputs.append(SpanOutput( + output_type = "STRICT", + gold_span = true, + annotated_span = pred, + overlap_score = 1 + )) + evaluation['strict']['correct'] += 1 + evaluation['ent_type']['correct'] += 1 + evaluation['exact']['correct'] += 1 + evaluation['partial']['correct'] += 1 + + # for the agg. 
by entity_type results + evaluation_agg_entities_type[pred.entity_type]['strict']['correct'] += 1 + evaluation_agg_entities_type[pred.entity_type]['ent_type']['correct'] += 1 + evaluation_agg_entities_type[pred.entity_type]['exact']['correct'] += 1 + evaluation_agg_entities_type[pred.entity_type]['partial']['correct'] += 1 + else: + # check for overlaps with eny of true entities + for true in annotated_spans: + pred_range = range(pred.start_position, pred.end_position) + true_range = range(true.start_position, true.end_position) + # Scenario IV: Offsets match, but entity type is wrong + if true.start_position == pred.start_position and true.end_position == pred.end_position \ + and true.entity_type != pred.entity_type: + span_outputs.append(SpanOutput( + output_type = "EXACT", + gold_span = true, + annotated_span = pred, + overlap_score = 1 + )) + # overall results + evaluation['strict']['incorrect'] += 1 + evaluation['ent_type']['incorrect'] += 1 + evaluation['partial']['correct'] += 1 + evaluation['exact']['correct'] += 1 + + # aggregated by entity type results + evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['correct'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['correct'] += 1 + + true_which_overlapped_with_pred.append(true) + found_overlap = True + break + # Check overlapping between true and pred + elif evaluation_helpers.find_overlap(true_range, pred_range): + overlap_ratio = SequenceMatcher(None, + pred.entity_value, + true.entity_value).ratio() + true_which_overlapped_with_pred.append(true) + # Scenario V: There is an overlap (but offsets do not match exactly), + # and the entity type is the same + if pred.entity_type == true.entity_type: + span_outputs.append(SpanOutput( + output_type = "ENT_TYPE", + gold_span = true, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # overall results + evaluation['strict']['incorrect'] += 1 + evaluation['ent_type']['correct'] += 1 + evaluation['partial']['partial'] += 1 + evaluation['exact']['incorrect'] += 1 + # aggregated by entity type results + evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['correct'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + found_overlap = True + break + # Offset overlap but entity type is different + else: + span_outputs.append(SpanOutput( + output_type = "PARTIAL", + gold_span = true, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # overall results + evaluation['strict']['incorrect'] += 1 + evaluation['ent_type']['incorrect'] += 1 + evaluation['partial']['partial'] += 1 + evaluation['exact']['incorrect'] += 1 + + # aggregated by entity type results + # Results against the true entity + + evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + found_overlap = True + break + if not found_overlap: + span_outputs.append(SpanOutput( + output_type = "SPURIOUS", + gold_span = None, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # Overal result + evaluation['strict']['spurious'] += 1 + 
evaluation['ent_type']['spurious'] += 1 + evaluation['partial']['spurious'] += 1 + evaluation['exact']['spurious'] += 1 + ## NOTE: when pred is not found in tags + # or when it simply does not appear in the test set, then it is + # spurious, but it is not clear where to assign it at the tag + # level. In this case, it is applied to all target_tags + # found in this example. This will mean that the sum of the + # evaluation_agg_entities will not equal evaluation. + for true in self.entities_to_keep: + evaluation_agg_entities_type[true]['strict']['spurious'] += 1 + evaluation_agg_entities_type[true]['ent_type']['spurious'] += 1 + evaluation_agg_entities_type[true]['partial']['spurious'] += 1 + evaluation_agg_entities_type[true]['exact']['spurious'] += 1 + + # Scenario III: Entity was misses entirely. + for true in annotated_spans: + if true in true_which_overlapped_with_pred: + continue + else: + span_outputs.append(SpanOutput( + output_type = "MISSED", + gold_span = true, + annotated_span = pred, + overlap_score = overlap_ratio + )) + # overall results + evaluation['strict']['missed'] += 1 + evaluation['ent_type']['missed'] += 1 + evaluation['partial']['missed'] += 1 + evaluation['exact']['missed'] += 1 + + # for the agg. by e_type + evaluation_agg_entities_type[true.e_type]['strict']['missed'] += 1 + evaluation_agg_entities_type[true.e_type]['ent_type']['missed'] += 1 + evaluation_agg_entities_type[true.e_type]['partial']['missed'] += 1 + evaluation_agg_entities_type[true.e_type]['exact']['missed'] += 1 + + return span_outputs, evaluation, evaluation_agg_entities_type def evaluate_all(self, model_predictions: List[ModelPrediction]) -> EvaluationResult: """ From b0b7dcb6c2b52a9b18a27ed6380881163817d7ec Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Sun, 29 Jan 2023 07:28:09 +0100 Subject: [PATCH 08/16] Add __eq__ method for SpanOutput class --- presidio_evaluator/evaluation/evaluator_objects.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/presidio_evaluator/evaluation/evaluator_objects.py b/presidio_evaluator/evaluation/evaluator_objects.py index 0bf2bb3..34d2243 100644 --- a/presidio_evaluator/evaluation/evaluator_objects.py +++ b/presidio_evaluator/evaluation/evaluator_objects.py @@ -83,6 +83,14 @@ def __repr__(self): f"Annotated span: {self.annotated_span}\n" f"Predicted span: {self.predicted_span}\n" ) + + def __eq__(self, other): + return ( + self.output_type == other.output_type + and self.overlap_score == other.overlap_score + and self.annotated_span == other.annotated_span + and self.predicted_span == other.predicted_span + ) @staticmethod def get_span_output_by_type(outputs=List["SpanOutput"], From 9bf20e5d5648d2cd849df22baef4aa91434e234d Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Sun, 29 Jan 2023 07:29:28 +0100 Subject: [PATCH 09/16] Implement __eq__ for TokenOutput class --- presidio_evaluator/evaluation/evaluator_objects.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/presidio_evaluator/evaluation/evaluator_objects.py b/presidio_evaluator/evaluation/evaluator_objects.py index 34d2243..c79f111 100644 --- a/presidio_evaluator/evaluation/evaluator_objects.py +++ b/presidio_evaluator/evaluation/evaluator_objects.py @@ -40,6 +40,14 @@ def __str__(self): def __repr__(self): return f" Date: Mon, 30 Jan 2023 11:06:29 +0100 Subject: [PATCH 10/16] Add unittest + fix bugs --- presidio_evaluator/evaluation/evaluator.py | 100 ++- tests/test_evaluator.py | 981 
++++++++++++++------- 2 files changed, 715 insertions(+), 366 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 92c4acd..bb105fd 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -10,13 +10,13 @@ import plotly.express as px import pandas as pd -from presidio_evaluator import InputSample +from presidio_evaluator import InputSample, Span from presidio_evaluator.evaluation import (TokenOutput, SpanOutput, ModelPrediction, EvaluationResult, SampleError) -import evaluation_helpers +from presidio_evaluator import evaluation_helpers class Evaluator: @@ -69,17 +69,19 @@ def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOu return List[TokenOutput], Counter - def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: + # def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: + def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) -> Tuple[List[SpanOutput], dict[dict]]: """ Compares ground truth tags (annotation) and predicted (prediction) at span level. - :param model_prediction: model_prediction containing an InputSample and a list of predicted tags and tokens + :param annotated_spans: model_prediction containing an InputSample and a list of predicted tags and tokens + :param predicted_spans: Returns: List[SpanOutput]: a list of SpanOutput dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} """ # get annotated and predicted span from ModelPrediction - annotated_spans = model_prediction.input_sample.spans - predicted_spans = model_prediction.predicted_spans + # annotated_spans = model_prediction.input_sample.spans + # predicted_spans = model_prediction.predicted_spans eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0} evaluation = { @@ -104,7 +106,7 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp true_which_overlapped_with_pred.append(pred) span_outputs.append(SpanOutput( output_type = "STRICT", - gold_span = true, + predicted_span = pred, annotated_span = pred, overlap_score = 1 )) @@ -128,8 +130,8 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp and true.entity_type != pred.entity_type: span_outputs.append(SpanOutput( output_type = "EXACT", - gold_span = true, - annotated_span = pred, + predicted_span = pred, + annotated_span = true, overlap_score = 1 )) # overall results @@ -139,27 +141,29 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp evaluation['exact']['correct'] += 1 # aggregated by entity type results - evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['correct'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['correct'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['correct'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['correct'] += 1 true_which_overlapped_with_pred.append(true) found_overlap = True break # Check overlapping between true and pred elif 
evaluation_helpers.find_overlap(true_range, pred_range): - overlap_ratio = SequenceMatcher(None, - pred.entity_value, - true.entity_value).ratio() + # overlap_ratio = SequenceMatcher(None, + # pred.entity_value, + # true.entity_value).ratio() + overlap_ratio = pred.intersect(true) + print(overlap_ratio) true_which_overlapped_with_pred.append(true) # Scenario V: There is an overlap (but offsets do not match exactly), # and the entity type is the same if pred.entity_type == true.entity_type: span_outputs.append(SpanOutput( output_type = "ENT_TYPE", - gold_span = true, - annotated_span = pred, + predicted_span = pred, + annotated_span = true, overlap_score = overlap_ratio )) # overall results @@ -168,18 +172,18 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp evaluation['partial']['partial'] += 1 evaluation['exact']['incorrect'] += 1 # aggregated by entity type results - evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['correct'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['correct'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['incorrect'] += 1 found_overlap = True break # Offset overlap but entity type is different else: span_outputs.append(SpanOutput( output_type = "PARTIAL", - gold_span = true, - annotated_span = pred, + predicted_span = pred, + annotated_span = true, overlap_score = overlap_ratio )) # overall results @@ -191,18 +195,18 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp # aggregated by entity type results # Results against the true entity - evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['partial'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['incorrect'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['incorrect'] += 1 found_overlap = True break if not found_overlap: span_outputs.append(SpanOutput( output_type = "SPURIOUS", - gold_span = None, - annotated_span = pred, - overlap_score = overlap_ratio + predicted_span = pred, + annotated_span = None, + overlap_score = 0 )) # Overal result evaluation['strict']['spurious'] += 1 @@ -228,9 +232,9 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp else: span_outputs.append(SpanOutput( output_type = "MISSED", - gold_span = true, - annotated_span = pred, - overlap_score = overlap_ratio + predicted_span = None, + annotated_span = true, + overlap_score = 0 )) # overall results evaluation['strict']['missed'] += 1 @@ -239,10 +243,30 @@ def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutp evaluation['exact']['missed'] += 1 # for the agg. 
by e_type - evaluation_agg_entities_type[true.e_type]['strict']['missed'] += 1 - evaluation_agg_entities_type[true.e_type]['ent_type']['missed'] += 1 - evaluation_agg_entities_type[true.e_type]['partial']['missed'] += 1 - evaluation_agg_entities_type[true.e_type]['exact']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['strict']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['ent_type']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['partial']['missed'] += 1 + evaluation_agg_entities_type[true.entity_type]['exact']['missed'] += 1 + + # Compute 'possible', 'actual' according to SemEval-2013 Task 9.1 on the + # overall results, and use these to calculate precision and recall. + + for eval_type in evaluation: + evaluation[eval_type] = evaluation_helpers.span_compute_actual_possible(evaluation[eval_type]) + + # Compute 'possible', 'actual', and precision and recall on entity level + # results. Start by cycling through the accumulated results. + + for entity_type, entity_level in evaluation_agg_entities_type.items(): + + # Cycle through the evaluation types for each dict containing entity + # level results. + + for eval_type in entity_level: + + evaluation_agg_entities_type[entity_type][eval_type] = evaluation_helpers.span_compute_actual_possible( + entity_level[eval_type] + ) return span_outputs, evaluation, evaluation_agg_entities_type diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 8319e05..0305009 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -5,7 +5,9 @@ from presidio_evaluator import InputSample, Span -from presidio_evaluator.evaluation import EvaluationResult, Evaluator +from presidio_evaluator.evaluation import (Evaluator, + ModelPrediction, + SpanOutput) from tests.mocks import ( IdentityTokensMockModel, FiftyFiftyIdentityTokensMockModel, @@ -13,335 +15,658 @@ ) -def test_evaluator_simple(): - prediction = ["O", "O", "O", "U-ANIMAL"] - model = MockTokensModel(prediction=prediction, entities_to_keep=["ANIMAL"]) - - evaluator = Evaluator(model=model) - sample = InputSample( - full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - final_evaluation = evaluator.calculate_score([evaluated]) - - assert final_evaluation.pii_precision == 1 - assert final_evaluation.pii_recall == 1 - - -def test_evaluate_sample_wrong_entities_to_keep_correct_statistics(): - prediction = ["O", "O", "O", "U-ANIMAL"] - model = MockTokensModel(prediction=prediction) - - evaluator = Evaluator(model=model, entities_to_keep=["SPACESHIP"]) - - sample = InputSample( - full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - assert evaluated.results[("O", "O")] == 4 - - -def test_evaluate_same_entity_correct_statistics(): - prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluation_result = evaluator.evaluate_sample(sample, prediction) - assert evaluation_result.results[("O", "O")] == 
2 - assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 - assert evaluation_result.results[("O", "ANIMAL")] == 1 - - -def test_evaluate_multiple_entities_to_keep_correct_statistics(): - prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] - entities_to_keep = ["ANIMAL", "PLANT", "SPACESHIP"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=entities_to_keep) - - sample = InputSample( - full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None - ) - sample.tokens = ["I", "am", "the", "walrus"] - sample.tags = ["O", "O", "O", "U-ANIMAL"] - - evaluation_result = evaluator.evaluate_sample(sample, prediction) - assert evaluation_result.results[("O", "O")] == 2 - assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 - assert evaluation_result.results[("O", "ANIMAL")] == 1 - - -def test_evaluate_multiple_tokens_correct_statistics(): - prediction = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - "I am the walrus amaericanus magnifico", masked=None, spans=None - ) - sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] - sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - evaluation = evaluator.calculate_score([evaluated]) - - assert evaluation.pii_precision == 1 - assert evaluation.pii_recall == 1 - - -def test_evaluate_multiple_tokens_partial_match_correct_statistics(): - prediction = ["O", "O", "O", "B-ANIMAL", "L-ANIMAL", "O"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - "I am the walrus amaericanus magnifico", masked=None, spans=None - ) - sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] - sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - evaluation = evaluator.calculate_score([evaluated]) - - assert evaluation.pii_precision == 1 - assert evaluation.pii_recall == 4 / 6 - - -def test_evaluate_multiple_tokens_no_match_match_correct_statistics(): - prediction = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) - sample = InputSample( - "I am the walrus amaericanus magnifico", masked=None, spans=None - ) - sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] - sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] - - evaluated = evaluator.evaluate_sample(sample, prediction) - evaluation = evaluator.calculate_score([evaluated]) - - assert np.isnan(evaluation.pii_precision) - assert evaluation.pii_recall == 0 - - -def test_evaluate_multiple_examples_correct_statistics(): - prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"] - model = MockTokensModel(prediction=prediction) - evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) - input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) - input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] - input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] - - evaluated = evaluator.evaluate_all( - [input_sample, input_sample, input_sample, input_sample] - ) - scores = evaluator.calculate_score(evaluated) - assert scores.pii_precision == 0.5 
- assert scores.pii_recall == 0.5 - - -def test_evaluate_multiple_examples_ignore_entity_correct_statistics(): - prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"] - model = MockTokensModel(prediction=prediction) - - evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "TENNIS_PLAYER"]) - input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) - input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] - input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] - - evaluated = evaluator.evaluate_all( - [input_sample, input_sample, input_sample, input_sample] - ) - scores = evaluator.calculate_score(evaluated) - assert scores.pii_precision == 1 - assert scores.pii_recall == 1 - - -def test_confusion_matrix_correct_metrics(): - from collections import Counter - - evaluated = [ - EvaluationResult( - results=Counter( - { - ("O", "O"): 150, - ("O", "PERSON"): 30, - ("O", "COMPANY"): 30, - ("PERSON", "PERSON"): 40, - ("COMPANY", "COMPANY"): 40, - ("PERSON", "COMPANY"): 10, - ("COMPANY", "PERSON"): 10, - ("PERSON", "O"): 30, - ("COMPANY", "O"): 30, - } - ), - model_errors=None, - text=None, - ) - ] - - model = MockTokensModel(prediction=None) - evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "COMPANY"]) - scores = evaluator.calculate_score(evaluated, beta=2.5) - - assert scores.pii_precision == 0.625 - assert scores.pii_recall == 0.625 - assert scores.entity_recall_dict["PERSON"] == 0.5 - assert scores.entity_precision_dict["PERSON"] == 0.5 - assert scores.entity_recall_dict["COMPANY"] == 0.5 - assert scores.entity_precision_dict["COMPANY"] == 0.5 - - -def test_confusion_matrix_2_correct_metrics(): - from collections import Counter - - evaluated = [ - EvaluationResult( - results=Counter( - { - ("O", "O"): 65467, - ("O", "ORG"): 4189, - ("GPE", "O"): 3370, - ("PERSON", "PERSON"): 2024, - ("GPE", "PERSON"): 1488, - ("GPE", "GPE"): 1033, - ("O", "GPE"): 964, - ("ORG", "ORG"): 914, - ("O", "PERSON"): 834, - ("GPE", "ORG"): 401, - ("PERSON", "ORG"): 35, - ("PERSON", "O"): 33, - ("ORG", "O"): 8, - ("PERSON", "GPE"): 5, - ("ORG", "PERSON"): 1, +def test_compare_span_simple_case_1(): + annotated_spans =[Span(entity_type = "PER", entity_value = "", start_position = 59, end_position=69), + Span(entity_type = "LOC", entity_value = "", start_position = 127, end_position=134), + Span(entity_type = "LOC", entity_value = "", start_position = 164, end_position=174), + Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + Span(entity_type = "MISC", entity_value = "", start_position = 230, end_position=240)] + predicted_spans = [Span(entity_type = "PER", entity_value = "", start_position = 24, end_position=30), + Span(entity_type = "LOC", entity_value = "", start_position = 124, end_position=134), + Span(entity_type = "PER", entity_value = "", start_position = 164, end_position=174), + Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + Span(entity_type = "LOC", entity_value = "", start_position = 225, end_position=243)] + + evaluator = Evaluator(entities_to_keep=['PER', 'LOC', 'MISC']) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = {'strict': {'correct': 2, + 'incorrect': 3, + 'partial': 0, + 
'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6}, + 'ent_type': {'correct': 3, + 'incorrect': 2, + 'partial': 0, + 'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6}, + 'partial': {'correct': 3, + 'incorrect': 0, + 'partial': 2, + 'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6}, + 'exact': {'correct': 3, + 'incorrect': 2, + 'partial': 0, + 'missed': 1, + 'spurious': 1, + 'possible': 6, + 'actual': 6} } - ), - model_errors=None, - text=None, - ) - ] - - model = MockTokensModel(prediction=None) - evaluator = Evaluator(model=model) - scores = evaluator.calculate_score(evaluated, beta=2.5) - - pii_tp = ( - evaluated[0].results[("PERSON", "PERSON")] - + evaluated[0].results[("ORG", "ORG")] - + evaluated[0].results[("GPE", "GPE")] - + evaluated[0].results[("ORG", "GPE")] - + evaluated[0].results[("ORG", "PERSON")] - + evaluated[0].results[("GPE", "ORG")] - + evaluated[0].results[("GPE", "PERSON")] - + evaluated[0].results[("PERSON", "GPE")] - + evaluated[0].results[("PERSON", "ORG")] - ) - - pii_fp = ( - evaluated[0].results[("O", "PERSON")] - + evaluated[0].results[("O", "GPE")] - + evaluated[0].results[("O", "ORG")] - ) - - pii_fn = ( - evaluated[0].results[("PERSON", "O")] - + evaluated[0].results[("GPE", "O")] - + evaluated[0].results[("ORG", "O")] - ) - - assert scores.pii_precision == pii_tp / (pii_tp + pii_fp) - assert scores.pii_recall == pii_tp / (pii_tp + pii_fn) - - -def test_dataset_to_metric_identity_model(): - import os - - dir_path = os.path.dirname(os.path.realpath(__file__)) - input_samples = InputSample.read_dataset_json( - "{}/data/generated_small.json".format(dir_path), length=10 - ) - - model = IdentityTokensMockModel() - evaluator = Evaluator(model=model) - evaluation_results = evaluator.evaluate_all(input_samples) - metrics = evaluator.calculate_score(evaluation_results) - - assert metrics.pii_precision == 1 - assert metrics.pii_recall == 1 - - -def test_dataset_to_metric_50_50_model(): - import os - - dir_path = os.path.dirname(os.path.realpath(__file__)) - input_samples = InputSample.read_dataset_json( - "{}/data/generated_small.json".format(dir_path), length=100 - ) - - # Replace 50% of the predictions with a list of "O" - model = FiftyFiftyIdentityTokensMockModel() - evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) - evaluation_results = evaluator.evaluate_all(input_samples) - metrics = evaluator.calculate_score(evaluation_results) - - print(metrics.pii_precision) - print(metrics.pii_recall) - print(metrics.pii_f) - - assert metrics.pii_precision == 1 - assert metrics.pii_recall < 0.75 - assert metrics.pii_recall > 0.25 - - -def test_align_entity_types_correct_output(): - - sample1 = InputSample( - "I live in ABC", - spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], - create_tags_from_span=False, - ) - sample2 = InputSample( - "I live in ABC", - spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("C", "c", 100, 101)], - create_tags_from_span=False, - ) - samples = [sample1, sample2] - mapping = { - "A": "1", - "B": "2", - "C": "1", + print(span_outputs) + print(expected_evaluation) + assert evaluation == expected_evaluation + +def test_compare_span_strict(): + annotated_spans =[Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, 
evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'exact': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } + } + expected_span_outputs = [SpanOutput( + output_type = "STRICT", + predicted_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 1)] + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + + +def test_compare_span_ent_type(): + annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans =[Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 0, + 'incorrect': 0, + 'partial': 1, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': + 1, + 'possible': 1 + }, + 'exact': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } } - new_samples = Evaluator.align_entity_types(samples, mapping) - - count_per_entity = Counter() - for sample in new_samples: - for span in sample.spans: - count_per_entity[span.entity_type] += 1 - - assert count_per_entity["1"] == 5 - assert count_per_entity["2"] == 1 - - -def test_align_entity_types_wrong_mapping_exception(): - - sample1 = InputSample( - "I live in ABC", - spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], - create_tags_from_span=False, - ) + expected_span_outputs = [SpanOutput( + output_type = "ENT_TYPE", + predicted_span = Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 0.72)] + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) 
for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + +def test_compare_span_exact(): + annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'exact': { + 'correct': 1, + 'incorrect': 0, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } + } - entities_mapping = {"Z": "z"} + expected_span_outputs = [SpanOutput( + output_type = "EXACT", + predicted_span = Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 1)] + print(span_outputs) + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + +def test_compare_span_partial(): + annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] + predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24)] + + evaluator = Evaluator(entities_to_keep=["ANIMAL"]) + span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + + expected_evaluation = { + 'strict': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'ent_type': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + }, + 'partial': { + 'correct': 0, + 'incorrect': 0, + 'partial': 1, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': + 1, + 'possible': 1 + }, + 'exact': { + 'correct': 0, + 'incorrect': 1, + 'partial': 0, + 'missed': 0, + 'spurious': 0, + 'precision': 0, + 'recall': 0, + 'actual': 1, + 'possible': 1 + } + } - with pytest.raises(ValueError): - Evaluator.align_entity_types( - input_samples=[sample1], entities_mapping=entities_mapping - ) + expected_span_outputs = [SpanOutput( + 
output_type = "PARTIAL", + predicted_span = Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24), + annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), + overlap_score = 0.72)] + print(span_outputs) + + assert len(span_outputs) == len(expected_span_outputs) + assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) + assert evaluation['strict'] == expected_evaluation['strict'] + assert evaluation['ent_type'] == expected_evaluation['ent_type'] + assert evaluation['partial'] == expected_evaluation['partial'] + assert evaluation['exact'] == expected_evaluation['exact'] + +# TODO: refactor those functions +# def test_evaluator_simple(): +# prediction = ["O", "O", "O", "U-ANIMAL"] +# model = MockTokensModel(prediction=prediction, entities_to_keep=["ANIMAL"]) + +# evaluator = Evaluator(model=model) +# sample = InputSample( +# full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# final_evaluation = evaluator.calculate_score([evaluated]) + +# assert final_evaluation.pii_precision == 1 +# assert final_evaluation.pii_recall == 1 + + +# def test_evaluate_sample_wrong_entities_to_keep_correct_statistics(): +# prediction = ["O", "O", "O", "U-ANIMAL"] +# model = MockTokensModel(prediction=prediction) + +# evaluator = Evaluator(model=model, entities_to_keep=["SPACESHIP"]) + +# sample = InputSample( +# full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# assert evaluated.results[("O", "O")] == 4 + + +# def test_evaluate_same_entity_correct_statistics(): +# prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluation_result = evaluator.evaluate_sample(sample, prediction) +# assert evaluation_result.results[("O", "O")] == 2 +# assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 +# assert evaluation_result.results[("O", "ANIMAL")] == 1 + + +# def test_evaluate_multiple_entities_to_keep_correct_statistics(): +# prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"] +# entities_to_keep = ["ANIMAL", "PLANT", "SPACESHIP"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=entities_to_keep) + +# sample = InputSample( +# full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus"] +# sample.tags = ["O", "O", "O", "U-ANIMAL"] + +# evaluation_result = evaluator.evaluate_sample(sample, prediction) +# assert evaluation_result.results[("O", "O")] == 2 +# assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1 +# assert evaluation_result.results[("O", "ANIMAL")] == 1 + + +# def test_evaluate_multiple_tokens_correct_statistics(): +# prediction = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, 
entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# "I am the walrus amaericanus magnifico", masked=None, spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] +# sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# evaluation = evaluator.calculate_score([evaluated]) + +# assert evaluation.pii_precision == 1 +# assert evaluation.pii_recall == 1 + + +# def test_evaluate_multiple_tokens_partial_match_correct_statistics(): +# prediction = ["O", "O", "O", "B-ANIMAL", "L-ANIMAL", "O"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# "I am the walrus amaericanus magnifico", masked=None, spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] +# sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# evaluation = evaluator.calculate_score([evaluated]) + +# assert evaluation.pii_precision == 1 +# assert evaluation.pii_recall == 4 / 6 + + +# def test_evaluate_multiple_tokens_no_match_match_correct_statistics(): +# prediction = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"]) +# sample = InputSample( +# "I am the walrus amaericanus magnifico", masked=None, spans=None +# ) +# sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"] +# sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"] + +# evaluated = evaluator.evaluate_sample(sample, prediction) +# evaluation = evaluator.calculate_score([evaluated]) + +# assert np.isnan(evaluation.pii_precision) +# assert evaluation.pii_recall == 0 + + +# def test_evaluate_multiple_examples_correct_statistics(): +# prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"] +# model = MockTokensModel(prediction=prediction) +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) +# input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) +# input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] +# input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] + +# evaluated = evaluator.evaluate_all( +# [input_sample, input_sample, input_sample, input_sample] +# ) +# scores = evaluator.calculate_score(evaluated) +# assert scores.pii_precision == 0.5 +# assert scores.pii_recall == 0.5 + + +# def test_evaluate_multiple_examples_ignore_entity_correct_statistics(): +# prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"] +# model = MockTokensModel(prediction=prediction) + +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "TENNIS_PLAYER"]) +# input_sample = InputSample("My name is Raphael or David", masked=None, spans=None) +# input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"] +# input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"] + +# evaluated = evaluator.evaluate_all( +# [input_sample, input_sample, input_sample, input_sample] +# ) +# scores = evaluator.calculate_score(evaluated) +# assert scores.pii_precision == 1 +# assert scores.pii_recall == 1 + + +# def test_confusion_matrix_correct_metrics(): +# from collections import Counter + +# evaluated = [ +# EvaluationResult( +# results=Counter( +# { +# ("O", "O"): 150, +# ("O", "PERSON"): 30, +# ("O", "COMPANY"): 30, +# ("PERSON", "PERSON"): 40, +# 
("COMPANY", "COMPANY"): 40, +# ("PERSON", "COMPANY"): 10, +# ("COMPANY", "PERSON"): 10, +# ("PERSON", "O"): 30, +# ("COMPANY", "O"): 30, +# } +# ), +# model_errors=None, +# text=None, +# ) +# ] + +# model = MockTokensModel(prediction=None) +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON", "COMPANY"]) +# scores = evaluator.calculate_score(evaluated, beta=2.5) + +# assert scores.pii_precision == 0.625 +# assert scores.pii_recall == 0.625 +# assert scores.entity_recall_dict["PERSON"] == 0.5 +# assert scores.entity_precision_dict["PERSON"] == 0.5 +# assert scores.entity_recall_dict["COMPANY"] == 0.5 +# assert scores.entity_precision_dict["COMPANY"] == 0.5 + + +# def test_confusion_matrix_2_correct_metrics(): +# from collections import Counter + +# evaluated = [ +# EvaluationResult( +# results=Counter( +# { +# ("O", "O"): 65467, +# ("O", "ORG"): 4189, +# ("GPE", "O"): 3370, +# ("PERSON", "PERSON"): 2024, +# ("GPE", "PERSON"): 1488, +# ("GPE", "GPE"): 1033, +# ("O", "GPE"): 964, +# ("ORG", "ORG"): 914, +# ("O", "PERSON"): 834, +# ("GPE", "ORG"): 401, +# ("PERSON", "ORG"): 35, +# ("PERSON", "O"): 33, +# ("ORG", "O"): 8, +# ("PERSON", "GPE"): 5, +# ("ORG", "PERSON"): 1, +# } +# ), +# model_errors=None, +# text=None, +# ) +# ] + +# model = MockTokensModel(prediction=None) +# evaluator = Evaluator(model=model) +# scores = evaluator.calculate_score(evaluated, beta=2.5) + +# pii_tp = ( +# evaluated[0].results[("PERSON", "PERSON")] +# + evaluated[0].results[("ORG", "ORG")] +# + evaluated[0].results[("GPE", "GPE")] +# + evaluated[0].results[("ORG", "GPE")] +# + evaluated[0].results[("ORG", "PERSON")] +# + evaluated[0].results[("GPE", "ORG")] +# + evaluated[0].results[("GPE", "PERSON")] +# + evaluated[0].results[("PERSON", "GPE")] +# + evaluated[0].results[("PERSON", "ORG")] +# ) + +# pii_fp = ( +# evaluated[0].results[("O", "PERSON")] +# + evaluated[0].results[("O", "GPE")] +# + evaluated[0].results[("O", "ORG")] +# ) + +# pii_fn = ( +# evaluated[0].results[("PERSON", "O")] +# + evaluated[0].results[("GPE", "O")] +# + evaluated[0].results[("ORG", "O")] +# ) + +# assert scores.pii_precision == pii_tp / (pii_tp + pii_fp) +# assert scores.pii_recall == pii_tp / (pii_tp + pii_fn) + + +# def test_dataset_to_metric_identity_model(): +# import os + +# dir_path = os.path.dirname(os.path.realpath(__file__)) +# input_samples = InputSample.read_dataset_json( +# "{}/data/generated_small.json".format(dir_path), length=10 +# ) + +# model = IdentityTokensMockModel() +# evaluator = Evaluator(model=model) +# evaluation_results = evaluator.evaluate_all(input_samples) +# metrics = evaluator.calculate_score(evaluation_results) + +# assert metrics.pii_precision == 1 +# assert metrics.pii_recall == 1 + + +# def test_dataset_to_metric_50_50_model(): +# import os + +# dir_path = os.path.dirname(os.path.realpath(__file__)) +# input_samples = InputSample.read_dataset_json( +# "{}/data/generated_small.json".format(dir_path), length=100 +# ) + +# # Replace 50% of the predictions with a list of "O" +# model = FiftyFiftyIdentityTokensMockModel() +# evaluator = Evaluator(model=model, entities_to_keep=["PERSON"]) +# evaluation_results = evaluator.evaluate_all(input_samples) +# metrics = evaluator.calculate_score(evaluation_results) + +# print(metrics.pii_precision) +# print(metrics.pii_recall) +# print(metrics.pii_f) + +# assert metrics.pii_precision == 1 +# assert metrics.pii_recall < 0.75 +# assert metrics.pii_recall > 0.25 + + +# def test_align_entity_types_correct_output(): + +# sample1 = InputSample( +# "I live in 
ABC", +# spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], +# create_tags_from_span=False, +# ) +# sample2 = InputSample( +# "I live in ABC", +# spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("C", "c", 100, 101)], +# create_tags_from_span=False, +# ) +# samples = [sample1, sample2] +# mapping = { +# "A": "1", +# "B": "2", +# "C": "1", +# } + +# new_samples = Evaluator.align_entity_types(samples, mapping) + +# count_per_entity = Counter() +# for sample in new_samples: +# for span in sample.spans: +# count_per_entity[span.entity_type] += 1 + +# assert count_per_entity["1"] == 5 +# assert count_per_entity["2"] == 1 + + +# def test_align_entity_types_wrong_mapping_exception(): + +# sample1 = InputSample( +# "I live in ABC", +# spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)], +# create_tags_from_span=False, +# ) + +# entities_mapping = {"Z": "z"} + +# with pytest.raises(ValueError): +# Evaluator.align_entity_types( +# input_samples=[sample1], entities_mapping=entities_mapping +# ) From 106aeeb6f1cdf05b1e968ce66b014e3e3a0fb7dd Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:23:00 +0100 Subject: [PATCH 11/16] Add simple case unittest for compare_span function --- tests/test_evaluator.py | 297 ++++------------------------------------ 1 file changed, 29 insertions(+), 268 deletions(-) diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 0305009..0aff5ee 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -32,6 +32,35 @@ def test_compare_span_simple_case_1(): evaluator = Evaluator(entities_to_keep=['PER', 'LOC', 'MISC']) span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) + expected_span_outputs = [SpanOutput(output_type="SPURIOUS", + annotated_span=None, + predicted_span=Span(entity_type = "PER", entity_value = "", start_position = 24, end_position=30), + overlap_score=0), + SpanOutput(output_type="ENT_TYPE", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 127, end_position=134), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 124, end_position=134), + overlap_score=0.82), + SpanOutput(output_type="EXACT", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 164, end_position=174), + predicted_span=Span(entity_type = "PER", entity_value = "", start_position = 164, end_position=174), + overlap_score=1), + SpanOutput(output_type="STRICT", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 197, end_position=205), + overlap_score=1), + SpanOutput(output_type="STRICT", + annotated_span=Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 208, end_position=219), + overlap_score=1), + SpanOutput(output_type="PARTIAL", + annotated_span=Span(entity_type = "MISC", entity_value = "", start_position = 230, end_position=240), + predicted_span=Span(entity_type = "LOC", entity_value = "", start_position = 225, end_position=243), + overlap_score=0.71), + SpanOutput(output_type="MISSED", + annotated_span=Span(entity_type = "PER", entity_value = "", start_position = 59, end_position=69), + predicted_span=None, + overlap_score=0)] + expected_evaluation = 
{'strict': {'correct': 2, 'incorrect': 3, 'partial': 0, @@ -61,138 +90,6 @@ def test_compare_span_simple_case_1(): 'possible': 6, 'actual': 6} } - print(span_outputs) - print(expected_evaluation) - assert evaluation == expected_evaluation - -def test_compare_span_strict(): - annotated_spans =[Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'exact': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - expected_span_outputs = [SpanOutput( - output_type = "STRICT", - predicted_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - overlap_score = 1)] - - assert len(span_outputs) == len(expected_span_outputs) - assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) - assert evaluation['strict'] == expected_evaluation['strict'] - assert evaluation['ent_type'] == expected_evaluation['ent_type'] - assert evaluation['partial'] == expected_evaluation['partial'] - assert evaluation['exact'] == expected_evaluation['exact'] - - -def test_compare_span_ent_type(): - annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans =[Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 0, - 'incorrect': 0, - 'partial': 1, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': - 1, - 'possible': 1 - }, - 'exact': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - - expected_span_outputs = [SpanOutput( - output_type = "ENT_TYPE", - predicted_span = Span(entity_type = "ANIMAL", entity_value = "retriever", start_position = 15, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, 
end_position=24), - overlap_score = 0.72)] - assert len(span_outputs) == len(expected_span_outputs) assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) assert evaluation['strict'] == expected_evaluation['strict'] @@ -200,142 +97,6 @@ def test_compare_span_ent_type(): assert evaluation['partial'] == expected_evaluation['partial'] assert evaluation['exact'] == expected_evaluation['exact'] -def test_compare_span_exact(): - annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'exact': { - 'correct': 1, - 'incorrect': 0, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - - expected_span_outputs = [SpanOutput( - output_type = "EXACT", - predicted_span = Span(entity_type = "SPACESHIP", entity_value = "golden retriever", start_position = 9, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - overlap_score = 1)] - print(span_outputs) - - assert len(span_outputs) == len(expected_span_outputs) - assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) - assert evaluation['strict'] == expected_evaluation['strict'] - assert evaluation['ent_type'] == expected_evaluation['ent_type'] - assert evaluation['partial'] == expected_evaluation['partial'] - assert evaluation['exact'] == expected_evaluation['exact'] - -def test_compare_span_partial(): - annotated_spans = [Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24)] - predicted_spans =[Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24)] - - evaluator = Evaluator(entities_to_keep=["ANIMAL"]) - span_outputs, evaluation, evaluation_agg_entities_type = evaluator.compare_span(annotated_spans, predicted_spans) - - expected_evaluation = { - 'strict': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'ent_type': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - }, - 'partial': { - 'correct': 0, - 'incorrect': 0, - 'partial': 1, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': - 1, - 'possible': 1 - }, - 'exact': { - 'correct': 0, - 'incorrect': 1, - 'partial': 0, - 'missed': 0, - 'spurious': 0, - 'precision': 0, - 'recall': 0, - 'actual': 1, - 'possible': 1 - } - } - - expected_span_outputs = [SpanOutput( - output_type = "PARTIAL", - predicted_span = 
Span(entity_type = "SPACESHIP", entity_value = "retriever", start_position = 15, end_position=24), - annotated_span = Span(entity_type = "ANIMAL", entity_value = "golden retriever", start_position = 9, end_position=24), - overlap_score = 0.72)] - print(span_outputs) - - assert len(span_outputs) == len(expected_span_outputs) - assert all([a.__eq__(b) for a, b in zip(span_outputs, expected_span_outputs)]) - assert evaluation['strict'] == expected_evaluation['strict'] - assert evaluation['ent_type'] == expected_evaluation['ent_type'] - assert evaluation['partial'] == expected_evaluation['partial'] - assert evaluation['exact'] == expected_evaluation['exact'] # TODO: refactor those functions # def test_evaluator_simple(): From abc9d5da237c5ef33da4c8be221473da54ea4146 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:50:22 +0100 Subject: [PATCH 12/16] Add function and unittest for get overlap score --- presidio_evaluator/data_objects.py | 9 +++++++++ tests/test_data_objects.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/presidio_evaluator/data_objects.py b/presidio_evaluator/data_objects.py index 2c028bc..2f320ed 100644 --- a/presidio_evaluator/data_objects.py +++ b/presidio_evaluator/data_objects.py @@ -4,6 +4,7 @@ from collections import Counter import pandas as pd +import numpy as np import spacy from spacy import Language from spacy.tokens import Doc, DocBin @@ -73,6 +74,14 @@ def intersect(self, other, ignore_entity_type: bool): return min(self.end_position, other.end_position) - max( self.start_position, other.start_position ) + + def get_overlap_ratio(self, other): + """ + Calculates the ratio as: ratio = 2.0*M / T , where M = matches , T = total number of elements in both sequences + """ + nb_matches = self.intersect(other, ignore_entity_type = True) + total_characters = (self.end_position - self.start_position) + (other.end_position - other.start_position) + return np.round((2*nb_matches/total_characters), 2) @classmethod def from_faker_span(cls, faker_span: FakerSpan) -> "Span": diff --git a/tests/test_data_objects.py b/tests/test_data_objects.py index 97e2713..96963ab 100644 --- a/tests/test_data_objects.py +++ b/tests/test_data_objects.py @@ -181,3 +181,25 @@ def test_spans_intersection( intersection = span1.intersect(span2, ignore_entity_type=ignore_entity_type) assert intersection == intersection_length + +@pytest.mark.parametrize( + "start1, end1, start2, end2, expected_overlap_ratio", + [ + (150, 153, 160, 165, 0.0), + (150, 153, 150, 153, 1.0), + (150, 153, 152, 154, 0.4), + (150, 153, 100, 151, 0.04), + ], +) +def test_get_overlap_ratio( + start1, end1, start2, end2, expected_overlap_ratio +): + span1 = Span( + entity_type="A", entity_value="123", start_position=start1, end_position=end1 + ) + span2 = Span( + entity_type="B", entity_value="123", start_position=start2, end_position=end2 + ) + + overlap_ratio = span1.get_overlap_ratio(span2) + assert overlap_ratio == expected_overlap_ratio From e285ef4f8f59650beb9ce992e0ddd9cfa2d9d080 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:55:18 +0100 Subject: [PATCH 13/16] Fix bugs in compare_span function. 
--- presidio_evaluator/evaluation/evaluator.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index bb105fd..620352f 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -83,7 +83,7 @@ def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) # annotated_spans = model_prediction.input_sample.spans # predicted_spans = model_prediction.predicted_spans - eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0} + eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0} evaluation = { 'strict': deepcopy(eval_metrics), 'ent_type': deepcopy(eval_metrics), @@ -151,11 +151,7 @@ def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) break # Check overlapping between true and pred elif evaluation_helpers.find_overlap(true_range, pred_range): - # overlap_ratio = SequenceMatcher(None, - # pred.entity_value, - # true.entity_value).ratio() - overlap_ratio = pred.intersect(true) - print(overlap_ratio) + overlap_ratio = pred.get_overlap_ratio(true) true_which_overlapped_with_pred.append(true) # Scenario V: There is an overlap (but offsets do not match exactly), # and the entity type is the same From e3acf9c96ecbb5caea04b5767f817edb62a2a882 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 15:56:00 +0100 Subject: [PATCH 14/16] Add functions to helpers --- presidio_evaluator/evaluation_helpers.py | 114 ++++++++++++++--------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/presidio_evaluator/evaluation_helpers.py b/presidio_evaluator/evaluation_helpers.py index 286cc24..54d5b7c 100644 --- a/presidio_evaluator/evaluation_helpers.py +++ b/presidio_evaluator/evaluation_helpers.py @@ -22,57 +22,79 @@ def get_matched_gold(predicted_span: Span, overlap_score=0 ) +def find_overlap(true_range, pred_range): + """Find the overlap between two ranges + Find the overlap between two ranges. Return the overlapping values if + present, else return an empty set(). + Examples: + >>> find_overlap((1, 2), (2, 3)) + 2 + >>> find_overlap((1, 2), (3, 4)) + set() + """ + + true_set = set(true_range) + pred_set = set(pred_range) + + overlaps = true_set.intersection(pred_set) + + return overlaps + def span_compute_actual_possible(results: dict) -> dict: - """ - Take the result dict and calculate the actual and possible spans - """ - strict = results["strict"] - exact = results["exact"] - incorrect = results["incorrect"] - partial = results["partial"] - missed = results["miss"] - spurious = results["spurious"] - # Possible: Number of annotations in the gold-standard which contribute to the final score - possible = strict + exact + incorrect + partial + missed - # Actual: Number of annotations produced by the PII detection system - actual = strict + exact + incorrect + partial + spurious - - results["actual"] = actual - results["possible"] = possible - - return results - -def span_compute_precision_recall(results: dict) -> dict: """ - Take the result dict to calculate the strict and flexible precision/ recall + Takes a result dict that has been output by compute metrics. + Returns the results dict with actual, possible populated. 
+ When the results dicts is from partial or ent_type metrics, then + partial_or_type=True to ensure the right calculation is used for + calculating precision and recall. + """ + + correct = results['correct'] + incorrect = results['incorrect'] + partial = results['partial'] + missed = results['missed'] + spurious = results['spurious'] + + # Possible: number annotations in the gold-standard which contribute to the + # final score + + possible = correct + incorrect + partial + missed + + # Actual: number of annotations produced by the NER system + + actual = correct + incorrect + partial + spurious + + results["actual"] = actual + results["possible"] = possible + + return results + +def span_compute_precision_recall(results: dict, partial_or_type) -> dict: + """ + Takes a result dict that has been output by compute metrics. + Returns the results dict with precison and recall populated. + When the results dicts is from partial or ent_type metrics, then + partial_or_type=True to ensure the right calculation is used for + calculating precision and recall. """ - metrics = {} - strict = results["strict"] - exact = results["exact"] - partial = results["partial"] + actual = results["actual"] possible = results["possible"] - - # Calculate the strict performance - strict_precision = strict / actual if actual > 0 else 0 - strict_recall = strict / possible if possible > 0 else 0 - - # Calculate the flexible performance - flexible_precision = (strict + exact)/ actual if actual > 0 else 0 - flexible_recall = (strict + exact) / possible if possible > 0 else 0 - - # Calculate the partial performance - partial_precision = (strict + exact + 0.5 * partial) / actual if actual > 0 else 0 - partial_recall = (strict + exact + 0.5 * partial) / possible if possible > 0 else 0 - - - metrics["strict precision"] = strict_precision - metrics["strict recall"] = strict_recall - metrics["flexible precision"] = flexible_precision - metrics["flexible recall"] = flexible_recall - metrics["partial precision"] = partial_precision - metrics["partial recall"] = partial_recall - return metrics + partial = results['partial'] + correct = results['correct'] + + if partial_or_type: + precision = (correct + 0.5 * partial) / actual if actual > 0 else 0 + recall = (correct + 0.5 * partial) / possible if possible > 0 else 0 + + else: + precision = correct / actual if actual > 0 else 0 + recall = correct / possible if possible > 0 else 0 + + results["precision"] = precision + results["recall"] = recall + + return results # TODO: Implement this function def dict_merge(dict_1: dict, dict2: dict) -> dict: From c92b543b81f94442162f89336b2ff60bb4dfdc1c Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Mon, 30 Jan 2023 16:30:35 +0100 Subject: [PATCH 15/16] Add test for span equal function --- tests/test_data_objects.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_data_objects.py b/tests/test_data_objects.py index 96963ab..084d9c9 100644 --- a/tests/test_data_objects.py +++ b/tests/test_data_objects.py @@ -203,3 +203,25 @@ def test_get_overlap_ratio( overlap_ratio = span1.get_overlap_ratio(span2) assert overlap_ratio == expected_overlap_ratio + +@pytest.mark.parametrize( + "start1, end1, entity_value1, entity_type1, start2, end2, entity_value2, entity_type2, expected_output", + [ + (150, 153, "123", "A", 150, 153, "123", "A", True), + (150, 153, "123", "B", 150, 153, "123", "A", False), + (150, 153, "123", "A", 150, 153, "345", "A", False), + (150, 153, 
"123", "A", 153, 156, "123", "A", False), + ], +) +def test_span_eq( + start1, end1, entity_value1, entity_type1, start2, end2, entity_value2, entity_type2, expected_output +): + span1 = Span( + entity_type=entity_type1, entity_value=entity_value1, start_position=start1, end_position=end1 + ) + span2 = Span( + entity_type=entity_type2, entity_value=entity_value2, start_position=start2, end_position=end2 + ) + + output = span1.__eq__(span2) + assert output == expected_output From d123ccadd9236ce2e54cda9c4dac9df38640e4c0 Mon Sep 17 00:00:00 2001 From: Trang Nguyen <85181462+tranguyen221@users.noreply.github.com> Date: Tue, 31 Jan 2023 08:12:52 +0100 Subject: [PATCH 16/16] Update docs --- presidio_evaluator/evaluation/evaluator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 620352f..634c1ba 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -70,14 +70,15 @@ def compare_token(self, model_prediction: ModelPrediction) -> Tuple[List[TokenOu return List[TokenOutput], Counter # def compare_span(self, model_prediction: ModelPrediction) -> Tuple[List[SpanOutput], dict[dict]]: - def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) -> Tuple[List[SpanOutput], dict[dict]]: + def compare_span(self, annotated_spans: List[Span], predicted_spans: List[Span]) -> Tuple[List[SpanOutput], dict[dict], dict[dict]]: """ Compares ground truth tags (annotation) and predicted (prediction) at span level. - :param annotated_spans: model_prediction containing an InputSample and a list of predicted tags and tokens - :param predicted_spans: + :param annotated_spans: truth annotation from InputSample + :param predicted_spans: predicted span from PII model/system Returns: List[SpanOutput]: a list of SpanOutput - dict: a dictionary of PII results per entity with structure {{entity_name: {output_type : count}}} + dict: a dictionary of global PII results with structure {eval_type : {}} + dict: a dictionary of PII results per entity with structure {entity_name: {eval_type : {}}} """ # get annotated and predicted span from ModelPrediction # annotated_spans = model_prediction.input_sample.spans