# ner_fuzzy_manual.py
from typing import List, Optional
import copy
from collections import defaultdict

import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string, set_hashes
import spacy
from spacy.tokens import Span
from spaczz.matcher import FuzzyMatcher
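
# Besides Prodigy itself (commercial license), the recipe assumes spaCy and the
# spaczz package are installed; spaczz is the only extra dependency, e.g.:
#   pip install spaczz
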
def parse_phrase_patterns(patterns):
    """
    A parser for the patterns file.
    It assumes Prodigy's patterns format with string patterns such as
    {"pattern": "some pattern", "label": "SOME_LABEL"}.
    """
    phrase_patterns = defaultdict(list)
    # Auxiliary map to recover the pattern ID as its line number in the UI
    line_numbers = {}
    for i, entry in enumerate(patterns):
        label = entry["label"]
        pattern = entry["pattern"]
        line_number = i + 1
        phrase_patterns[label].append((line_number, pattern))
        line_numbers[line_number] = label
    return phrase_patterns, line_numbers
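
# For illustration, with a hypothetical patterns file containing:
#   {"pattern": "acetaminophen", "label": "DRUG"}
#   {"pattern": "ibuprofen", "label": "DRUG"}
# parse_phrase_patterns would return:
#   phrase_patterns == {"DRUG": [(1, "acetaminophen"), (2, "ibuprofen")]}
#   line_numbers == {1: "DRUG", 2: "DRUG"}
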
def apply_fuzzy_matcher(stream, nlp, fuzzy_matcher, line_numbers):
    """Add a 'spans' key to each example, with fuzzy pattern matches."""
    # Process the stream using spaCy's nlp.pipe, which yields Doc objects.
    # If as_tuples=True is set, you can pass in (text, context) tuples.
    texts_examples = ((eg["text"], eg) for eg in stream)
    for doc, eg in nlp.pipe(texts_examples, as_tuples=True):
        task = copy.deepcopy(eg)
        matched_spans = []
        for line_number, start_token, end_token, _ in fuzzy_matcher(doc):
            span_obj = Span(doc, start_token, end_token)
            span = {
                "text": span_obj.text,
                "start": span_obj.start_char,
                "end": span_obj.end_char,
                "token_start": span_obj.start,
                # Prodigy's token_end is inclusive; spaCy's Span.end is exclusive
                "token_end": span_obj.end - 1,
                "label": line_numbers[line_number],
                "line_number": line_number,
            }
            matched_spans.append(span)
        if matched_spans:
            task["spans"] = matched_spans
            all_ids = []
            for s in task["spans"]:
                all_ids.append(s["line_number"])
                # The line number isn't needed on the span itself anymore
                del s["line_number"]
            # Guard against input examples that don't carry a "meta" dict yet
            task.setdefault("meta", {})
            task["meta"]["pattern"] = ", ".join(str(p) for p in all_ids)
        # Rehash the newly created task so that hashes reflect added data
        task = set_hashes(task)
        yield task
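
# For illustration, a task yielded above for the hypothetical input
# {"text": "Took acetaminofen for a headache."} (note the misspelling, which
# the fuzzy matcher still catches) might carry a span like:
#   {"text": "acetaminofen", "start": 5, "end": 17,
#    "token_start": 1, "token_end": 1, "label": "DRUG"}
# and its "meta" would read {"pattern": "1"}.
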
# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on the value before it's passed
# to the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "ner.fuzzy.manual",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("The base model", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
    patterns=("Phrase patterns", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    exclude=("Names of datasets to exclude", "option", "e", split_string),
)
def ner_fuzzy_manual(
    dataset: str,
    spacy_model: str,
    source: str,
    patterns: str,
    label: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Mark spans manually by token, with suggestions from phrase patterns pre-highlighted.
    The suggestions are spans matched by the spaczz fuzzy matcher, ignoring case.
    Note that if token patterns are required, the spaczz token-pattern syntax must be
    followed and the parsing function modified accordingly; see the spaczz
    documentation for details: https://spacy.io/universe/project/spaczz.
    The recipe doesn't require an entity recognizer and doesn't do any active learning.
    It presents all examples in order, so even examples without matches are shown.
    """
    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)
    # Initialize the spaczz fuzzy matcher
    fuzzy_matcher = FuzzyMatcher(nlp.vocab)
    # Load phrase patterns and feed them to the spaczz matcher
    pattern_entries = JSONL(patterns)
    phrase_patterns, line_numbers = parse_phrase_patterns(list(pattern_entries))
    for pattern_label, label_patterns in phrase_patterns.items():
        for line_number, pattern in label_patterns:
            # Use the line number from the patterns source file as the pattern ID
            fuzzy_matcher.add(line_number, [nlp(pattern)], kwargs=[{"ignorecase": True}])
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)
    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)
    # Apply the spaczz matcher to the stream.
    stream = apply_fuzzy_matcher(stream, nlp, fuzzy_matcher, line_numbers)
    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,       # Name of dataset to save annotations
        "stream": stream,         # Incoming stream of examples
        "exclude": exclude,       # List of dataset names to exclude
        "config": {               # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,
        },
    }
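
# Example usage (dataset, model, and file names are placeholders):
#   prodigy ner.fuzzy.manual my_dataset en_core_web_sm ./examples.jsonl \
#       ./patterns.jsonl --label DRUG,DISEASE -F ner_fuzzy_manual.py
# The -F flag points Prodigy at this file so it can load the custom recipe.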