contractions.py
"""
Deals with contractions by expanding spaCy's tokenizer exceptions.
ORTH is the form as it appears in the text/corpus.
LEMMA is the dictionary (base) form.
NORM is the normalised form that replaces the contracted piece.
TAG is the part-of-speech tag.
"""
from __future__ import unicode_literals, print_function

import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG

nlp = spacy.load('en_core_web_sm')
TOKENIZER_EXCEPTIONS = {
    # do
    "don't": [
        {ORTH: "do", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "doesn't": [
        {ORTH: "does", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "didn't": [
        {ORTH: "did", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    # can
    "can't": [
        {ORTH: "ca", LEMMA: "can"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "couldn't": [
        {ORTH: "could", LEMMA: "can"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    # have
    "I've": [
        {ORTH: "I", LEMMA: "I"},
        {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VERB"}],
    "haven't": [
        {ORTH: "have", LEMMA: "have"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "hasn't": [
        {ORTH: "has", LEMMA: "have"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "hadn't": [
        {ORTH: "had", LEMMA: "have"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    # will ('ll is expanded to "will", whether it stands for will or shall)
    "I'll": [
        {ORTH: "I", LEMMA: "I"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "VERB"}],
    "he'll": [
        {ORTH: "he", LEMMA: "he"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "VERB"}],
    "she'll": [
        {ORTH: "she", LEMMA: "she"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "VERB"}],
    "it'll": [
        {ORTH: "it", LEMMA: "it"},
        {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "VERB"}],
    "won't": [
        {ORTH: "wo", LEMMA: "will"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "wouldn't": [
        {ORTH: "would", LEMMA: "will"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    # be
    "I'm": [
        {ORTH: "I", LEMMA: "I"},
        {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VERB"}]
}
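
# The exceptions above are defined but not applied anywhere in the original
# script. Assuming the intent is for the tokenizer to use them, a minimal way
# to register them is shown below. Note this relies on the spaCy v2-style
# special-case API; spaCy v3 only accepts ORTH and NORM in special cases, so
# the LEMMA/TAG attributes assume spaCy v2.
for orth, attrs in TOKENIZER_EXCEPTIONS.items():
    nlp.tokenizer.add_special_case(orth, attrs)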


def de_contraction(doc):
    """
    Replaces contractions with their full forms, using the NORM values set by
    TOKENIZER_EXCEPTIONS (and spaCy's defaults for all other tokens).
    """
    return nlp.make_doc(' '.join(token.norm_ for token in doc))


print(de_contraction(nlp(u"Oh no he didn't. I can't and I won't. I'll know what I'm gonna do.")))
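
# Optional sanity check (a small sketch, not part of the original script):
# print each token of a registered contraction alongside the norm that
# de_contraction substitutes for it.
for token in nlp(u"I haven't"):
    print(token.text, token.norm_)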