-
Notifications
You must be signed in to change notification settings - Fork 11
/
ngb_test.py
120 lines (88 loc) · 4.6 KB
/
ngb_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import unittest
import ngb
class CounterTest(unittest.TestCase):
    """Tests for ``ngb.Counter``: merging counts and ``remove_subphrases``."""

    def test_when_adding_two_ngrams_then_counts_added(self):
        # Adding a second counter with the same key should sum the counts.
        counter = ngb.Counter({"apple": 1})
        counter.add(ngb.Counter({"apple": 2}))
        self.assertEqual(counter["apple"], 3)

    def test_when_cannot_remove_subphrases_then_none_removed(self):
        # Each phrase here has a different count (4, 3, 2, 1); the test
        # expects all four entries to survive the pruning pass.
        counter = ngb.Counter({
            "apple": 4,
            "apple core": 3,
            "apple core removal": 2,
            "apple core removal tool": 1,
        })
        counter.remove_subphrases()
        self.assertEqual(len(list(counter.keys())), 4)

    def test_when_can_remove_all_subphrases_then_all_removed(self):
        # Every shorter phrase has the same count as the longest phrase that
        # contains it, so only the longest phrase is expected to remain.
        counter = ngb.Counter({
            "apple": 1,
            "apple core": 1,
            "core removal": 1,
            "apple core removal": 1,
            "apple core removal tool": 1,
        })
        counter.remove_subphrases()
        self.assertEqual(counter, {"apple core removal tool": 1})

    def test_when_can_remove_some_subphrases_then_some_removed(self):
        # "apple" has a higher count (2) than the phrases containing it, so
        # it is expected to be kept while the count-1 subphrases are dropped.
        counter = ngb.Counter({
            "apple": 2,
            "apple core": 1,
            "core removal": 1,
            "apple core removal tool": 1,
        })
        counter.remove_subphrases()
        self.assertEqual(
            counter, {"apple": 2, "apple core removal tool": 1})

    def test_when_subphrase_appears_within_ngram_then_can_be_removed(self):
        # "core" sits in the middle of the longer phrase, not at either end.
        counter = ngb.Counter({"core": 1, "apple core removal": 1})
        counter.remove_subphrases()
        # assertNotIn reports the container contents on failure, unlike
        # assertFalse(x in y) which only reports "True is not false".
        self.assertNotIn("core", counter)
class NgramBuilderTest(unittest.TestCase):
    """Tests for ``ngb.NgramBuilder.find_ngrams`` with the stock stopwords."""

    def setUp(self):
        # Shared sample text: four words, with "apple" appearing twice.
        self.text = "apple banana apple potato"
        # Lower-case attribute name so the fixture instance is not mistaken
        # for the ngb.NgramBuilder class itself.
        self.builder = ngb.NgramBuilder(stopwords=ngb.stopwords)

    def test_when_counting_unigrams_then_single_words_found(self):
        expected = {"apple": 2, "banana": 1, "potato": 1}
        self.assertEqual(self.builder.find_ngrams(self.text, 1), expected)

    def test_when_counting_bigrams_then_double_words_found(self):
        expected = {"apple banana": 1, "banana apple": 1, "apple potato": 1}
        self.assertEqual(self.builder.find_ngrams(self.text, 2), expected)

    def test_when_counting_ngrams_same_length_as_input_then_1_found(self):
        # An n-gram as long as the whole input can only be the input itself.
        word_count = len(self.text.split(" "))
        self.assertEqual(
            self.builder.find_ngrams(self.text, word_count),
            {self.text: 1})

    def test_when_counting_ngrams_longer_than_input_then_nothing_found(self):
        self.assertEqual(self.builder.find_ngrams(self.text, 100), {})

    def test_when_text_contains_punctuation_then_punctuation_ignored(self):
        text = "apple, banana"
        expected = {"apple": 1, "banana": 1}
        self.assertEqual(self.builder.find_ngrams(text, 1), expected)

    def test_when_text_contains_numbers_then_not_returned_as_unigrams(self):
        text = "1 potato, 2 potato"
        self.assertEqual(self.builder.find_ngrams(text, 1), {"potato": 2})

    def test_when_unigram_is_single_character_then_excluded(self):
        self.assertEqual(self.builder.find_ngrams("a bc", 1), {"bc": 1})

    def test_when_unigram_doesnt_contain_letter_then_ignored(self):
        self.assertEqual(self.builder.find_ngrams("23:45", 1), {})

    def test_when_ngram_starts_with_stopword_then_ignored(self):
        # Both inputs yield only bigrams whose first word is a stopword
        # (assuming "the" and "of" are in ngb.stopwords -- not visible here).
        self.assertEqual(self.builder.find_ngrams("the apple", 2), {})
        self.assertEqual(self.builder.find_ngrams("of the bear", 2), {})

    def test_when_ngram_ends_with_stopword_then_ignored(self):
        # Mirror of the test above with the stopword in the final position.
        self.assertEqual(self.builder.find_ngrams("apple the", 2), {})
        self.assertEqual(self.builder.find_ngrams("bear of the", 2), {})
class TokenizerTest(unittest.TestCase):
    """Tests for ``ngb.NgramBuilder.split_into_unigrams`` tokenization."""

    def tokenize(self, text):
        """Tokenize *text* with a default-constructed ``ngb.NgramBuilder``.

        Returns element ``[1]`` of the ``split_into_unigrams`` result, which
        the tests below compare against lists of words.
        """
        # NOTE(review): what element [0] holds is not visible from this
        # file -- confirm against ngb.NgramBuilder.split_into_unigrams.
        return ngb.NgramBuilder().split_into_unigrams(text)[1]

    def test_when_word_contains_apostrophe_then_included_in_unigram(self):
        # assertIn shows the token list on failure, unlike assertTrue(x in y).
        self.assertIn("don't", self.tokenize("don't you know"))

    def test_when_word_ends_with_punctuation_then_removed(self):
        unigrams = self.tokenize("yes, thanks! - much.")
        self.assertEqual(unigrams, ["yes", "thanks", "much"])

    def test_when_word_is_punctuation_then_removed(self):
        self.assertEqual(self.tokenize("{ } [ ] ( ) /"), [])

    def test_when_content_is_whitespace_then_not_considered_an_ngram(self):
        self.assertEqual(self.tokenize(" \tfruit!\n \t\t "), ["fruit"])

    def test_when_content_contains_newlines_then_split_on_newlines(self):
        self.assertEqual(self.tokenize("foo\n\nbar"), ["foo", "bar"])
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()