forked from 4sushi/fastHmlExtract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
testFastHtmlExtract.py
151 lines (128 loc) · 8.55 KB
/
testFastHtmlExtract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
import unittest
import fastHtmlExtract
class TestFastHtmlExtract(unittest.TestCase):
"""
Unit Test - FastHtmlExtract
"""
def setUp(self):
self.html = """
<div class="container">
<div class="row">
<div class="col-sm-8 blog-main">
<div class="blog-post">
<h2 class="blog-post-title">Sample blog post</h2>
<p class="blog-post-meta">January 1, 2014 by <a href="#">Mark</a></p>
<p>This blog post shows a few different types of content that's supported and styled with Bootstrap. Basic typography, images, and code are all supported.</p>
<hr>
<p>Cum sociis natoque penatibus et magnis <a href="#">dis parturient montes</a>, nascetur ridiculus mus. Aenean eu leo quam. Pellentesque ornare sem lacinia quam venenatis vestibulum. Sed posuere consectetur est at lobortis. Cras mattis consectetur purus sit amet fermentum.</p>
<blockquote>
<p>Curabitur blandit tempus porttitor. <strong>Nullam quis risus eget urna mollis</strong> ornare vel eu leo. Nullam id dolor id nibh ultricies vehicula ut id elit.</p>
</blockquote>
<p>Etiam porta <em>sem malesuada magna</em> mollis euismod. Cras mattis consectetur purus sit amet fermentum. Aenean lacinia bibendum nulla sed consectetur.</p>
<h2>Heading</h2>
<p>Vivamus sagittis lacus vel augue laoreet rutrum faucibus dolor auctor. Duis mollis, est non commodo luctus, nisi erat porttitor ligula, eget lacinia odio sem nec elit. Morbi leo risus, porta ac consectetur ac, vestibulum at eros.</p>
<h3>Sub-heading</h3>
<p>Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.</p>
<pre><code>Example code block</code></pre>
<p>Aenean lacinia bibendum nulla sed consectetur. Etiam porta sem malesuada magna mollis euismod. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa.</p>
<h3>Sub-heading</h3>
<p>Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Aenean lacinia bibendum nulla sed consectetur. Etiam porta sem malesuada magna mollis euismod. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus.</p>
<ul>
<li>Praesent commodo cursus magna, vel scelerisque nisl consectetur et.</li>
<li>Donec id elit non mi porta gravida at eget metus.</li>
<li>Nulla vitae elit libero, a pharetra augue.</li>
</ul>
<p>Donec ullamcorper nulla non metus auctor fringilla. Nulla vitae elit libero, a pharetra augue.</p>
<ol>
<li>Vestibulum id ligula porta felis euismod semper.</li>
<li>Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.</li>
<li>Maecenas sed diam eget risus varius blandit sit amet non magna.</li>
</ol>
<p>Cras mattis consectetur purus sit amet fermentum. Sed posuere consectetur est at lobortis.</p>
</div><!-- /.blog-post -->
<div class="blog-post">
<h2 class="blog-post-title">Another blog post</h2>
<p class="blog-post-meta">December 23, 2013 by <a href="#">Jacob</a></p>
<p>Cum sociis natoque penatibus et magnis <a href="#">dis parturient montes</a>, nascetur ridiculus mus. Aenean eu leo quam. Pellentesque ornare sem lacinia quam venenatis vestibulum. Sed posuere consectetur est at lobortis. Cras mattis consectetur purus sit amet fermentum.</p>
<blockquote>
<p>Curabitur blandit tempus porttitor. <strong>Nullam quis risus eget urna mollis</strong> ornare vel eu leo. Nullam id dolor id nibh ultricies vehicula ut id elit.</p>
</blockquote>
<p>Etiam porta <em>sem malesuada magna</em> mollis euismod. Cras mattis consectetur purus sit amet fermentum. Aenean lacinia bibendum nulla sed consectetur.</p>
<p>Vivamus sagittis lacus vel augue laoreet rutrum faucibus dolor auctor. Duis mollis, est non commodo luctus, nisi erat porttitor ligula, eget lacinia odio sem nec elit. Morbi leo risus, porta ac consectetur ac, vestibulum at eros.</p>
</div><!-- /.blog-post -->
<div class="blog-post">
<h2 class="blog-post-title">New feature</h2>
<p class="blog-post-meta">December 14, 2013 by <a href="#">Chris</a></p>
<p>Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Aenean lacinia bibendum nulla sed consectetur. Etiam porta sem malesuada magna mollis euismod. Fusce dapibus, tellus ac cursus commodo, tortor mauris condimentum nibh, ut fermentum massa justo sit amet risus.</p>
<ul>
<li>Praesent commodo cursus magna, vel scelerisque nisl consectetur et.</li>
<li>Donec id elit non mi porta gravida at eget metus.</li>
<li>Nulla vitae elit libero, a pharetra augue.</li>
</ul>
<p>Etiam porta <em>sem malesuada magna</em> mollis euismod. Cras mattis consectetur purus sit amet fermentum. Aenean lacinia bibendum nulla sed consectetur.</p>
<p>Donec ullamcorper nulla non metus auctor fringilla. Nulla vitae elit libero, a pharetra augue.</p>
</div><!-- /.blog-post -->
<nav class="blog-pagination">
<a class="btn btn-outline-primary" href="#">Older</a>
<a class="btn btn-outline-secondary disabled" href="#">Newer</a>
</nav>
</div><!-- /.blog-main -->
<div class="col-sm-3 offset-sm-1 blog-sidebar">
<div class="sidebar-module sidebar-module-inset">
<h4>About</h4>
<p>Etiam porta <em>sem malesuada magna</em> mollis euismod. Cras mattis consectetur purus sit amet fermentum. Aenean lacinia bibendum nulla sed consectetur.</p>
</div>
<div class="sidebar-module">
<h4>Archives</h4>
<ol class="list-unstyled">
<li><a href="#">March 2014</a></li>
<li><a href="#">February 2014</a></li>
<li><a href="#">January 2014</a></li>
<li><a href="#">December 2013</a></li>
<li><a href="#">November 2013</a></li>
<li><a href="#">October 2013</a></li>
<li><a href="#">September 2013</a></li>
<li><a href="#">August 2013</a></li>
<li><a href="#">July 2013</a></li>
<li><a href="#">June 2013</a></li>
<li><a href="#">May 2013</a></li>
<li><a href="#">April 2013</a></li>
</ol>
</div>
<div class="sidebar-module">
<h4>Elsewhere</h4>
<ol class="list-unstyled">
<li><a href="#">GitHub</a></li>
<li><a href="#">Twitter</a></li>
<li><a href="#">Facebook</a></li>
</ol>
</div>
</div><!-- /.blog-sidebar -->
</div><!-- /.row -->
<!-- <h3>H3 in comment</h3> -->
</div>
"""
def test_extract_h3(self):
html_elements = fastHtmlExtract.extract(self.html, ['h3'])
expected_elements = ['<h3>Sub-heading</h3>','<h3>Sub-heading</h3>']
self.assertEqual(expected_elements, html_elements['h3'])
def test_extract_p(self):
html_elements = fastHtmlExtract.extract(self.html, ['p'])
nb_expected_elements = 21
self.assertEqual(nb_expected_elements, len(html_elements['p']))
def test_count_div(self):
elements_count = fastHtmlExtract.count(self.html, ['div'])
nb_expected_elements = 10
self.assertEqual(nb_expected_elements, elements_count['div'])
def test_count_h3(self):
elements_count = fastHtmlExtract.count(self.html, ['h3'])
nb_expected_elements = 2
self.assertEqual(nb_expected_elements, elements_count['h3'])
def test_get_text(self):
html = "abc <p> def <br/>ghi<b>jkl</b></p>1234"
text = fastHtmlExtract.get_text(html)
expected_text = "abc def ghi jkl 1234"
self.assertEqual(expected_text, text)
if __name__ == '__main__':
unittest.main()