-
Notifications
You must be signed in to change notification settings - Fork 7
/
xmlmysqldump.py
executable file
·71 lines (57 loc) · 1.79 KB
/
xmlmysqldump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
"""
Read in the XML mysqldump from sys.stdin.
@note: We assume that <row> and </row> are in lines with no other text, save whitespace.
"""
import sys
# psyco is optional: try to import and enable it, and if the import fails
# just note that on stderr and carry on without it.
try:
    import psyco
    psyco.full()
except ImportError:
    sys.stderr.write("psyco could not be imported\n")
# NOTE(review): `rows` is initialised here but nothing in the visible script
# appends to it or reads it -- confirm whether it can be dropped.
rows = []
# Text of the <row> element currently being accumulated from stdin, or None
# while no row is open.
rowtxt = None
import re
# Patterns are written as raw strings so that regex escapes such as \s and \|
# reach the re module verbatim, instead of relying on Python leaving
# unrecognised string escapes untouched.
# A line holding only an opening <row> tag (surrounding whitespace allowed).
row_start_re = re.compile(r'^\s*<row>\s*$')
# A line holding only a closing </row> tag (surrounding whitespace allowed).
row_end_re = re.compile(r'^\s*</row>\s*$')
# Captures the contents of a <title> element; "." does not match newlines
# here, so the whole element must sit on one line to be found.
title_re = re.compile(r'<title>(.*)</title>')
# The "Hacker News |" prefix (plus trailing whitespace) removed from titles.
hackernews_re = re.compile(r'Hacker News \|\s*')
# A "|" followed by whitespace and the word "link".
link_re = re.compile(r'\|\s+link')
from common.xml2json.parker import convertxmlstring
from common.html2text import html2text
import common.json
# Stream stdin line by line, collecting the text of each <row>...</row>
# element in `rowtxt` and processing the row once its closing tag arrives.
for line in sys.stdin:
    if row_start_re.match(line):
        # Rows must not nest.  Raise explicitly instead of asserting, since
        # asserts are stripped when Python runs with -O.
        if rowtxt is not None:
            raise ValueError("<row> encountered inside an unterminated <row>")
        rowtxt = ""
    if rowtxt is not None:
        rowtxt += line
    if not row_end_re.match(line):
        continue

    # A closing tag with no open row would otherwise hand None to the XML
    # converter; fail with a clear message instead.
    if rowtxt is None:
        raise ValueError("</row> encountered without a matching <row>")
    row = convertxmlstring(rowtxt)["row"]
    # The row text has been consumed: reset the accumulator once here so that
    # every exit path below leaves the loop ready for the next <row>.
    rowtxt = None

    # NOTE(review): assumes the converted row unpacks into exactly these four
    # values, in this order -- confirm against convertxmlstring's output.
    (row_id, created_at, url, text) = row
    sys.stderr.write("Processing id #%s\n" % (row_id,))
    # print "id=", row_id
    # print "created_at=", created_at
    # print "url=", url

    # Rows without any page text have nothing further to process.
    if text is None or text == "":
        continue

    # Pull the page title out of the text, if present, and drop the
    # "Hacker News |" prefix from it.
    m = title_re.search(text)
    if m:
        title = hackernews_re.sub("", m.group(1))
    else:
        title = None
    # print "title=", title.encode("utf-8")
    # print "text=", text.encode("utf-8")

    try:
        txttext = html2text(text)
        # print "len(txttext)=", len(txttext)
        # print "commentcount=", len(link_re.findall(txttext))
        # print "txttext=", txttext
        # print
    except RuntimeError:
        # Report the offending text on stderr and move on to the next row.
        sys.stderr.write(
            "RuntimeError when processing text: %s\n" % text.encode("utf-8")
        )
        continue