forked from matchbox/warc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
warcnottikad.py
executable file
·84 lines (72 loc) · 2.95 KB
/
warcnottikad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python
"""Copyright 2014 Tom Nicholls
Process a directory of Web ARChive files through the warctika library to
reduce binary document formats to plain text 'conversion' records.
This work is available under the terms of the GNU General Public License.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
"""
#####
#SETUP
#####
import sys
import os
import pyinotify
import warctika
import re
import time
# The first command-line argument names the WARC directory to poll; any
# further arguments are ignored.
if len(sys.argv) < 2:
    # sys.exit(<str>) writes the message to stderr and exits with status 1,
    # so the usage error no longer ends up on stdout.
    sys.exit("Must give name of WARC directory to watch")
dirname = sys.argv[1]
# Fail early with a clear message instead of an os.listdir() traceback later.
if not os.path.isdir(dirname):
    sys.exit("Not a directory: " + dirname)
# Watch the WARC directory for file creation and deletion
#log.setLevel(10)
#wm = pyinotify.WatchManager() # Watch Manager
# watched events
# TODO: Consider whether we also want IN_CLOSE_WRITE (depends on the order in
# which heritrix finishes writing, closes and renames the file).
#mask = pyinotify.IN_CREATE | pyinotify.IN_MOVED_TO | pyinotify.IN_MOVED_FROM
#wm.add_watch(dirname, mask)
# Suffix identifying input WARC files, and the suffix that replaces it to
# form the name of the output file written next to each input.
oldsuffix = ".warc.gz"
newsuffix = "-NotViaTika.warc.gz"
# Processor whose process(infn=..., outfn=...) method is called once per
# input WARC by the polling loop.
warcprocessor = warctika.WARCNonTikaProcessor()
#handler = warctika.WARCNotifyHandler(warcprocessor=warcprocessor,
# oldsuffix=oldsuffix,
# newsuffix=newsuffix)
#notifier = pyinotify.Notifier(wm, handler)
# On first run,
# loop through watched directory and handle all existing
# files, in case we restarted part-way through a crawl.
# Then check forever.
# Poll the WARC directory forever. The first pass handles every input WARC
# already present (e.g. after a restart part-way through a crawl); later
# passes only handle inputs that are new or whose modification time changed,
# rather than re-converting the whole directory on every poll.
processed = {}  # input path -> modification time when it was last processed
while True:
    for fn in os.listdir(dirname):
        # Output names also end in oldsuffix, so they must be excluded
        # explicitly or they would be fed back in as inputs.
        if not fn.endswith(oldsuffix) or fn.endswith(newsuffix):
            continue
        infn = os.path.join(dirname, fn)
        mtime = os.path.getmtime(infn)
        if processed.get(infn) == mtime:
            # Unchanged since it was last processed in this run.
            continue
        # Swap the suffix by slicing: oldsuffix contains '.' characters,
        # which a regular expression would treat as wildcards.
        outfn = infn[:-len(oldsuffix)] + newsuffix
        print("Processing existing file: %s" % infn)
        # Errors from process() are deliberately not caught: a failure stops
        # the watcher instead of being skipped silently.
        # TODO: on failure, remove any partially written output file.
        warcprocessor.process(infn=infn, outfn=outfn)
        processed[infn] = mtime
        # The input WARC is left in place rather than unlinked.
        print("Not deleting: %s" % infn)
    print("Done.")
    # Wait before scanning the directory again.
    time.sleep(15)
#print "Finished processing existing files. Now watching for new WARC files."
# Run forever
#notifier.loop()