forked from matchbox/warc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
warctikad.py
executable file
·53 lines (42 loc) · 1.6 KB
/
warctikad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
"""Copyright 2014 Tom Nicholls
Process a directory of Web ARChive files through the warctika library to
reduce binary document formats to plain text 'conversion' records.
This work is available under the terms of the GNU General Public License
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
"""
#####
#SETUP
#####
import sys
import os
from warctika import *
import re
import time
# Suffix identifying WARC files awaiting conversion, and the suffix given to
# the converted output written alongside them.
oldsuffix = '.warc.gz'
newsuffix = '-ViaTika.warc.gz'

# Seconds to wait between successive scans of the watched directory.
POLL_INTERVAL = 15


def is_pending(fn):
    """Return True if *fn* names an input WARC rather than converted output.

    The output suffix itself ends with the input suffix, so both tests are
    needed to tell the two kinds of file apart.
    """
    return fn.endswith(oldsuffix) and not fn.endswith(newsuffix)


def converted_name(infn):
    """Return the output path for *infn*: old suffix swapped for the new one.

    *infn* must end with ``oldsuffix``.  The suffix is removed by slicing
    rather than with a regular expression built from it, because the '.'
    characters in the suffix would act as wildcards in a pattern.
    """
    return infn[:-len(oldsuffix)] + newsuffix


def process_pending(dirname, warcprocessor):
    """Make one pass over *dirname*, converting every pending WARC file.

    Files whose converted counterpart already exists are reported and
    skipped; every other pending file is handed to
    ``warcprocessor.process``.
    """
    for fn in os.listdir(dirname):
        if not is_pending(fn):
            continue
        infn = os.path.join(dirname, fn)
        outfn = converted_name(infn)
        if os.path.exists(outfn):
            # Single pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print("File %s has already been processed. Skipping." % infn)
        else:
            # NOTE(review): delete=True presumably asks the processor to
            # remove the input once converted -- confirm against warctika.
            warcprocessor.process(infn=infn, outfn=outfn, delete=True)
            print("Done.")


def main(argv=None):
    """Watch the directory named in ``argv[1]`` and convert new WARC files.

    *argv* defaults to ``sys.argv``.  Returns 1 if no directory was given;
    otherwise rescans the directory every ``POLL_INTERVAL`` seconds and
    does not return.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("Must give name of WARC directory to watch\n")
        return 1
    dirname = argv[1]
    warcprocessor = WARCTikaProcessor()
    while True:
        process_pending(dirname, warcprocessor)
        time.sleep(POLL_INTERVAL)


if __name__ == '__main__':
    sys.exit(main())