-
Notifications
You must be signed in to change notification settings - Fork 1
/
server.py
executable file
·418 lines (369 loc) · 15 KB
/
server.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
#!/usr/bin/env python
import ConfigParser
import fnmatch
import logging
import mimetypes
import os
from rar import RarFile, BadRarFile
import re
from shutil import rmtree
import subprocess
import sys
import zipfile
from twisted.python.log import err
from twisted.protocols.basic import FileSender
from twisted.internet import reactor, defer, error as twistedErrors
from twisted.web import server, resource
from twisted.web.resource import NoResource
# Logging: everything in this module reports through the "comix" logger.
# The logger itself passes DEBUG and above; each handler applies its own
# threshold on top of that.
logfilename = "comix.log"
formatter = logging.Formatter("%(asctime)s - %(message)s")

logger = logging.getLogger("comix")
logger.setLevel(logging.DEBUG)

# Console output (development): shows everything from DEBUG upwards.
# TODO: check config file before turning on console logging
conlog = logging.StreamHandler()

# File output (production): INFO and above; the file is truncated on start.
filelog = logging.FileHandler(logfilename, "w")

# Handlers are attached console first, then file.
for _handler, _level in ((conlog, logging.DEBUG), (filelog, logging.INFO)):
    _handler.setLevel(_level)
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)
# Regexes we will need. They are raw strings so the backslash escapes reach
# the regex engine untouched instead of being treated as (invalid) string
# escapes.
LEADING_DIGIT_CLEANER = re.compile(r"^\(?\d+\.?\)?\s*")
OTHER_LEADING_DIGIT_CLEANER = re.compile(r"^\d+\s*\-?\s*")
BRACKET_CLEANER = re.compile(r"\s*[\[|(].*[\]|)]\s*")
ISSUE_RANGE_CLEANER = re.compile(r"\s*\d+\s*-\s*\d+\s*")
VOLUME_CLEANER = re.compile(r"\s*v\s{0,1}\d+\s*", re.IGNORECASE)
LONELY_APOSTROPHE_CLEANER = re.compile(r"\s+'\W*\s*")
HIGH_ASCII_CLEANER = re.compile(r"[^\x00-\x7f]")
ANNUALS_CLEANER = re.compile(r"[\s|-]+annuals.*", re.IGNORECASE)
# Matches names that END in .jpg / .jpeg (any case). The dot is escaped and
# the pattern anchored so that names merely containing "jpg" do not match.
IMAGE_FILE_EXTENSION_RE = re.compile(r"\.jpe?g$", re.IGNORECASE)
FILENAME_SPACE_CLEANER = re.compile(r"\s+|\s+-\s+")

# Directory holding this script, and the scratch directory created inside it
# for files extracted from archives.
ROOT = os.path.dirname(os.path.realpath(__file__))
STORAGE_PATH = os.path.join(ROOT, "temporary_storage")

# Caching: maps "<title key>-<file key>" to the list of pages of an issue
CURRENT_ISSUE = {}
# global setup stuff
# Load the HTML page template once at import time. The path is relative to
# the current working directory. The "with" block guarantees the handle is
# closed even if reading fails part-way through.
try:
    with open("template.html", "r") as f:
        template = f.read()
except IOError:
    logger.critical("Could not find template.html in this directory")
    sys.exit(1)
def setup():
    """
    Prepare the temporary directory that archives are extracted into.

    If STORAGE_PATH already exists (a previous run could not delete it), its
    leftover contents are removed on a best-effort basis so the disk does not
    fill up with stale pages. Otherwise the directory is created. Failure to
    create it is logged as critical; no exception is raised.
    """
    if os.path.exists(STORAGE_PATH):
        for name in os.listdir(STORAGE_PATH):
            path = os.path.join(STORAGE_PATH, name)
            if os.path.isfile(path):
                try:
                    os.unlink(path)
                except OSError:
                    # Best effort only: a file that is still in use stays.
                    pass
            elif os.path.isdir(path):
                # Issues are extracted into one sub-folder each, so stale
                # sub-folders have to go as well.
                rmtree(path, ignore_errors=True)
    else:
        try:
            os.makedirs(STORAGE_PATH)
        except OSError:
            # Both values must be passed as one tuple to the % operator.
            logger.critical("I don't have enough permission to create %s in %s" %
                            (STORAGE_PATH, ROOT))
def clean_up():
    """
    Delete the temporary storage directory and everything inside it.

    An OSError during removal is not fatal: it is only reported at INFO
    level.
    """
    try:
        rmtree(STORAGE_PATH)
    except OSError:
        complaint = "I wasn't able to delete %s. Are you still viewing something?"
        logger.info(complaint % STORAGE_PATH)
# Have the Twisted reactor call clean_up() just before it shuts down.
reactor.addSystemEventTrigger("before", "shutdown", clean_up)
# Prepare the temporary storage directory as soon as the module is loaded.
setup()
class ComicServer(resource.Resource):
    """
    Root web resource: indexes the comic archives found below a directory.

    Attributes:
    titles -- dict mapping a title slug to a dict with the keys
              "count" (number of issues), "files" (issue slug -> full path)
              and "full title" (cleaned-up folder name)
    directory -- the scanned root directory, with forward slashes
    ignored_folder_names -- names of folders that contained no archives
    """

    # Archive extensions that get indexed; compared case-insensitively so the
    # result does not depend on the case sensitivity of the file system.
    COMIC_EXTENSIONS = (".cbr", ".cbz")

    def __init__(self, directory):
        """
        Walk `directory` and record every .cbr/.cbz file in self.titles.

        Logs a critical message and exits the process when the directory
        does not exist.
        """
        # old-skool call to parent
        resource.Resource.__init__(self)
        self.titles = {}
        self.directory = self._normalize_directory_path(directory)
        if not os.path.exists(self.directory):
            logger.critical("%s is not a valid path for the root directory" % self.directory)
            sys.exit(1)
        # ASSUMPTION: Empty folders (parents that only contain other folders or
        # non-matching files) should never be used as a key in TITLES
        self.ignored_folder_names = []
        total = 0
        # when you find a cbr or cbz, put folder name into titles
        for root, dirnames, filenames in os.walk(self.directory):
            matches = sorted(
                name for name in filenames
                if name.lower().endswith(self.COMIC_EXTENSIONS)
            )
            if not matches:
                self.ignored_folder_names.append(os.path.split(root)[-1])
            for name in matches:
                self._add_match_to_collection(name, root)
                total += 1
        logger.info("Found %d comics" % total)

    def getChild(self, url, request):
        """
        Hand every child URL to a CBRResource, which does its own routing.
        """
        return CBRResource(url, request, self)

    def _add_match_to_collection(self, filename, root):
        """
        File the archive `filename`, found in folder `root`, under a title.

        The title comes from the path of `root` relative to self.directory.
        os.path.split() cuts that path into (parent part, folder name); the
        last of the two that is not an ignored folder name provides the
        title, which in practice is the folder holding the file. The entry
        is created on first use. Files whose slug is already present are
        ignored, and "count" always equals the number of files stored.

        TODO: sub-folders are not merged into their parent's title, so e.g.
        Comics/Indies/Nexus/Vol1 ends up under its own title, not 'Nexus'.
        """
        # Strip the root directory as a prefix only, not every occurrence.
        if root.startswith(self.directory):
            relative = root[len(self.directory):]
        else:
            relative = root
        title = None
        key = None
        for folder in os.path.split(relative):
            if folder in self.ignored_folder_names:
                continue
            title = self._prep_title(folder)
            key = self._slugify(title)
        if key is None:
            # Every candidate shares its name with an ignored folder; fall
            # back to the folder that actually contains the file.
            title = self._prep_title(os.path.split(root)[-1])
            key = self._slugify(title)
        entry = self.titles.get(key)
        if entry is None:
            entry = {"count": 0, "files": {}, "full title": title}
            self.titles[key] = entry
        # ignore duplicate files
        file_list = entry["files"]
        file_key = self._slugify(filename)
        if file_key not in file_list:
            file_list[file_key] = os.path.join(root, filename)
        # Derive the count from the stored files so the two cannot drift.
        entry["count"] = len(file_list)

    def _prep_title(self, folder_name):
        """
        Turn a folder name into a presentable title.

        * Underscores become spaces and # signs are dropped
        * Leading digits for sorted collections go (e.g., '1. A New Hope');
          this cannot tell those apart from titles like '2000 A.D.'
        * Anything inside brackets or parens is stripped
        * Number ranges go (e.g., 1 - 10), regardless of space inside
        * Volume indicators go (e.g., v2)
        * Apostrophes orphaned by the steps above go (e.g., '93-'96)
        * Non-ASCII characters go (e.g., copyright symbol)
        * "Annuals" and whatever follows it is stripped
        * If nothing is left at the end, the original folder name is returned
        """
        # Chain both replacements so the underscore cleanup is not lost.
        title = folder_name.replace("_", " ").replace("#", "")
        cleaners = (
            LEADING_DIGIT_CLEANER,
            OTHER_LEADING_DIGIT_CLEANER,
            BRACKET_CLEANER,
            ISSUE_RANGE_CLEANER,
            VOLUME_CLEANER,
            LONELY_APOSTROPHE_CLEANER,
            HIGH_ASCII_CLEANER,
            ANNUALS_CLEANER,
        )
        for cleaner in cleaners:
            title = cleaner.sub("", title)
        if not title:
            return folder_name
        return title

    def _normalize_directory_path(self, directory):
        """
        Replace Windows backslash separators with forward slashes.
        """
        return directory.replace('\\', '/')

    def _slugify(self, value):
        """
        Build a URL-safe key: drop everything except word characters,
        whitespace and hyphens, lower-case the rest, and collapse runs of
        whitespace/hyphens into a single hyphen.
        """
        value = unicode(re.sub(r'[^\w\s-]', '', value).strip().lower())
        return re.sub(r'[-\s]+', '-', value)
class CBRResource(resource.Resource):
    """
    Leaf resource that answers every request below the ComicServer root.

    URLs understood by get_matching_response:
    /                                -> list of all titles
    /<title>/                        -> list of the issues of one title
    /issue/<title>/<issue>/          -> list of the pages inside one issue
    /page/<title>/<issue>/<number>   -> one page image (numbers start at 1)

    Anything else falls back to the list of titles.
    """
    isLeaf = True

    def __init__(self, url, request, parent):
        """
        Keep the requested url, the request and the owning ComicServer
        (`parent`), whose titles and directory are read when rendering.
        """
        resource.Resource.__init__(self)
        self.url = url
        self.request = request
        self.parent = parent

    def render_GET(self, request):
        """
        Render an HTML page from the template, or stream a page image.

        Returns a string body, or server.NOT_DONE_YET while a file is being
        streamed. Unknown resources produce a 404 page.
        """
        request.setHeader("content-type", "text/html")
        response = self.get_matching_response(request.path)
        if not response:
            # A render method has to return a string or NOT_DONE_YET; None
            # would be reported by Twisted as a server error.
            return NoResource().render(request)
        if "static" in response:
            return self._send_static_file(response["static"], request)
        return template % {
            "title": str(response["title"]),
            "body": str(response["body"])
        }

    def _send_static_file(self, file_path, request):
        """
        Stream the file at `file_path` to the client; 404 if it can't be read.
        """
        try:
            fp = open(file_path, "rb")
        except IOError:
            logger.warning("Unable to open a file: %s" % file_path)
            return NoResource().render(request)
        contentType, junk = mimetypes.guess_type(file_path)
        request.setHeader("Content-Type",
                          contentType if contentType else "text/plain")
        d = FileSender().beginFileTransfer(fp, request)

        def cbFinished(ignored):
            fp.close()
            request.finish()

        # Log a failed transfer first, then always close and finish.
        d.addErrback(err).addCallback(cbFinished)
        return server.NOT_DONE_YET

    def get_matching_response(self, path):
        """
        Route a URL path to one of the request_* methods.

        Returns a dict with "title" and "body" (an HTML page), a dict with
        "static" (path of a file to stream), or None when there is nothing
        to serve.
        """
        request_info = [part for part in path.split("/") if part]
        if request_info:
            top_folder = request_info[0]
            if top_folder == "favicon.ico":
                return None
            if top_folder == "issue" and len(request_info) == 3:
                return self.request_issue(*request_info[1:])
            if top_folder in self.parent.titles:
                return self.request_title_list(top_folder)
            if top_folder == "page" and len(request_info) == 4:
                return self.request_page(*request_info[1:])
        return self.request_root()

    def request_root(self):
        """
        List every title, sorted by key, with its issue count.
        """
        titles = self.parent.titles
        items = []
        for key in sorted(titles):
            entry = titles[key]
            items.append('<li><a href="/%s/">%s</a>: %d issues</li>' % (
                key, entry["full title"], entry["count"]))
        body = "Serving contents of %s<ul>" % self.parent.directory
        body += "".join(items) + "</ul>"
        return {
            "body": body,
            "title": "Comix Server"
        }

    def request_title_list(self, title_key):
        """
        List the issues filed under `title_key`, sorted by issue key.
        """
        entry = self.parent.titles[title_key]
        title = entry["full title"]
        files = entry["files"]
        items = []
        for key in sorted(files):
            items.append('<li><a href="/issue/%s/%s/">%s</a></li>' % (
                title_key, key, os.path.basename(files[key])))
        content = "<h1>%s</h1><ul>" % (title)
        content += "".join(items) + "</ul>"
        return {
            "body": content,
            "title": title
        }

    def request_issue(self, title_key, file_key):
        """
        List the pages of one issue, or say that it could not be opened.
        """
        file_contents = self._open_issue(title_key, file_key)
        if not file_contents:
            return {
                "body": "Unable to open %s" % file_key,
                "title": title_key
            }
        items = []
        for position, page in enumerate(file_contents):
            items.append('<li><a href="/page/%s/%s/%d">%s</a></li>' % (
                title_key, file_key, (position + 1), os.path.basename(page)))
        content = "<h1>Files in %s</h1><ul>" % (title_key)
        content += "".join(items) + "</ul>"
        return {
            "body": content,
            "title": title_key
        }

    def request_page(self, title_key, file_key, position):
        """
        Get a page inside a given issue.

        `position` is the 1-based page number taken from the URL. Returns
        {"static": path} or None when the issue cannot be opened or the
        number is not a valid page of it.
        """
        file_contents = self._open_issue(title_key, file_key)
        if not file_contents:
            return None
        try:
            index = int(position) - 1
        except (TypeError, ValueError):
            # int() raises ValueError for text such as "abc"
            return None
        # Reject 0 and negatives too; they would wrap around to the end.
        if not 0 <= index < len(file_contents):
            return None
        return {"static": os.path.join(STORAGE_PATH, file_contents[index])}

    def _open_issue(self, title_key, file_key):
        """
        Given the book title and the specific issue, get the contents
        in the zip/ rar file. Returns a list of pages or None.

        Results are kept in the module-level CURRENT_ISSUE dict.
        TODO: limit that cache to an issue or two.
        """
        cache_key = "%s-%s" % (title_key, file_key)
        contents = CURRENT_ISSUE.get(cache_key, None)
        if contents:
            return contents
        if title_key not in self.parent.titles:
            return None
        entry = self.parent.titles[title_key]
        issue = entry["files"].get(file_key, None)
        if not issue:
            return None
        contents = self._open_issue_file(issue)
        if not contents:
            return None
        CURRENT_ISSUE[cache_key] = contents
        return contents

    def _open_issue_file(self, path):
        """
        Open issue file based on extension
        .cbr = RAR file
        .cbz = ZIP file
        See full file description at http://en.wikipedia.org/wiki/Comic_Book_Archive_file

        Returns the list of pages, or None when the file is missing, of an
        unsupported type, or cannot be read.

        TODO: Handle additional types
        .cb7 = 7z
        .cbt = TAR
        .cba = ACE
        """
        if not os.path.exists(path):
            return None
        # splitext removes only the real extension, so names with several
        # dots ("Title v1.5.cbz") keep distinct extraction folders.
        folder_name, extension = os.path.splitext(os.path.basename(path))
        extension = extension.lower()
        if extension == ".cbz":
            folder_path = os.path.join(STORAGE_PATH, folder_name)
            if not os.path.exists(folder_path):
                try:
                    os.makedirs(folder_path)
                except OSError:
                    logger.warning("Unable to create %s" % folder_path)
                    return None
            return self._extract_zip(path, folder_path)
        if extension == ".cbr":
            return self._list_rar(path)
        return None

    def _extract_zip(self, path, folder_path):
        """
        Extract the image files of the ZIP at `path` into `folder_path`.

        Returns the paths of the extracted files, or None on failure.
        """
        try:
            archive = zipfile.ZipFile(path)
        except (zipfile.BadZipfile, IOError):
            logger.warning("Could not extract contents of %s" % path)
            return None
        paths = []
        try:
            for name in self._filter_filenames(archive.namelist()):
                # ZIP entry names use "/" on every platform. Keeping only
                # the last component also keeps writes inside folder_path.
                base_name = name.replace("\\", "/").split("/")[-1]
                save_path = os.path.join(folder_path, base_name)
                try:
                    data = archive.read(name)
                    # Binary mode: images must not get newline translation.
                    with open(save_path, "wb") as save:
                        save.write(data)
                except (IOError, zipfile.BadZipfile):
                    logger.warning("Unable to open a file: %s" % save_path)
                    logger.warning("Original path: %s" % name)
                    return None
                paths.append(save_path)
        finally:
            archive.close()
        return paths

    def _list_rar(self, path):
        """
        Return the names of the image files inside the RAR at `path`, or
        None when it cannot be read.

        NOTE(review): the pages are only listed, never extracted into
        STORAGE_PATH, so request_page has no file to serve for .cbr issues.
        Extraction needs whatever API the rar module offers - TODO confirm.
        """
        try:
            archive_file = open(path, "rb")
        except IOError:
            logger.warning("Could not extract contents of %s" % path)
            return None
        try:
            rar = RarFile(archive_file)
            return self._filter_filenames(rar.namelist())
        except BadRarFile:
            logger.warning("Could not extract contents of %s" % path)
            return None
        finally:
            archive_file.close()

    def _filter_filenames(self, name_list):
        """
        Keep only the names that IMAGE_FILE_EXTENSION_RE recognizes.
        """
        return [f for f in name_list if IMAGE_FILE_EXTENSION_RE.search(f)]
# run as script
if __name__ == '__main__':
    config = ConfigParser.ConfigParser()
    try:
        # read() silently skips a missing file; the get() calls below are
        # what fail in that case (NoSectionError), so the base class
        # ConfigParser.Error is caught rather than ParsingError alone.
        config.read("comix.conf")
        port = int(config.get("basics", "port"))
        directory = config.get("basics", "directory")
    except ConfigParser.Error:
        logger.critical("Sorry, I couldn't find a comix.conf file in this directory.\n"
                        "It should contain a [basics] section with port and directory info")
        sys.exit(1)
    except ValueError:
        logger.critical("The value for port in comix.conf must be a number")
        sys.exit(1)
    # Kept outside the try block above so that a ValueError raised while the
    # server is running is not misreported as a bad port setting.
    try:
        reactor.listenTCP(port, server.Site(ComicServer(directory)))
    except twistedErrors.CannotListenError:
        logger.critical("Could not listen on port %d. Is something else running there?" % port)
        sys.exit(1)
    logger.info("Listening on %d" % port)
    reactor.run()