-
Notifications
You must be signed in to change notification settings - Fork 1
/
rar.py
348 lines (286 loc) · 14.2 KB
/
rar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A pure-Python module for identifying and examining RAR files developed without
any exposure to the original unrar code. (Just format docs from wotsit.org)
It was, however, influenced by the zipfile module in the Python standard
library as, having already decided to match the zipfile.ZipFile API as closely
as feasibly possible, I didn't see a point to doing extra work to come up with
new ways of laying out my code for no good reason.
@todo: Determine how rarfile (http://rarfile.berlios.de/) compares to this in
various target metrics. If it is superior or close enough on all fronts,
patch it as necessary and plan a migration path. Otherwise, do the following:
- Complete the parsing of the RAR metadata.
(eg. Get data from archive header, check CRCs, read cleartext comments, etc.)
- Optimize further and write a test suite.
- Double-check that ZipFile/ZipInfo API compatibility has been maintained
wherever feasible.
- Support extraction of files stored with no compression.
- Look into supporting split and password-protected RARs.
- Some password-protected RAR files use blocks with types 0x30, 0x60, and 0xAD
according to this code. Figure out whether it's a bug or whether they're really
completely new kinds of blocks. (Encrypted headers for filename-hiding?)
- When the appropriate code is available, use the following message for failure
to extract compressed files::
For reasons of patent, performance, and a general lack of motivation on the
author's part, this module does not extract compressed files.
"""
__appname__ = 'rar.py'
__author__ = 'Stephan Sokolow (deitarion/SSokolow)'
__version__ = '0.2.99.0'
__license__ = 'PSF License 2.4 or higher (The Python License)'

#{ Settings for findRarHeader()
CHUNK_SIZE = 4 * 1024  #: Number of bytes requested per read while scanning
MARKER_BLOCK = "\x52\x61\x72\x21\x1a\x07\x00"  #: "Rar!" followed by 1A 07 00
FIND_LIMIT = 1 << 20  #: 1MiB
# A compromise. Override FIND_LIMIT with 0 to be sure but potentially very slow.

#{ Packing method values
# Six consecutive codes, 0x30 through 0x35; 0x30 means "no compression".
(RAR_STORED, RAR_FASTEST, RAR_FAST,
 RAR_NORMAL, RAR_GOOD, RAR_BEST) = range(0x30, 0x36)
#}
import math, struct, sys, time, zlib
# Pre-compiled layouts for the fixed-size parts of RAR block headers.
# All of them are little-endian with no alignment padding ("<").

# Fields shared by every block header.
_struct_blockHeader = struct.Struct(
    "<"
    "H"  # head_crc
    "B"  # head_type
    "H"  # head_flags
    "H"  # head_size
)

# Optional ADD_SIZE field, read only when head_flags has bit 0x8000 set.
_struct_addSize = struct.Struct(
    "<"
    "L"  # add_size
)

# Fixed part of a file header that follows ADD_SIZE.
# FILE_NAME and everything after it must be read separately.
_struct_fileHead_add1 = struct.Struct(
    "<"
    "L"  # unp_size
    "B"  # host_os
    "L"  # file_crc
    "L"  # ftime
    "B"  # unp_ver
    "B"  # method
    "H"  # name_size
    "L"  # attr
)
class BadRarFile(Exception):
    """Error signalling that a given file contains no valid RAR header."""
    pass
class RarInfo(object):
    """The metadata for a file stored in a RAR archive.

    @attention: API compatibility with ZipInfo could not be maintained in the
        following fields:
         - C{create_version} (Not stored in RAR files)
         - C{flag_bits} (Zip and RAR use different file header flags)
         - C{volume} (Zip files specify volume number. RAR files just have
           "File is continued from previous" and "File continues in next"
           flags and an archive-level "is volume" flag)
         - C{comment} (RAR files may have multiple comments per file and they
           may be stored using compression... which rar.py doesn't support)

    @note: L{date_time} is a C{time.struct_time}; its first six items are
        C{(year, month, day, hour, minute, second)}, which is the layout
        C{ZipInfo.date_time} uses.

    @todo: Is the file's CRC of the compressed or uncompressed data?
    @todo: Does RAR perform any kind of path separator normalization?
    """

    os_map = ['MS DOS', 'OS/2', 'Win32', 'Unix']  #: Interpretations for possible L{create_system} values.

    compress_size = None    #: File's compressed size
    compress_type = None    #: Packing method (C{0x30} indicates no compression)
    create_system = None    #: Type of system on which the file originated (See L{os_map})
    date_time = None        #: File's timestamp, decoded from the packed MS-DOS value
    external_attr = None    #: File's attributes
    extract_version = None  #: Minimum RAR version needed to extract (major * 10 + minor)
    filename = None         #: Filename relative to the archive root
    file_size = None        #: File's uncompressed size
    flag_bits = 0           #: Raw flag bits from the RAR header
    header_offset = None    #: Offset of the compressed data within the file
    is_directory = False    #: The entry describes a folder/directory
    is_encrypted = False    #: The file has been encrypted with a password
    is_solid = False        #: Information from previous files has been used
    not_first_piece = False #: File is continued from previous volume
    not_last_piece = False  #: File continues in next volume
    CRC = None              #: File's CRC
    _raw_time = None        #: Raw integer time value extracted from the header

    #TODO: comment, extra, reserved, internal_attr

    def __init__(self, filename, ftime=0):
        """
        @param filename: The file's name and path relative to the archive root.
        @param ftime: The raw 32-bit timestamp from the file header, in packed
            MS-DOS date/time format. It is kept verbatim in C{_raw_time} and
            decoded into L{date_time}.

        @note: Since I know of no filesystem which allows null bytes in paths,
            this borrows a trick from C{ZipInfo} and truncates L{filename} at
            the first null byte to protect against certain kinds of virus
            tricks.

        @todo: Implement support for taking ints OR tuples for L{ftime}.
        """
        # Everything from the first NUL onwards is dropped.
        filename = filename.split(chr(0), 1)[0]

        self.filename = filename
        self.orig_filename = filename  # Match ZipInfo for better compatibility

        self._raw_time = ftime
        self.date_time = self._decode_dos_time(ftime)

    @staticmethod
    def _decode_dos_time(raw):
        """Convert a packed MS-DOS date/time into a C{time.struct_time}.

        Bit layout (most significant first): 7 bits year-since-1980, 4 bits
        month, 5 bits day, 5 bits hour, 6 bits minute, 5 bits second/2.

        @param raw: The 32-bit packed value.
        @return: A C{time.struct_time}. When the packed fields do not form a
            real calendar date, the fields are returned as stored, with
            placeholder weekday/day-of-year values, instead of raising.
        """
        import datetime  # Local import: only this helper needs it.

        year = 1980 + ((raw >> 25) & 0x7f)
        # A zero month/day is not a valid date; clamp to 1 so that the default
        # ftime of 0 decodes to 1980-01-01 00:00:00 (ZipInfo's default).
        month = ((raw >> 21) & 0x0f) or 1
        day = ((raw >> 16) & 0x1f) or 1
        hour = (raw >> 11) & 0x1f
        minute = (raw >> 5) & 0x3f
        second = (raw & 0x1f) * 2  # Stored with two-second resolution

        try:
            moment = datetime.datetime(year, month, day, hour, minute, second)
        except ValueError:
            # Garbage fields (eg. month 13 or hour 25): expose them as-is.
            return time.struct_time(
                (year, month, day, hour, minute, second, 0, 1, -1))
        return moment.timetuple()
class RarFile(object):
    """A simple parser for RAR archives capable of retrieving content metadata
    and, possibly in the future, of extracting entries stored without
    compression.

    @note: Whenever feasible, this class replicates the API of
        C{zipfile.ZipFile}. As a side-effect, design decisions the author
        has no strong feelings about (eg. naming of private methods)
        will generally closely follow those made in C{zipfile.ZipFile}.
    """

    _block_types = {
        0x72: 'Marker Block ( MARK_HEAD )',
        0x73: 'Archive Heaver ( MAIN_HEAD )',
        0x74: 'File Header',
        0x75: 'Comment Header',
        0x76: 'Extra Info',
        0x77: 'Subblock',
        0x78: 'Recovery Record',
        0x7b: 'Terminator?'
    }  #: Raw HEAD_TYPE values used in block headers.

    # According to the comment in zipfile.ZipFile, __del__ needs fp here.
    fp = None           #: The file handle used to read the metadata.
    _filePassed = None  #: Whether an already-open file handle was passed in.

    # I just put all public members here as a matter of course.
    filelist = None  #: A C{list} of L{RarInfo} objects corresponding to the contents.
    debug = 0        #: Debugging verbosity. Effective range is currently 0 to 1.

    def __init__(self, handle):
        """Open an archive and read the metadata of every entry in it.

        @param handle: Either a path to open, or an already-open, seekable
            file-like object. A passed-in file object is never closed by
            this class.
        @raise BadRarFile: If no RAR marker block is found, or if the archive
            header fails its CRC check.
        """
        # If we've been given a path, get our desired file-like object.
        if isinstance(handle, basestring):
            self._filePassed = False
            self.filename = handle
            self.fp = open(handle, 'rb')
        else:
            self._filePassed = True
            self.fp = handle
            self.filename = getattr(handle, 'name', None)

        # Find the header, skipping the SFX module if present.
        start_offset = findRarHeader(self.fp)
        if start_offset is None:
            if not self._filePassed:
                self.fp.close()
            self.fp = None
            raise BadRarFile("Not a valid RAR file")
        self.fp.seek(start_offset)

        self.filelist = []

        # Actually read the file metadata.
        self._getContents()

    def __del__(self):
        """Close the file handle if we opened it... just in case the underlying
        Python implementation doesn't do refcount closing."""
        if self.fp and not self._filePassed:
            self.fp.close()

    def _getContents(self):
        """Walk the archive block by block, filling L{filelist}.

        Content-reading code is here separated from L{__init__} so that, if
        the author so chooses, writing of uncompressed RAR files may be
        implemented in a later version more easily.

        Reading stops silently at the end of the file, or at a block which
        claims a total size of zero (following it would never advance).

        @raise BadRarFile: If the archive header fails its CRC check.
        """
        while True:
            offset = self.fp.tell()

            # Read the fields present in every type of block header, plus the
            # optional field ADD_SIZE if present.
            try:
                head_crc, head_type, head_flags, head_size = \
                    self._read_struct(_struct_blockHeader)
                if head_flags & 0x8000:
                    add_size = self._read_struct(_struct_addSize)[0]
                else:
                    add_size = 0
            except struct.error:
                # If it fails here, we've reached the end of the file.
                return

            # TODO: Rework handling of archive headers.
            if head_type == 0x73:
                #TODO: Try to factor this out to reduce time spent in syscalls.
                self.fp.seek(offset + 2)  # Seek to just after HEAD_CRC
                #FIXME: Check header CRC on all blocks.
                # A real exception rather than an assert, so that the check
                # still happens when Python runs with -O.
                if not self._check_crc(self.fp.read(11), head_crc):
                    raise BadRarFile("Archive header failed its CRC check")
            # TODO: Rework handling of file headers.
            elif head_type == 0x74:
                fileinfo, add_size = self._read_file_header(
                    offset, head_flags, add_size)
                self.filelist.append(fileinfo)
            elif self.debug > 0:
                sys.stderr.write("Unhandled block: %s\n" % self._block_types.get(head_type, 'Unknown (0x%x)' % head_type))

            # Line up for the next block
            #TODO: Try to factor this out to reduce time spent in syscalls.
            block_size = head_size + add_size
            if block_size <= 0:
                # Corrupt header: seeking would land on this same block again.
                return
            self.fp.seek(offset + block_size)

    def _read_file_header(self, offset, head_flags, pack_size):
        """Parse the body of a file header block into a L{RarInfo}.

        The file position must be just after the block's ADD_SIZE field.

        @param offset: Offset of the start of the block within the file.
        @param head_flags: The block's HEAD_FLAGS value.
        @param pack_size: The block's ADD_SIZE value (low 32 bits of the
            compressed size).
        @return: C{(fileinfo, pack_size)} where C{pack_size} includes the high
            32 bits if the header carries them.
        """
        (unp_size, host_os, file_crc, ftime, unp_ver, method,
         name_size, attr) = self._read_struct(_struct_fileHead_add1)

        # Flag 0x100 inserts HIGH_PACK_SIZE and HIGH_UNP_SIZE ahead of the
        # name. Skipping them would corrupt both the name and the offset of
        # the next block.
        if head_flags & 0x100:
            pack_size |= self._read_struct(_struct_addSize)[0] << 32
            unp_size |= self._read_struct(_struct_addSize)[0] << 32

        # FIXME: What encoding does WinRAR use for filenames?
        fileinfo = RarInfo(self.fp.read(name_size), ftime)
        fileinfo.compress_size = pack_size
        fileinfo.header_offset = offset
        fileinfo.file_size = unp_size
        fileinfo.CRC = file_crc  #TODO: Verify the format matches that ZipInfo uses.
        fileinfo.compress_type = method

        # Note: RAR seems to have copied the encoding methods used by
        # Zip for these values.
        fileinfo.create_system = host_os
        fileinfo.extract_version = unp_ver
        fileinfo.external_attr = attr  #TODO: Verify that this is correct.

        # Handle flags
        fileinfo.flag_bits = head_flags
        fileinfo.not_first_piece = bool(head_flags & 0x01)
        fileinfo.not_last_piece = bool(head_flags & 0x02)
        fileinfo.is_encrypted = bool(head_flags & 0x04)
        #TODO: Handle comments
        fileinfo.is_solid = bool(head_flags & 0x10)
        # Bits 7,6,5 hold the dictionary size; only the value 111 marks a
        # directory, so all three bits must be set.
        fileinfo.is_directory = (head_flags & 0xe0) == 0xe0
        return fileinfo, pack_size

    def _read_struct(self, fmt):
        """Simplifies the process of extracting a struct from the open file.

        @param fmt: A pre-compiled C{struct.Struct}.
        @return: The tuple of unpacked values.
        @raise struct.error: If fewer than C{fmt.size} bytes could be read.
        """
        return fmt.unpack(self.fp.read(fmt.size))

    def _check_crc(self, data, crc):
        """Check some data against a stored CRC.

        Note: For header CRCs, RAR calculates a CRC32 and then throws out the
        high-order bytes.

        @param data: The bytes the CRC was calculated over.
        @param crc: The stored CRC, either as an C{int} or as an already
            packed big-endian byte string. An C{int} below 65536 is treated
            as a 2-byte CRC, anything larger as a 4-byte one.
        @return: C{True} if the low-order bytes of the CRC32 of C{data} match.

        @bug: This method of parsing is deprecated.
        @todo: I've only tested this out on 2-byte CRCs, not 4-byte file data CRCs.
        @todo: Isn't there some better way to do the check for CRC bitwidth?
        @bug: Figure out why I can't get a match on valid File Header CRCs.
        """
        if isinstance(crc, int):
            if crc < 65536:
                crc = struct.pack('>H', crc)
            else:
                crc = struct.pack('>L', crc)
        # zlib.crc32 may return a negative number on Python 2; mask it to an
        # unsigned 32-bit value so that it can always be packed as '>L'.
        computed = zlib.crc32(data) & 0xffffffff
        return struct.pack('>L', computed).endswith(crc)

    def infolist(self):
        """Return a list of L{RarInfo} instances for the files in the archive."""
        return self.filelist

    def namelist(self):
        """Return a list of filenames for the files in the archive."""
        return [x.filename for x in self.filelist]
def findRarHeader(handle, limit=FIND_LIMIT):
    """Searches a file-like object for a RAR header.

    The search starts at the handle's current position, and the handle is
    moved back to that position before returning.

    @param handle: A file-like object supporting C{read()}, C{tell()} and
        C{seek()}.
    @param limit: The maximum number of bytes to search, counted from the
        starting position. C{0} means "search to the end of the file".

    @returns: The in-file offset of the first byte after the header block or
        C{None} if no RAR header was found.

    @warning: The given file-like object must support C{seek()} up to the size
        of C{limit}.

    @note: C{limit} is rounded up to the nearest multiple of L{CHUNK_SIZE}.

    @todo: Audit this to ensure it can't raise an exception L{is_rarfile()}
        won't catch.
    """
    start_pos = handle.tell()
    limit = math.ceil(limit / float(CHUNK_SIZE)) * CHUNK_SIZE

    # A marker may straddle two reads, so the tail of each chunk is carried
    # into the next search. One byte less than the marker is enough: a longer
    # tail could only contain a marker that was already searched for.
    overlap = len(MARKER_BLOCK) - 1
    window = MARKER_BLOCK[:0]  # Empty string of the same type as the marker

    # Find the RAR header and line up for further reads. (Support SFX bundles)
    while True:
        temp = handle.read(CHUNK_SIZE)
        curr_pos = handle.tell()

        # If we hit the end of the file without finding a RAR marker block...
        if not temp or (limit > 0 and curr_pos - start_pos > limit):
            handle.seek(start_pos)
            return None

        window += temp
        marker_offset = window.find(MARKER_BLOCK)
        if marker_offset > -1:
            handle.seek(start_pos)
            # The window always ends at curr_pos, which anchors the offset.
            return curr_pos - len(window) + marker_offset + len(MARKER_BLOCK)

        # Obviously we haven't found the marker yet...
        # Keep only the overlap to minimize memory consumption.
        window = window[max(len(window) - overlap, 0):]
def is_rarfile(filename, limit=FIND_LIMIT):
    """Convenience wrapper for L{findRarHeader} equivalent to C{is_zipfile}.

    Returns C{True} if C{filename} is a valid RAR file based on its magic
    number, otherwise returns C{False}. A file which cannot be opened or read
    is reported as C{False} rather than raising C{IOError}.

    Optionally takes a limiting value for the maximum amount of data to sift
    through. Defaults to L{FIND_LIMIT} to set a sane bound on performance. Set
    it to 0 to perform an exhaustive search for a RAR header.

    @param filename: Path of the file to examine.
    @param limit: Passed through to L{findRarHeader}.

    @note: findRarHeader rounds this limit up to the nearest multiple of
        L{CHUNK_SIZE}.
    """
    try:
        handle = open(filename, 'rb')
    except IOError:
        return False

    try:
        return findRarHeader(handle, limit) is not None
    except IOError:
        return False
    finally:
        # Always release the descriptor; don't rely on garbage collection.
        handle.close()
if __name__ == '__main__':
    from optparse import OptionParser

    def _emit(text):
        """Write C{text} to standard output as one line."""
        sys.stdout.write(text)
        sys.stdout.write("\n")

    # The first paragraph of the module docstring doubles as --help text.
    summary = __doc__.split('\n\n')[0]
    parser = OptionParser(description=summary,
                          version="%%prog v%s" % __version__,
                          usage="%prog <path> ...")
    opts, args = parser.parse_args()

    if args:
        # Report unhandled block types on stderr while listing.
        RarFile.debug = 1

    for fpath in args:
        _emit("File: %s" % fpath)
        if not is_rarfile(fpath):
            _emit("Not a RAR file")
            continue
        for line in RarFile(fpath).namelist():
            _emit("\t%s" % line)