forked from joswr1ght/md5deep
-
Notifications
You must be signed in to change notification settings - Fork 0
/
md5deep.py
executable file
·215 lines (189 loc) · 7.75 KB
/
md5deep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env python
# MIT License, (c) Joshua Wright [email protected]
# https://github.com/joswr1ght/md5deep
import os, sys, hashlib, re, multiprocessing
from Queue import Queue
from threading import Thread
import time, datetime, errno
# To stop the queue from consuming all the RAM available
MaxQueue = 1000
# Reproduce this output with slashes consistent for Windows systems
#ba2812a436909554688154be461d976c A\SEC575-Clown-Chat\nvram
# file regex
md5FileRegex = re.compile(r'^(?P<hash>[a-f0-9]{32}) (?P<path>(/)?([^/\0]+(/)?)+)\n$')
file_queue = Queue(MaxQueue)
# Optimized for low-memory systems, read whole file with blocksize=0
def md5sum(filename, blocksize=65536):
hash = hashlib.md5()
try:
with open(filename, "rb") as f:
for block in iter(lambda: f.read(blocksize), ""):
hash.update(block)
return hash.hexdigest().strip()
except IOError as e:
if e.errno == errno.EACCES:
sys.stderr.write("Permission denied: %s\n"%(filename))
return "00000000000000000000000000000000"
pass
def mod_datetime(filename):
t = os.path.getmtime(filename)
return datetime.datetime.fromtimestamp(t)
def usage():
print "Usage: md5deep.py [OPTIONS] [FILES]"
print "-r - recursive mode, all subdirectories are traversed."
print "-n - During any of the matching modes (-m,-M,-x,or -X), displays only the filenames of any known hashes that were not matched by any of the input files."
print "-M <file> - enables matching mode."
print "-m <file> - as above."
print "-X <file> - enables negative matching mode."
print "-x <file> - as above."
print "-n - used with -MmXx so only file name outputed."
print "-f - speed up hash calculations, using more memory."
print "-0 - Uses a NULL character (/0) to terminate each line instead of a newline. Useful for processing filenames with strange characters."
print "-jnn - Controls multi-threading. By default the program will create one producer thread to scan the file system and one hashing thread per CPU core. Multi-threading causes output filenames to be in non-deterministic order, as files that take longer to hash will be delayed while they are hashed. If a deterministic order is required, specify -j0 to disable multi-threading."
print "-s - includes the targets of symlinks in the cargo, by default they are ignored."
print "-t yyyymmddThhmmss - include only files modified after the timestamp provided."
def formatOutput(hash, path):
hash = hash.replace(" ","")
path = path.replace("\r","")
path = path.replace("\n","")
if opt_nameonly:
sys.stdout.write("%s%s"%(path, opt_endofline))
else:
sys.stdout.write("%s %s%s"%(hash, path, opt_endofline))
def validate_hashes(hashfile, hashlist, mode):
# Open file and build a new hashlist
hashlistrec = []
with open(hashfile, "r") as f:
for line in f:
hashpair = md5FileRegex.match(line)
if hashpair:
filehash = hashpair.group('hash')
filename = hashpair.group('path')
# Convert to platform covention directory separators
filename = normfname(hashpair.group('path'))
# Add entry to hashlistrec
hashlistrec.append((filename, filehash))
if mode == "neg":
for diff in list(set(hashlist) - set(hashlistrec)):
formatOutput(diff[1], normfname(diff[0]))
elif mode == "pos":
for inter in list(set(hashlistrec) & set(hashlist)):
formatOutput(inter[1], normfname(inter[0]))
# Normalize filename based on platform
def normfname(filename):
if os.name == 'nt': # Windows
return filename.replace("/", "\\")
else:
return filename.replace("\\","/")
# Worker thread function
def calcMD5(i, q):
while True:
path = q.get()
formatOutput(md5sum(path, md5blocklen), path)
q.task_done()
if __name__ == '__main__':
opt_recursive = None
opt_negmatch = None
opt_match = None
opt_nameonly = None
opt_hashtable = None
opt_fast = None
opt_endofline = "\n"
opt_files = []
opt_threads = multiprocessing.cpu_count()
opt_symlink = False
opt_timestamp =""
if len(sys.argv) == 1:
usage()
sys.exit(0)
args = sys.argv[1:]
it = iter(args)
for i in it:
if i == '-r':
opt_recursive = True
continue
elif i == '-0':
opt_endofline = "\0"
continue
elif i == '-f':
opt_fast = True
elif i == '-X' or i == '-x':
opt_negmatch = next(it)
if not os.path.isfile(opt_negmatch):
sys.stdout.write("Cannot open negative match file %s\n"%opt_negmatch)
sys.exit(-1)
continue
elif i == '-M' or i == '-m':
opt_match = next(it)
if not os.path.isfile(opt_match):
sys.stdout.write("Cannot open match file %s\n"%opt_match)
sys.exit(-1)
continue
elif i == '-n' and (opt_negmatch or opt_match):
opt_nameonly = True
continue
elif i.startswith('-j'):
opt_threads = int(i[2:])
continue
elif i == '-s':
opt_symlink = True
continue
elif i == '-t':
opt_timestamp = datetime.datetime.strptime( next(it), "%Y%m%dT%H%M%S" )
if not opt_timestamp:
sys.stdout.write("Is not valid ISO timestamp %s\n"%opt_timestampe)
sys.exit(-1)
continue
else:
opt_files.append(i)
if opt_fast:
md5blocklen=0
else:
# Default to optimize for low-memory systems
md5blocklen=65536
# If we are not doing matching then we by-pass the hashtable
# this saves RAM and allows us to process much larger filesystems
if opt_negmatch or opt_match:
opt_hashtable = True
else:
opt_hashtable = False
if opt_threads:
for i in range(opt_threads):
worker = Thread(target=calcMD5, args=(i, file_queue))
worker.setDaemon(True)
worker.start()
# Build a list of (hash,filename) for each file, regardless of specified
# options
hashlist = []
# Hash files in the current directory
for f in opt_files:
if os.path.isfile(f):
hashlist.append((f, md5sum(f, md5blocklen)))
# Walk all subdirectories
if opt_recursive:
for start in sys.argv[1:]:
for (directory, _, files) in os.walk(start):
for f in files:
path = os.path.join(directory, f)
if (not(opt_symlink and os.path.islink(path))):
if opt_hashtable:
hashlist.append((path, md5sum(path, md5blocklen)))
elif not opt_timestamp or (mod_datetime(path) > opt_timestamp and opt_timestamp):
if opt_threads:
# Add it to the queue
file_queue.put(path)
else:
# Threading disabled
formatOutput(md5sum(path, md5blocklen), path)
# With the hashlist built, compare to the negative/posative match list, or print
# the results.
if opt_threads:
file_queue.join()
elif opt_negmatch:
validate_hashes(opt_negmatch, hashlist, "neg")
elif opt_match:
validate_hashes(opt_match, hashlist, "pos")
else:
# Just print out the list with Windows-syle filenames
for hash in hashlist:
formatOutput(hash[1],normfname(hash[0]))