#!/usr/bin/env python
"""Print SHA1 piece hashes for the files named on standard input.

Usage: <script> CACHE_FILE < list-of-filenames

Each line of standard input names one file.  Files larger than
``piecesize`` bytes are split into pieces; the SHA1 of the whole file and
of every piece is printed as a "Filename:/SHA1:/SHA1-Pieces:" stanza.
Results (including the "no pieces" result for small files) are stored in
the bsddb btree database CACHE_FILE under the key ``<filename>:pc`` so
that later runs do not have to re-read the files.
"""

import binascii
import bsddb
import os
import sha
import sys
from gzip import GzipFile
from StringIO import StringIO


def hash(file, piecesize):
    """Hash an open file both as a whole and piece by piece.

    file      -- object with a read(n) method returning the file's data
    piecesize -- number of bytes to read for each piece

    Returns a tuple (full_hexdigest, pieces) where pieces is a list of
    (piece_hexdigest, piece_length) tuples in file order.
    """
    piece_hashes = []
    full = sha.new()
    while True:
        data = file.read(piecesize)
        if not data:
            # An empty read means end of file.
            break
        piece_hashes.append((sha.new(data).hexdigest(), len(data)))
        full.update(data)
    return (full.hexdigest(), piece_hashes)


# Largest piece size considered, and the granularity of candidate sizes.
piecesize = 512*1024
chunksize = 16*1024


def optimalpiecesize(size):
    """Choose a piece size for a file of ``size`` bytes.

    Candidates are the multiples of ``chunksize`` from half of
    ``piecesize`` up to ``piecesize``.  With l = size // piecesize, a
    candidate p is kept only if what is left after l pieces of p bytes
    is no larger than p.  Among those, the candidate whose leftover is
    closest in size to p is returned (ties go to the smaller candidate,
    because (difference, size) tuples are compared).
    """
    def candidates(s, c, m):
        steps = m // c
        whole = s // m
        return [i * c for i in range(steps // 2, steps + 1)
                if s - i * c * whole <= i * c]

    def score(s, c, m):
        whole = s // m
        return [(abs(p - (s - whole * p)), p) for p in candidates(s, c, m)]

    def bestest(s, c, m):
        return min(score(s, c, m))

    return bestest(size, chunksize, piecesize)[1]


def str2hash(s):
    """Decode a cache value produced by hash2str().

    The value is the 20-byte digest of the whole file followed by
    24-byte records: a 4-byte length (most significant byte first) and
    the 20-byte digest of that piece.

    Returns (full_hexdigest, [(piece_hexdigest, piece_length), ...]), or
    (None, []) for the empty string.
    """
    if s == "":
        return None, []
    fh = binascii.b2a_hex(s[:20])
    r = []
    # Walk the records by offset instead of repeatedly re-slicing the tail.
    for off in range(20, len(s), 24):
        l = s[off:off + 4]
        h = s[off + 4:off + 24]
        r.append((binascii.b2a_hex(h), int(binascii.b2a_hex(l), 16)))
    return fh, r


def hash2str(fh, hs):
    """Encode hash() output as a compact binary string for the cache.

    fh -- hex digest of the whole file
    hs -- list of (piece_hexdigest, piece_length) tuples

    Each length is written as exactly 8 hex digits (4 bytes), which is
    the layout str2hash() expects.
    """
    parts = [binascii.a2b_hex(fh)]
    for (h, l) in hs:
        parts.append(binascii.a2b_hex("%08x" % l))
        parts.append(binascii.a2b_hex(h))
    return "".join(parts)


cache_file = sys.argv[1]
pieces = {}  # not used by the code below; kept as a module-level name

# NOTE(review): flag "w" opens the database read-write; per the bsddb
# documentation it does not create a missing file -- confirm that the
# cache is expected to exist before this script runs.
cache = bsddb.btopen(cache_file, "w")

try:
    for filename in sys.stdin:
        # NOTE(review): rstrip() removes all trailing whitespace, not only
        # the newline, so names ending in spaces cannot be processed.
        filename = filename.rstrip()
        if not filename:
            # Skip blank lines instead of failing in os.stat("").
            continue
        fnkey = filename + ":pc"
        if cache.has_key(fnkey):
            sha1, result = str2hash(cache[fnkey])
        else:
            size = os.stat(filename).st_size
            if size <= piecesize:
                # Small files get no piece list; cache that fact as "".
                values = ""
                result = []
            else:
                ps = optimalpiecesize(size)
                # Binary mode: the digests must cover the exact bytes on disk.
                fileobj = open(filename, "rb")
                try:
                    sha1, result = hash(fileobj, ps)
                finally:
                    fileobj.close()
                values = hash2str(sha1, result)
            cache[fnkey] = values
        if result:
            # Each print has a single parenthesised expression, so the
            # output is the same under the Python 2 print statement.
            print("Filename: %s" % (filename))
            print("SHA1: %s" % (sha1))
            print("SHA1-Pieces:")
            for x in result:
                print(" %s %d" % x)
            print("")
    cache.sync()
finally:
    # Always release the database, even if a file in the list failed.
    cache.close()