import os,profile,sys from stat import * from sys import argv from struct import pack from collections import deque from array import array from bisect import bisect import psyco psyco.full() # number of hash tables to use - 1 hashlen = 6 # maximum number of entries in a list in the last hash table hashmax = 6000 # read a whole file def readfile(name): fd = open(name, "rb") contents = fd.read() fd.close() return os.stat(name)[ST_SIZE], contents # write a whole file def writefile(name, contents): fd = open(name, "wb") fd.write(contents) fd.close() # sort matches def sort_by_length(left, right): return left[1][0][1] - right[1][0][1] def sort_by_start(left, right): return left[0] - right[0] #apack uses a bitstream for encoding information and some gamma/length #information and it uses a character stream for raw data and length #information, as appropriate class apack: # extend the last hash table def extend(self, hashes, k): global hashmax l = 0 z = self.z lenz = self.lenz end = hashes[k] hashes[k] = {} hashes.append({}) k2 = k+1 for f in end.items(): key = f[0] hashes[k][key] = f[1][0] for g in xrange(len(f[1])-1,-1,-1): i = f[1][g] if i+k2+1 < lenz: sub = z[i:i+k2+1] if sub in hashes[k2]: hashes[k2][sub].appendleft(i) if len(hashes[k2][sub]) > l: l = len(hashes[k2][sub]) else: hashes[k2][sub] = deque([i]) if l > hashmax: k2 = extend(hashes, k2) return k2 # build up a list of matches for each position in the source string def search(self): global hashlen, hashmax z = self.z lenz = self.lenz #matches are of the format: [match match match ... ] #each match is of the format: #[string_index previous_list] # previous_list = [[previous_index length] ...]] #where previous_index is a position less than or equal to the #string_index and length is a count of the number of characters #for which previous_index and string_index match #ie string[string_index:string_index+length] == # string[previous_index:previous_index+length] #previous_list goes from farthest away from string_index and longest #length to closest to string_index and shortest length, #[string_index 1] being the closest/shortest matches = [[0, deque([[0, 1]])]] #the hash list stores hashes for progressive longer strings such that #the index in hashes + 1 is the length of all hashed strings in that #hash table; for all indexes less than hashlen, the value for each #key is index of the last occurance of said key; for the the index of #hashlen, a list is kept of all occurances such that a string search #can be performed; if one of these lists grows larger than hashmax, #hashlen is increased and the list for the new hashlen is built hashes = [{} for i in xrange(hashlen+1)] for i in xrange(1,lenz): if not (i%500): print i sys.stdout.flush() k = 0 idx = i leng = 1 match = [i, deque([[idx, leng]])] amatch = 1 while amatch and i+k < lenz and k < hashlen: amatch = 0 sub = z[i:i+k+1] if sub in hashes[k]: idx = hashes[k][sub] leng = k+1 hashes[k][sub] = i match[1].appendleft([idx, leng]) k += 1 amatch = 1 while i+k < lenz and k < hashlen: sub = z[i:i+k+1] hashes[k][sub] = i k += 1 if i+k < lenz: sub = z[i:i+k+1] if sub in hashes[k]: ls = hashes[k][sub] leng += 1 idx = ls[0] match[1].appendleft([idx, leng]) for f in ls: if i+leng >= lenz: break if z[i+leng] == z[f+leng] and z[i+k:i+leng] == z[f+k:f+leng]: idx = f leng += 1 while i+leng < lenz and z[i+leng] == z[f+leng]: leng += 1 match[1].appendleft([idx, leng]) hashes[k][sub].appendleft(i) if len(hashes[k][sub]) > hashmax: hashlen = self.extend(hashes, k) print i, ">", hashlen else: hashes[k][sub] = deque([i]) matches.append(match) return matches #this is the compression "optimizer" #there are two main focuses that are used to optimize #1. allocate ranges starting from largest to smallest #2. try to have ranges border existing ranges or each other to avoid # one byte holes def trim(self, matches): #uncomment the following line to disable the optimizer #return matches matches.sort(sort_by_length) lenz = self.lenz #use the next two lists to keep track of what parts of the files #have been mem = [0] mem2 = [[0, lenz]] #and use this list to avoid having to sort every time we need to #insert into the main matches list lengths = [] for i in xrange(len(matches)): lengths.append(matches[i][1][0][1]) result = [] i = lenz-1 while i > -1: length = matches[i][1][0][1] #print "--", i, len(result), length #print mem #print mem2 sys.stdout.flush() if length > 1: list = [matches[i]] del matches[i] del lengths[i] i -= 1 #generate a list of all same-sized ranges while i > -1 and matches[i][1][0][1] == length: list.append(matches.pop(i)) del lengths[i] i -= 1 sidebyside = 1 while sidebyside: sidebyside = 0 bordered = 1 #add all ranges that border an existing allocated range #(or the start/end of the file) while bordered: bordered = 0 j = 0 while j < len(list): start = list[j][0] end = start+length x = bisect(mem, start)-1 memrange = mem2[x] mstart = memrange[0] mend = memrange[1] #print "..>", start, end #print "<..", mstart, mend if mstart == start and end <= mend: if mend == end: #print "border-left-and-right" mem.pop(x) mem2.pop(x) else: #print "border-left" mem2[x][0] = end mem[x] = end result.append(list[j]) list.pop(j) j -= 1 bordered = 1 elif mend == end and start >= mstart: #print "border-right" mem2[x][1] = start result.append(list[j]) list.pop(j) j -= 1 bordered = 1 j += 1 #search for two ranges that border each other list.sort(sort_by_start) j = 0 while j < len(list)-1: if list[j][0]+length == list[j+1][0]: start = list[j][0] end = start+length*2 x = bisect(mem, start)-1 memrange = mem2[x] mstart = memrange[0] mend = memrange[1] #print "...>", start, end #print "<...", mstart, mend if start >= mstart and end <= mend: #print "adjacent blocks" if start == mstart: if mend == end: mem.pop(x) mem2.pop(x) else: mem2[x][0] = end mem[x] = end elif end == mend: mem2[x][1] = start else: nend = mend nstart = end mem2[x][1] = start mem2.insert(x+1, [nstart, nend]) mem.insert(x+1, nstart) result.append(list[j]) result.append(list[j+1]) list.pop(j) list.pop(j) j -= 1 sidebyside = 1 break j += 1 # with all bordered/adjacent ranges handled, do our best to # allocate the rest or punt to a future loop j = 0 while j < len(list): start = list[j][0] end = start+length x = bisect(mem, start)-1 memrange = mem2[x] mstart = memrange[0] mend = memrange[1] #print ".>", start, end #print "<.", mstart, mend if start >= mstart and end <= mend: #print "inrange" if start == mstart: if mend == end: mem.pop(x) mem2.pop(x) else: mem2[x][0] = end mem[x] = end elif end == mend: mem2[x][1] = start else: nend = mend nstart = end mem2[x][1] = start mem2.insert(x+1, [nstart, nend]) mem.insert(x+1, nstart) result.append(list[j]) else: if start > mend or end < mstart: #this range exists completely inside an existing #allocated range, so minimize it for possible #use as filler #print "to-one" while list[j][1][0][1] != 1: list[j][1].popleft() l = 1 else: #this range could fit, if it were made to not #overlap, so find the smallest range that is #greater than or equal to the avaiable range; #this is done because smaller ranges are closer #to the start index and if a gamma is used this #means a smaller output size print "fit" l = mend - start while list[j][1][0][1] != 1 and list[j][1][1][1] >= l: list[j][1].popleft() list[j][1][0][1] = l #make it an exact fit and punt to a future loop #just in case there is a better fit somehow x = bisect(lengths, l) lengths.insert(x, l) matches.insert(x, list[j]) i += 1 j += 1 else: # we're down to only filler/one chars while i > -1: result.append(matches.pop(i)) del lengths[i] i -= 1 result.sort(sort_by_start) #result and matches should be the same length print "---", len(result) sys.stdout.flush() return result #the encoder's helper functions # write out bits in the bitstream def setbits(self, value, length): mask = 1<<(length-1) while mask: if self.ormask == 1: self.out[self.flagidx] = chr(self.flags) self.ormask = 128 self.flags = 0 self.flagidx = len(self.out) self.out.append("\0") else: self.ormask >>= 1 if mask&value: self.flags |= self.ormask mask >>= 1 # write out gamma (a variable-length bitstream) of the format: # [1]d1d1d1d1d0 # ex. 4 -> 0100 def setgamma(self, gamma): if gamma < 2: print "Error: Gamma passed %d" % gamma sys.exit(1) tmp = gamma mask = 1 length = 0 while tmp != 1: tmp >>= 1 mask <<= 1 length += 1 gamma = gamma&(mask-1) mask >>= 1 while length > 1: length -= 1 tmp = (((gamma&mask)>>length)<<1) | 1 self.setbits(tmp, 2) mask >>= 1 self.setbits((gamma&1)<<1, 2) #'0' flag for raw and the char itself def raw(self, char): self.setbits(0, 1) self.out.append(char) self.lwm = 0 #'1110000' for "\0" character def zero(self): self.setbits(64+32+16, 7) self.lwm = 0 #'1110000'+off if a duplicate character is within 15 bytes def onenear(self, off): self.setbits(64+32+16+off, 7) self.lwm = 0 #'110' flag + a char for two or three characters that are within 127 bytes #the char is the (offset)<<1 | (length == 3) def twoorthree(self, off, three): c = off<<1 if three: c |= 1 self.setbits(4+2, 3) self.out.append(chr(c)) self.recentoff = off self.lwm = 1 #if recentoffset is found to be the same, use '1000' and a gamma to #specify the length def recent(self, len): self.setbits(8, 4) self.setgamma(len) self.lwm = 1 #for larger lengths or simply further away distances, use this #encoding is '10', a gamma of the offset/256 (plus 2 or 3, to #differentiate from a recentoffset encoding), and a character #for the offset%256. The length is then encoded as a gamma, after #some "correction" for short and long distances def variable(self, shift, off, len): if off >= 32000: len -= 1 if off >= 1280: len -= 1 if off < 128: len -= 2 if len > 1: self.setbits(2, 2) self.setgamma((off>>8)+shift) self.out.append(chr(off&255)) self.setgamma(len) self.recentoff = off self.lwm = 1 return 0 else: #it's possible that the final length is small enough that #it's not possible to proper encode it as a gamma string, #so we error and try to find a better match return 1 #the encoder itself def encode(self, matches): z = self.z lenz = self.lenz self.ormask = 256 self.flags = 0 self.out = array('c', [z[0], "\0"]) self.flagidx = 1 self.recentoff = -1 #i'm not sure what lwm stands for, but it effects the encoding self.lwm = 0 index = 1 while index < lenz: start = matches[index][0] match = matches[index][1].popleft() idx = match[0] leng = match[1] off = start - idx #print "..", index, start, idx, leng, off, self.lwm, self.recentoff sys.stdout.flush() if leng > 1 and self.lwm == 0 and off == self.recentoff: self.recent(leng) elif leng > 1 and leng < 4 and off > 0 and off < 128: self.twoorthree(off, leng == 3) elif leng > 1 and self.lwm == 0: err = self.variable(3, off, leng) if err: #not strictly an error, but this is suboptimal print "err(3)", idx, start, off, leng leng = 0 if not matches[index][1]: print "Ran out of filler(3)!" sys.exit(3) elif leng > 1: err = self.variable(2, off, leng) if err: #not strictly an error, but this is suboptimal print "err(2)", idx, start, off, leng leng = 0 if not matches[index][1]: print "Ran out of filler(2)!" sys.exit(2) elif off > 0 and off < 16: self.onenear(off) elif z[index] == 0: self.zero() else: self.raw(z[index]) index += leng # terminate the output with '110' bitstream and a "\0" char self.setbits(4+2, 3) self.out.append("\0") self.setbits(0, 7) #flush flags #cut off the spurious empty flags character before converting to a #string return self.out[:len(self.out)-1].tostring() def compress(self, z): '''the actual apack compressor returns compressed string''' self.z = z self.lenz = len(z) matches = self.search() matches = self.trim(matches) str = self.encode(matches) return str if __name__ == "__main__": if len(argv) == 3: size, contents = readfile(argv[1]) #profile.run('writefile(argv[2], apack().compress(contents))') writefile(argv[2], apack().compress(contents)) if len(argv) == 4: hashlen = int(argv[3]) size, contents = readfile(argv[1]) #profile.run('writefile(argv[2], apack().compress(contents))') writefile(argv[2], apack().compress(contents)) if len(argv) == 5: hashlen = int(argv[3]) hashmax = int(argv[4]) size, contents = readfile(argv[1]) #profile.run('writefile(argv[2], apack().compress(contents))') writefile(argv[2], apack().compress(contents))