# rarfile.py
#
# Copyright (c) 2005 Marko Kreen
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

import os
import re
import subprocess
import tempfile
from struct import pack, unpack
from binascii import crc32

try:
    from cStringIO import StringIO
except ImportError:
    # Python 3 has no cStringIO; a bytes buffer offers the same write/getvalue.
    from io import BytesIO as StringIO

# whether to speed up decompression by using tmp archive
_use_extract_hack = 1

# command line to use for extracting
_extract_cmd = 'unrar p -inul "%s" "%s"'

#
# rar constants
#

RAR_ID = b"Rar!\x1a\x07\x00"

# block types
RAR_BLOCK_MARK = 0x72  # r
RAR_BLOCK_MAIN = 0x73  # s
RAR_BLOCK_FILE = 0x74  # t
RAR_BLOCK_OLD_COMMENT = 0x75  # u
RAR_BLOCK_OLD_EXTRA = 0x76  # v
RAR_BLOCK_OLD_SUB = 0x77  # w
RAR_BLOCK_OLD_RECOVERY = 0x78  # x
RAR_BLOCK_OLD_AUTH = 0x79  # y
RAR_BLOCK_SUB = 0x7a  # z
RAR_BLOCK_ENDARC = 0x7b  # {

# main header flags
RAR_MAIN_VOLUME = 0x0001
RAR_MAIN_COMMENT = 0x0002
RAR_MAIN_LOCK = 0x0004
RAR_MAIN_SOLID = 0x0008
RAR_MAIN_NEWNUMBERING = 0x0010
RAR_MAIN_AUTH = 0x0020
RAR_MAIN_RECOVERY = 0x0040
RAR_MAIN_PASSWORD = 0x0080
RAR_MAIN_FIRSTVOLUME = 0x0100

# file header flags
RAR_FILE_SPLIT_BEFORE = 0x0001
RAR_FILE_SPLIT_AFTER = 0x0002
RAR_FILE_PASSWORD = 0x0004
RAR_FILE_COMMENT = 0x0008
RAR_FILE_SOLID = 0x0010
RAR_FILE_DICTMASK = 0x00e0
RAR_FILE_DICT64 = 0x0000
RAR_FILE_DICT128 = 0x0020
RAR_FILE_DICT256 = 0x0040
RAR_FILE_DICT512 = 0x0060
RAR_FILE_DICT1024 = 0x0080
RAR_FILE_DICT2048 = 0x00a0
RAR_FILE_DICT4096 = 0x00c0
RAR_FILE_DIRECTORY = 0x00e0
RAR_FILE_LARGE = 0x0100
RAR_FILE_UNICODE = 0x0200
RAR_FILE_SALT = 0x0400
RAR_FILE_VERSION = 0x0800
RAR_FILE_EXTTIME = 0x1000
RAR_FILE_EXTFLAGS = 0x2000

RAR_ENDARC_NEXT_VOLUME = 0x0001
RAR_ENDARC_DATACRC = 0x0002
RAR_ENDARC_REVSPACE = 0x0004

# flags common to all blocks
RAR_SKIP_IF_UNKNOWN = 0x4000
RAR_LONG_BLOCK = 0x8000

# Host OS types
RAR_OS_MSDOS = 0
RAR_OS_OS2 = 1
RAR_OS_WIN32 = 2
RAR_OS_UNIX = 3


#
# Internal helpers
#

def _native(raw, text):
    '''Return the name in the interpreter's native ``str`` type.

    On Python 2 ``str`` is the byte string, so the raw header bytes are
    returned unchanged; on Python 3 the decoded text is returned so that
    names compare equal to ordinary string arguments.
    '''
    if isinstance(raw, str):
        return raw
    return text


def _shell_escape(arg):
    '''Escape arg for use between double quotes in the shell command.'''
    # Backslash must be handled first so the escapes added below survive.
    # It is left alone where it is the path separator (Windows).
    if os.sep != "\\":
        arg = arg.replace("\\", "\\\\")
    for ch in ("`", '"', "$"):
        arg = arg.replace(ch, "\\" + ch)
    return arg


#
# Public interface
#

def is_rarfile(fn):
    '''Check quickly whether file is rar archive.'''
    f = open(fn, "rb")
    try:
        buf = f.read(len(RAR_ID))
    finally:
        f.close()
    return buf == RAR_ID


class RarInfo(object):
    '''An entry in rar archive.

    Attributes are filled in by RarFile while it parses block headers.
    '''

    def isdir(self):
        '''Returns True if the entry is a directory.'''
        if self.type == RAR_BLOCK_FILE:
            # all three dictionary-size bits set marks a directory
            return (self.flags & RAR_FILE_DIRECTORY) == RAR_FILE_DIRECTORY
        return False


class RarFile(object):
    '''Rar archive handling.'''

    def __init__(self, rarfile, mode="r", charset=None, info_callback=None):
        '''Open the archive and read its headers.

        rarfile       - path of the (first) archive file
        mode          - only "r" is accepted
        charset       - encoding used to decode non-unicode filenames
        info_callback - called with each RarInfo that gets stored

        Raises Exception for any other mode or if the file is not a rar.
        '''
        self.rarfile = rarfile
        self.charset = charset

        self.info_list = []
        self.is_solid = 0
        self.uses_newnumbering = 0
        self.uses_volumes = 0
        self.info_callback = info_callback
        self.got_mainhdr = 0
        self._gen_volname = self._gen_oldvol

        if mode != "r":
            raise Exception("Only mode=r supported")

        self._parse()

    def namelist(self):
        '''Return list of filenames in rar'''
        return [f.filename for f in self.info_list]

    def infolist(self):
        '''Return rar entries.'''
        return self.info_list

    def getinfo(self, fname):
        '''Return RarInfo for fname, or None when there is no such entry.

        The name is matched as given and with "/" replaced by "\\".
        '''
        fx = fname.replace("/", "\\")
        for f in self.info_list:
            if fname == f.filename or fx == f.filename:
                return f
        return None

    def read(self, fname):
        '''Return decompressed data.

        Raises Exception if the entry does not exist or is a directory.
        '''
        inf = self.getinfo(fname)
        if not inf:
            raise Exception("No such file")

        if inf.isdir():
            raise Exception("No data in directory")

        if inf.compress_type == 0x30:
            # 0x30 is read straight from the archive, without unrar
            res = self._extract_clear(inf)
        elif _use_extract_hack and not self.is_solid and not self.uses_volumes:
            res = self._extract_hack(inf)
        else:
            res = self._extract_unrar(self.rarfile, inf)
        return res

    def close(self):
        '''Does nothing; no file is kept open between calls.'''
        pass

    def printdir(self):
        '''Print one filename per line.'''
        for f in self.info_list:
            print(f.filename)

    # store entry
    def _process_entry(self, item):
        # RAR_BLOCK_NEWSUB has files too: CMT, RR
        if item.type == RAR_BLOCK_FILE:
            # use only first part
            if (item.flags & RAR_FILE_SPLIT_BEFORE) == 0:
                self.info_list.append(item)

        if self.info_callback:
            self.info_callback(item)

    # read rar
    def _parse(self):
        fd = open(self.rarfile, "rb")
        try:
            if fd.read(len(RAR_ID)) != RAR_ID:
                raise Exception("Not a Rar")

            volume = 0  # first vol (.rar) is 0
            more_vols = 0
            while 1:
                h = self._parse_header(fd)
                if h is None:
                    if not more_vols:
                        break
                    # end of this volume: continue with the next one
                    volume += 1
                    fd.close()
                    fd = open(self._gen_volname(volume), "rb")
                    more_vols = 0
                    continue
                h.volume = volume

                if h.type == RAR_BLOCK_MAIN and not self.got_mainhdr:
                    if h.flags & RAR_MAIN_NEWNUMBERING:
                        self.uses_newnumbering = 1
                        self._gen_volname = self._gen_newvol
                    self.uses_volumes = h.flags & RAR_MAIN_VOLUME
                    self.is_solid = h.flags & RAR_MAIN_SOLID
                    self.got_mainhdr = 1
                elif h.type == RAR_BLOCK_ENDARC:
                    more_vols = h.flags & RAR_ENDARC_NEXT_VOLUME

                # store it
                self._process_entry(h)

                # skip data
                if h.add_size > 0:
                    fd.seek(h.add_size, 1)
        finally:
            fd.close()

    def _parse_header(self, fd):
        h = self._parse_block_header(fd)
        if h and (h.type == RAR_BLOCK_FILE or h.type == RAR_BLOCK_SUB):
            self._parse_file_header(h)
        return h

    # common header
    #
    # NOTE(review): most of this method's original text was lost in
    # extraction.  The parts between the surviving lines were rebuilt from
    # the RAR block layout (crc16, type, flags, size) - verify against the
    # upstream file.
    def _parse_block_header(self, fd):
        HDRLEN = 7
        h = RarInfo()
        h.header_offset = fd.tell()
        buf = fd.read(HDRLEN)
        if len(buf) < HDRLEN:
            # end of file, or a truncated header
            return None

        t = unpack("<HBHH", buf)
        h.header_crc, h.type, h.flags, h.header_size = t

        if h.header_size > HDRLEN:
            h.data = fd.read(h.header_size - HDRLEN)
        else:
            h.data = b""
        h.file_offset = fd.tell()

        if h.flags & RAR_LONG_BLOCK:
            if len(h.data) < 4:
                return None
            h.add_size = unpack("<L", h.data[:4])[0]
        else:
            h.add_size = 0

        # the marker block is a fixed signature, not a checksummed header
        if h.type == RAR_BLOCK_MARK:
            return h

        # NOTE(review): crc coverage per block type is reconstructed
        if h.type == RAR_BLOCK_MAIN:
            crcdat = buf[2:] + h.data[:6]
        elif h.type == RAR_BLOCK_OLD_AUTH:
            crcdat = buf[2:] + h.data[:8]
        else:
            crcdat = buf[2:] + h.data
        if h.header_crc != (crc32(crcdat) & 0xFFFF):
            # treat a damaged header like end of archive
            return None
        return h

    # read file-specific header
    #
    # NOTE(review): the original text of this method was lost entirely.
    # It is rebuilt from the RAR file-header layout and from the
    # attributes the rest of this class reads (compress_size,
    # compress_type, filename); the remaining attribute names are a best
    # guess - verify against the upstream file.
    def _parse_file_header(self, h):
        HDRLEN = 4 + 4 + 1 + 4 + 4 + 1 + 1 + 2 + 4
        fld = unpack("<LLBLLBBHL", h.data[:HDRLEN])
        h.compress_size = fld[0] & 0xFFFFFFFF
        h.file_size = fld[1] & 0xFFFFFFFF
        h.host_os = fld[2]
        h.CRC = fld[3]
        h.date_time = self._parse_dos_time(fld[4])
        h.extract_version = fld[5]
        h.compress_type = fld[6]
        h.name_size = fld[7]
        h.mode = fld[8]
        pos = HDRLEN

        if h.flags & RAR_FILE_LARGE:
            # high 32 bits of both sizes follow the fixed fields
            hi_pack, hi_unp = unpack("<LL", h.data[pos:pos + 8])
            h.compress_size |= hi_pack << 32
            h.file_size |= hi_unp << 32
            pos += 8
            # the block header carried only the low half of the data size
            h.add_size = h.compress_size

        name = h.data[pos:pos + h.name_size]
        pos += h.name_size
        if h.flags & RAR_FILE_UNICODE:
            nul = name.find(b"\0")
            if nul < 0:
                # no separate encoded part present; decode as utf-8
                raw = name
                text = name.decode("utf-8", "replace")
            else:
                raw = name[:nul]
                text = _UnicodeFilename(raw, name[nul + 1:]).decode()
        else:
            raw = name
            if self.charset:
                text = name.decode(self.charset)
            else:
                # just guessing...
                text = name.decode("iso-8859-1", "replace")
        h.filename = _native(raw, text)
        h.unicode_filename = text

        if h.flags & RAR_FILE_SALT:
            h.salt = h.data[pos:pos + 8]
            pos += 8
        else:
            h.salt = None

        # kept as raw bytes; not interpreted here
        if h.flags & RAR_FILE_EXTTIME:
            h.ext_time = h.data[pos:]
        else:
            h.ext_time = None

        return h

    def _parse_dos_time(self, stamp):
        '''Split a packed 32-bit DOS timestamp into a 6-tuple.

        Returns (year, month, day, hour, minute, second).
        '''
        # NOTE(review): the original first line was lost in extraction.
        # DOS stores seconds in 2-second units, hence the doubling.
        sec = (stamp & 0x1F) * 2
        stamp = stamp >> 5
        minute = stamp & 0x3F
        stamp = stamp >> 6
        hr = stamp & 0x1F
        stamp = stamp >> 5
        day = stamp & 0x1F
        stamp = stamp >> 5
        mon = stamp & 0x0F
        stamp = stamp >> 4
        yr = (stamp & 0x7F) + 1980
        return (yr, mon, day, hr, minute, sec)

    # new-style volume name
    def _gen_newvol(self, volume):
        # allow % in filenames
        fn = self.rarfile.replace("%", "%%")

        # last run of digits in the name is the volume counter
        m = re.search(r"([0-9][0-9]*)[^0-9]*$", fn)
        if not m:
            raise Exception("Cannot construct volume name")
        n1 = m.start(1)
        n2 = m.end(1)
        # keep the counter zero-padded to its original width
        fmt = "%%0%dd" % (n2 - n1)
        volfmt = fn[:n1] + fmt + fn[n2:]
        return volfmt % (volume + 1)

    # old-style volume naming
    def _gen_oldvol(self, volume):
        if volume == 0:
            return self.rarfile
        i = self.rarfile.rfind(".")
        if i < 0:
            # no extension: append instead of chopping the last character
            base = self.rarfile
        else:
            base = self.rarfile[:i]
        if volume <= 100:
            ext = ".r%02d" % (volume - 1)
        else:
            ext = ".s%02d" % (volume - 101)
        return base + ext

    # read uncompressed file
    def _extract_clear(self, inf):
        volume = inf.volume
        parts = []
        first = True
        while 1:
            f = open(self._gen_volname(volume), "rb")
            try:
                if first:
                    # first part: jump straight to the entry's header
                    f.seek(inf.header_offset)
                    first = False

                while 1:
                    cur = self._parse_header(f)
                    if cur is None:
                        raise Exception("file not found?")
                    if cur.type in (RAR_BLOCK_MARK, RAR_BLOCK_MAIN):
                        if cur.add_size:
                            f.seek(cur.add_size, 1)
                        continue
                    if getattr(cur, "filename", None) == inf.filename:
                        parts.append(f.read(cur.add_size))
                        break

                    raise Exception("file not found?")
            finally:
                f.close()

            # no more parts?
            if (cur.flags & RAR_FILE_SPLIT_AFTER) == 0:
                break

            volume += 1

        return b"".join(parts)

    # put file compressed data into temporary .rar archive, and run
    # unrar on that, thus avoiding unrar going over whole archive
    def _extract_hack(self, inf):
        BSIZE = 32 * 1024

        size = inf.compress_size + inf.header_size
        tmpfd, tmpname = tempfile.mkstemp(suffix='.rar')
        try:
            tmpf = os.fdopen(tmpfd, "wb")
            try:
                # NOTE(review): the original pack() line was lost in
                # extraction; rebuilt from the surviving comment.
                # create main header: crc, type, flags, size, res1, res2
                body = pack("<BHHHL", RAR_BLOCK_MAIN, 0, 13, 0, 0)
                mh = pack("<H", crc32(body) & 0xFFFF) + body
                tmpf.write(RAR_ID + mh)

                rf = open(self.rarfile, "rb")
                try:
                    rf.seek(inf.header_offset)
                    while size > 0:
                        buf = rf.read(min(size, BSIZE))
                        if not buf:
                            # without this a truncated archive loops forever
                            raise Exception("Unexpected end of archive")
                        tmpf.write(buf)
                        size -= len(buf)
                finally:
                    rf.close()
            finally:
                tmpf.close()

            return self._extract_unrar(tmpname, inf)
        finally:
            # remove the temporary archive even when extraction fails
            os.unlink(tmpname)

    # extract using unrar
    def _extract_unrar(self, rarfile, inf):
        # linux unrar wants '/', not '\'
        fn = inf.filename.replace("\\", "/")

        # both values are placed inside double quotes by _extract_cmd
        cmd = _extract_cmd % (_shell_escape(rarfile), _shell_escape(fn))

        # read the pipe as bytes; os.popen(..., "r") is a text-mode pipe
        proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        buf = proc.communicate()[0]
        if proc.returncode != 0:
            raise Exception("Error reading file")
        return buf


class _UnicodeFilename:
    '''Decoder for the encoded unicode form of a filename.

    name is the plain byte name, encdata the encoded bytes that follow
    it in the header.
    '''

    def __init__(self, name, encdata):
        # bytearray gives integer items on both Python 2 and 3
        self.std_name = bytearray(name)
        self.encdata = bytearray(encdata)
        self.pos = self.encpos = 0
        self.buf = StringIO()

    def enc_byte(self):
        '''Return next encoded byte, or 0 once the data is exhausted.'''
        if self.encpos >= len(self.encdata):
            # truncated data: keep decoding instead of raising IndexError
            return 0
        c = self.encdata[self.encpos]
        self.encpos += 1
        return c

    def std_byte(self):
        '''Return plain-name byte at the output position, or 0 if past it.'''
        if self.pos >= len(self.std_name):
            return 0
        return self.std_name[self.pos]

    def put(self, lo, hi):
        '''Append one UTF-16LE code unit and advance the output position.'''
        self.buf.write(bytes(bytearray((lo, hi))))
        self.pos += 1

    def decode(self):
        '''Return the decoded filename as a unicode string.'''
        # first byte: high byte shared by the type 1 and type 3 forms below
        hi = self.enc_byte()
        flagbits = 0
        flags = 0
        while self.encpos < len(self.encdata):
            if flagbits == 0:
                # each flag byte holds four 2-bit opcodes, top bits first
                flags = self.enc_byte()
                flagbits = 8
            flagbits -= 2
            t = (flags >> flagbits) & 3
            if t == 0:
                self.put(self.enc_byte(), 0)
            elif t == 1:
                self.put(self.enc_byte(), hi)
            elif t == 2:
                self.put(self.enc_byte(), self.enc_byte())
            else:
                n = self.enc_byte()
                if n & 0x80:
                    # run of plain-name bytes shifted by c, with high byte hi
                    c = self.enc_byte()
                    for i in range((n & 0x7f) + 2):
                        lo = (self.std_byte() + c) & 0xFF
                        self.put(lo, hi)
                else:
                    # run of plain-name bytes copied as-is
                    for i in range(n + 2):
                        self.put(self.std_byte(), 0)
        return self.buf.getvalue().decode("utf-16le", "replace")