import email, urllib
from WebKit import *
def loadMHT(filename):
"""
Load a .HMT HTML archive and return the WebArchive representation.
"""
return HMTLoad(filename).asWebArchive()
class MHTLoader (object):
"""
A loader for .mht files, and archive format used by MS Internet Explorer
on Windows.
"""
def __init__(self, filename):
self.filename = filename
# root of the archive (index into self.parts)
self.root = None
# filename -> (content-type, data)
self.parts = {}
self.loadFile(filename)
def loadFile(self, filename):
fp = open(filename, 'r')
msg = email.message_from_file(fp)
fp.close()
for part in msg.walk():
if part.get_content_maintype() == 'multipart':
continue
filename = part.get('Content-Location')
contentType = part.get_content_type()
data = part.get_payload(decode=True)
self.parts[filename] = (contentType, data)
if self.root is None:
self.root = filename
def fixupURL(self, url):
# IE creates MHT files with file: URLS containing backslashes,
# NSURL insists that those are invalid, replace backslashes by
# forward slashes.
if url.startswith('file:'):
return url.replace('\\', '/')
else:
return url
def asWebArchive(self):
"""
Convert the MHT archive to a webarchive.
"""
rootType, rootText = self.parts[self.root]
pageResource = WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(
NSData.dataWithBytes_length_(rootText.replace('\\', '/'), len(rootText)),
NSURL.URLWithString_(self.fixupURL(self.root)),
NSString.stringWithString_(rootType),
None,
None)
resources = []
for url in self.parts:
if url == self.root: continue
tp, data = self.parts[url]
resources.append(WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(
NSData.dataWithBytes_length_(data, len(data)),
NSURL.URLWithString_(self.fixupURL(url)),
NSString.stringWithString_(tp),
None,
None))
return WebArchive.alloc().initWithMainResource_subresources_subframeArchives_(
pageResource, resources, None)
def main():
# Testing...
p = MHTLoader('audit-web.mht')
a = p.asWebArchive()
d = a.data()
fp = open('audit-web.webarchive', 'wb')
fp.write(a.data().bytes())
fp.close()
if __name__ == "__main__":
main()
|