#!/usr/bin/env python
# -*- coding: windows-1251 -*-
# Copyright (C) 2005 Roman V. Kiseliov
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# 3. All advertising materials mentioning features or use of this
# software must display the following acknowledgment:
# "This product includes software developed by
# Roman V. Kiseliov <roman@kiseliov.ru>."
#
# 4. Redistributions of any form whatsoever must retain the following
# acknowledgment:
# "This product includes software developed by
# Roman V. Kiseliov <roman@kiseliov.ru>."
#
# THIS SOFTWARE IS PROVIDED BY Roman V. Kiseliov ``AS IS'' AND ANY
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Roman V. Kiseliov OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
__rev_id__ = """$Id: ImportXLS.py,v 1.6 2005/10/26 07:44:24 rvk Exp $"""
import UnicodeUtils
import CompoundDoc
import ExcelMagic
from struct import pack,unpack
def parse_xls(filename, encoding = None):
##########################################################################
def process_BOUNDSHEET(biff8, rec_data):
sheet_stream_pos, visibility, sheet_type = unpack('<I2B', rec_data[:6])
sheet_name = rec_data[6:]
if biff8:
chars_num, options = unpack('2B', sheet_name[:2])
chars_start = 2
runs_num = 0
asian_phonetic_size = 0
result = ''
compressed = (options & 0x01) == 0
has_asian_phonetic = (options & 0x04) != 0
has_format_runs = (options & 0x08) != 0
if has_format_runs:
runs_num , = unpack('<H', sheet_name[chars_start:chars_start+2])
chars_start += 2
if has_asian_phonetic:
asian_phonetic_size , = unpack('<I', sheet_name[chars_start:chars_start+4])
chars_start += 4
if compressed:
chars_end = chars_start + chars_num
result = sheet_name[chars_start:chars_end].decode('latin_1', 'replace')
else:
chars_end = chars_start + 2*chars_num
result = sheet_name[chars_start:chars_end].decode('utf_16_le', 'replace')
tail_size = 4*runs_num + asian_phonetic_size
else:
result = sheet_name[1:].decode(encoding, 'replace')
return result
def unpack2str(biff8, label_name): # 2 bytes length str
if biff8:
chars_num, options = unpack('<HB', label_name[:3])
chars_start = 3
runs_num = 0
asian_phonetic_size = 0
result = ''
compressed = (options & 0x01) == 0
has_asian_phonetic = (options & 0x04) != 0
has_format_runs = (options & 0x08) != 0
if has_format_runs:
runs_num , = unpack('<H', label_name[chars_start:chars_start+2])
chars_start += 2
if has_asian_phonetic:
asian_phonetic_size , = unpack('<I', label_name[chars_start:chars_start+4])
chars_start += 4
if compressed:
chars_end = chars_start + chars_num
result = label_name[chars_start:chars_end].decode('latin_1', 'replace')
else:
chars_end = chars_start + 2*chars_num
result = label_name[chars_start:chars_end].decode('utf_16_le', 'replace')
tail_size = 4*runs_num + asian_phonetic_size
else:
result = label_name[2:].decode(encoding, 'replace')
return result
def process_LABEL(biff8, rec_data):
row_idx, col_idx, xf_idx = unpack('<3H', rec_data[:6])
label_name = rec_data[6:]
result = unpack2str(biff8, label_name)
return (row_idx, col_idx, result)
def process_LABELSST(rec_data):
row_idx, col_idx, xf_idx, sst_idx = unpack('<3HI', rec_data)
return (row_idx, col_idx, sst_idx)
def process_RSTRING(biff8, rec_data):
if biff8:
return process_LABEL(biff8, rec_data)
else:
row_idx, col_idx, xf_idx, length = unpack('<4H', rec_data[:8])
result = rec_data[8:8+length].decode(encoding, 'replace')
return (row_idx, col_idx, result)
def decode_rk(encoded):
b0, b1, b2, b3 = unpack('4B', encoded)
is_multed_100 = (b0 & 0x01) != 0
is_integer = (b0 & 0x02) != 0
if is_integer:
result , = unpack('<i', encoded)
result >>= 2
else:
ieee754 = struct.pack('8B', 0, 0, 0, 0, b0 & 0xFC, b1, b2, b3)
result , = unpack('<d', ieee754)
if is_multed_100:
result /= 100.0
return result
def process_RK(rec_data):
row_idx, col_idx, xf_idx, encoded = unpack('<3H4s', rec_data)
result = decode_rk(encoded)
return (row_idx, col_idx, result)
def process_MULRK(rec_data):
row_idx, first_col_idx = unpack('<2H', rec_data[:4])
last_col_idx , = unpack('<H', rec_data[-2:])
xf_rk_num = last_col_idx - first_col_idx + 1
results = []
for i in range(xf_rk_num):
xf_idx, encoded = unpack('<H4s', rec_data[4+6*i : 4+6*(i+1)])
results.append(decode_rk(encoded))
return zip([row_idx]*xf_rk_num, range(first_col_idx, last_col_idx+1), results)
def process_NUMBER(rec_data):
row_idx, col_idx, xf_idx, result = unpack('<3Hd', rec_data)
return (row_idx, col_idx, result)
def process_SST(rec_data, sst_continues):
# 0x00FC
total_refs, total_str = unpack('<2I', rec_data[:8])
#print total_refs, str_num
pos = 8
curr_block = rec_data
curr_block_num = -1
curr_str_num = 0
SST = {}
while curr_str_num < total_str:
if pos >= len(curr_block):
curr_block_num += 1
curr_block = sst_continues[curr_block_num]
pos = 0
chars_num, options = unpack('<HB', curr_block[pos:pos+3])
#print chars_num, options
pos += 3
asian_phonetic_size = 0
runs_num = 0
has_asian_phonetic = (options & 0x04) != 0
has_format_runs = (options & 0x08) != 0
if has_format_runs:
runs_num , = unpack('<H', curr_block[pos:pos+2])
pos += 2
if has_asian_phonetic:
asian_phonetic_size , = unpack('<I', curr_block[pos:pos+4])
pos += 4
curr_char = 0
result = ''
while curr_char < chars_num:
if pos >= len(curr_block):
curr_block_num += 1
curr_block = sst_continues[curr_block_num]
options = ord(curr_block[0])
pos = 1
#print curr_block_num
compressed = (options & 0x01) == 0
if compressed:
chars_end = pos + chars_num - curr_char
else:
chars_end = pos + 2*(chars_num - curr_char)
#print compressed, has_asian_phonetic, has_format_runs
splitted = chars_end > len(curr_block)
if splitted:
chars_end = len(curr_block)
#print splitted, curr_char, pos, chars_end, repr(curr_block[pos:chars_end])
if compressed:
result += curr_block[pos:chars_end].decode('latin_1', 'replace')
else:
result += curr_block[pos:chars_end].decode('utf_16_le', 'replace')
pos = chars_end
curr_char = len(result)
# end while
# TODO: handle spanning format runs over CONTINUE blocks ???
tail_size = 4*runs_num + asian_phonetic_size
if len(curr_block) < pos + tail_size:
pos = pos + tail_size - len(curr_block)
curr_block_num += 1
curr_block = sst_continues[curr_block_num]
else:
pos += tail_size
#print result.encode('cp866')
SST[curr_str_num] = result
curr_str_num += 1
return SST
#####################################################################################
import struct
encodings = {
0x016F: 'ascii', #ASCII
0x01B5: 'cp437', #IBM PC CP-437 (US)
0x02D0: 'cp720', #IBM PC CP-720 (OEM Arabic)
0x02E1: 'cp737', #IBM PC CP-737 (Greek)
0x0307: 'cp775', #IBM PC CP-775 (Baltic)
0x0352: 'cp850', #IBM PC CP-850 (Latin I)
0x0354: 'cp852', #IBM PC CP-852 (Latin II (Central European))
0x0357: 'cp855', #IBM PC CP-855 (Cyrillic)
0x0359: 'cp857', #IBM PC CP-857 (Turkish)
0x035A: 'cp858', #IBM PC CP-858 (Multilingual Latin I with Euro)
0x035C: 'cp860', #IBM PC CP-860 (Portuguese)
0x035D: 'cp861', #IBM PC CP-861 (Icelandic)
0x035E: 'cp862', #IBM PC CP-862 (Hebrew)
0x035F: 'cp863', #IBM PC CP-863 (Canadian (French))
0x0360: 'cp864', #IBM PC CP-864 (Arabic)
0x0361: 'cp865', #IBM PC CP-865 (Nordic)
0x0362: 'cp866', #IBM PC CP-866 (Cyrillic (Russian))
0x0365: 'cp869', #IBM PC CP-869 (Greek (Modern))
0x036A: 'cp874', #Windows CP-874 (Thai)
0x03A4: 'cp932', #Windows CP-932 (Japanese Shift-JIS)
0x03A8: 'cp936', #Windows CP-936 (Chinese Simplified GBK)
0x03B5: 'cp949', #Windows CP-949 (Korean (Wansung))
0x03B6: 'cp950', #Windows CP-950 (Chinese Traditional BIG5)
0x04B0: 'utf_16_le', #UTF-16 (BIFF8)
0x04E2: 'cp1250', #Windows CP-1250 (Latin II) (Central European)
0x04E3: 'cp1251', #Windows CP-1251 (Cyrillic)
0x04E4: 'cp1252', #Windows CP-1252 (Latin I) (BIFF4-BIFF7)
0x04E5: 'cp1253', #Windows CP-1253 (Greek)
0x04E6: 'cp1254', #Windows CP-1254 (Turkish)
0x04E7: 'cp1255', #Windows CP-1255 (Hebrew)
0x04E8: 'cp1256', #Windows CP-1256 (Arabic)
0x04E9: 'cp1257', #Windows CP-1257 (Baltic)
0x04EA: 'cp1258', #Windows CP-1258 (Vietnamese)
0x0551: 'cp1361', #Windows CP-1361 (Korean (Johab))
0x2710: 'mac_roman', #Apple Roman
0x8000: 'mac_roman', #Apple Roman
0x8001: 'cp1252' #Windows CP-1252 (Latin I) (BIFF2-BIFF3)
}
biff8 = True
SST = {}
sheets = []
sheet_names = []
values = {}
ws_num = 0
BOFs = 0
EOFs = 0
# Inside MS Office document looks like filesystem
# We need extract stream named 'Workbook' or 'Book'
ole_streams = CompoundDoc.Reader(filename).STREAMS
if 'Workbook' in ole_streams:
workbook_stream = ole_streams['Workbook']
elif 'Book' in ole_streams:
workbook_stream = ole_streams['Book']
else:
raise Exception, 'No workbook stream in file.'
workbook_stream_len = len(workbook_stream)
stream_pos = 0
# Excel's method of data storing is based on
# ancient technology "TLV" (Type, Length, Value).
# In addition, if record size grows to some limit
# Excel writes CONTINUE records
while stream_pos < workbook_stream_len and EOFs <= ws_num:
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
stream_pos += data_size
if rec_id == 0x0809: # BOF
#print 'BOF',
BOFs += 1
ver, substream_type = unpack('<2H', rec_data[:4])
if substream_type == 0x0005:
# workbook global substream
biff8 = ver >= 0x0600
elif substream_type == 0x0010:
# worksheet substream
pass
else: # skip chart stream or unknown stream
# stream offsets may be used from BOUNDSHEET record
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
while rec_id != 0x000A: # EOF
#print 'SST CONTINUE'
stream_pos += 4
stream_pos += data_size
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
#print 'BIFF8 == ', biff8
elif rec_id == 0x000A: # EOF
#print 'EOF'
if BOFs > 1:
sheets.extend([values])
values = {}
EOFs += 1
elif rec_id == 0x0042: # CODEPAGE
cp , = unpack('<H', rec_data)
#print 'CODEPAGE', hex(cp)
if not encoding:
encoding = encodings[cp]
#print encoding
elif rec_id == 0x0085: # BOUNDSHEET
#print 'BOUNDSHEET',
ws_num += 1
b = process_BOUNDSHEET(biff8, rec_data)
sheet_names.extend([b])
#print b.encode('cp866')
elif rec_id == 0x00FC: # SST
#print 'SST'
sst_data = rec_data
sst_continues = []
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
while rec_id == 0x003C: # CONTINUE
#print 'SST CONTINUE'
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
sst_continues.extend([rec_data])
stream_pos += data_size
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
SST = process_SST(sst_data, sst_continues)
elif rec_id == 0x00FD: # LABELSST
#print 'LABELSST',
r, c, i = process_LABELSST(rec_data)
values[(r, c)] = SST[i]
#print r, c, SST[i].encode('cp866')
elif rec_id == 0x0204: # LABEL
#print 'LABEL',
r, c, b = process_LABEL(biff8, rec_data)
values[(r, c)] = b
#print r, c, b.encode('cp866')
elif rec_id == 0x00D6: # RSTRING
#print 'RSTRING',
r, c, b = process_RSTRING(biff8, rec_data)
values[(r, c)] = b
#print r, c, b.encode('cp866')
elif rec_id == 0x027E: # RK
#print 'RK',
r, c, b = process_RK(rec_data)
values[(r, c)] = b
#print r, c, b
elif rec_id == 0x00BD: # MULRK
#print 'MULRK',
for r, c, b in process_MULRK(rec_data):
values[(r, c)] = b
#print r, c, b
elif rec_id == 0x0203: # NUMBER
#print 'NUMBER',
r, c, b = process_NUMBER(rec_data)
values[(r, c)] = b
#print r, c, b
elif rec_id == 0x0006: # FORMULA
#print 'FORMULA',
r, c, x = unpack('<3H', rec_data[0:6])
if rec_data[12] == '\xFF' and rec_data[13] == '\xFF':
if rec_data[6] == '\x00':
got_str = False
if ord(rec_data[14]) & 8:
# part of shared formula
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
stream_pos += data_size
if rec_id == 0x0207: # STRING
got_str = True
elif rec_id not in (0x0221, 0x04BC, 0x0236, 0x0037, 0x0036):
raise Exception("Expected ARRAY, SHRFMLA, TABLEOP* or STRING record")
if not got_str:
rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4])
stream_pos += 4
rec_data = workbook_stream[stream_pos:stream_pos+data_size]
stream_pos += data_size
if rec_id != 0x0207: # STRING
raise Exception("Expected STRING record")
values[(r, c)] = unpack2str(biff8, rec_data)
elif rec_data[6] == '\x01':
# boolean
v = ord(rec_data[8])
values[(r, c)] = bool(v)
elif rec_data[6] == '\x02':
# error
v = ord(rec_data[8])
if v in ExcelMagic.error_msg_by_code:
values[(r, c)] = ExcelMagic.error_msg_by_code[v]
else:
values[(r, c)] = u'#UNKNOWN ERROR!'
elif rec_data[6] == '\x03':
# empty
values[(r, c)] = u''
else:
raise Exception("Unknown value for formula result")
else:
# 64-bit float
d, = unpack("<d", rec_data[6:14])
values[(r, c)] = d
encoding = None
return zip(sheet_names, sheets)
|