""" Parse xls up to some point Read storages, (sub-)streams, records from xls file """ # # === LICENSE ================================================================== # xls_parser is copyright (c) 2014-2019 Philippe Lagadec (http://www.decalage.info) # All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, # are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#------------------------------------------------------------------------------
# CHANGELOG:
# 2017-11-02 v0.1  CH: - first version
# 2017-11-02 v0.2  CH: - move some code to record_base.py
#                        (to avoid copy-and-paste in ppt_parser.py)
# 2019-01-30 v0.54 PL: - fixed import to avoid mixing installed oletools
#                        and dev version

__version__ = '0.54'

# -----------------------------------------------------------------------------
# TODO:
# - parse more record types (ExternName, ...)
# - check what bad stuff can be in other storages: Embedded ("MBD..."), Linked
#   ("LNK..."), "MsoDataStore" and OleStream ('\001Ole')
#
# -----------------------------------------------------------------------------
# REFERENCES:
# - [MS-XLS]: Excel Binary File Format (.xls) Structure Specification
#   https://msdn.microsoft.com/en-us/library/office/cc313154(v=office.14).aspx
# - Understanding the Excel .xls Binary File Format
#   https://msdn.microsoft.com/en-us/library/office/gg615597(v=office.14).aspx
#
# -- IMPORTS ------------------------------------------------------------------

import sys
import os.path
from struct import unpack
import logging

# little hack to allow absolute imports even if oletools is not installed.
# Copied from olevba.py
PARENT_DIR = os.path.normpath(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))
if PARENT_DIR not in sys.path:
    sys.path.insert(0, PARENT_DIR)
del PARENT_DIR
from oletools import record_base


# === PYTHON 2+3 SUPPORT ======================================================

if sys.version_info[0] >= 3:
    unichr = chr


###############################################################################
# Helpers
###############################################################################

def is_xls(filename):
    """
    determine whether a given file is an excel ole file

    returns True if given file is an ole file and contains a Workbook stream

    Any exception raised while opening or iterating the file is logged at
    debug level and treated as "not an xls file"; the file is always closed.

    todo: could further check that workbook stream starts with a globals
    substream.
    See also: oleid.OleID.check_excel
    """
    xls_file = None
    try:
        xls_file = XlsFile(filename)
        for stream in xls_file.iter_streams():
            if isinstance(stream, WorkbookStream):
                return True
    except Exception:
        # deliberate best-effort: whatever goes wrong, this is not an xls
        logging.debug('Ignoring exception in is_xls, assume is not xls',
                      exc_info=True)
    finally:
        if xls_file is not None:
            xls_file.close()
    return False


def read_unicode(data, start_idx, n_chars):
    """
    read a unicode string from a XLUnicodeStringNoCch structure

    :param data: bytes buffer holding the structure
    :param start_idx: index of the flag byte that precedes the characters
    :param n_chars: number of characters (not bytes) to read
    :returns: (text, end_idx) where end_idx is the index just behind the
              string data
    """
    # Only the lowest bit (fHighByte) of the flag byte is meaningful; the
    # remaining 7 bits are reserved and must be ignored:
    # bit 0x0 --> only low-bytes are saved, all high bytes are 0
    # bit 0x1 --> 2 bytes per character
    flags = ord(data[start_idx:start_idx+1])
    if flags & 0x01:
        return read_unicode_2byte(data, start_idx+1, n_chars)

    # Each byte is the low byte of a code unit whose high byte is 0, i.e. a
    # code point in 0x00..0xFF. That is exactly latin-1; 'ascii' would raise
    # UnicodeDecodeError for every character >= 0x80.
    end_idx = start_idx + 1 + n_chars
    return data[start_idx+1:end_idx].decode('latin-1'), end_idx


def read_unicode_2byte(data, start_idx, n_chars):
    """
    read a unicode string with characters encoded by 2 bytes

    :param data: bytes buffer
    :param start_idx: index of the first character byte
    :param n_chars: number of 2-byte little-endian characters to read
    :returns: (text, end_idx) where end_idx = start_idx + 2 * n_chars
    """
    end_idx = start_idx + n_chars * 2
    if n_chars < 256:  # faster version, long format string for unpack
        unichars = (unichr(val) for val in
                    unpack('<{0}H'.format(n_chars), data[start_idx:end_idx]))
    else:              # slower version but less memory-extensive
        unichars = (unichr(unpack('<H', data[data_idx:data_idx+2])[0])
                    for data_idx in range(start_idx, end_idx, 2))
    return u''.join(unichars), end_idx


###############################################################################
# File, Storage, Stream
###############################################################################

class XlsFile(record_base.OleRecordFile):
    """ An xls file has most streams made up of records """

    @classmethod
    def stream_class_for_name(cls, stream_name):
        """
        helper for iter_streams

        returns WorkbookStream for the stream named 'Workbook' and the generic
        XlsStream for every other stream name
        """
        if stream_name == 'Workbook':
            return WorkbookStream
        return XlsStream


class XlsStream(record_base.OleRecordStream):
    """ most streams in xls file consist of records """

    def read_record_head(self):
        """
        read first few bytes of record to determine size and type

        Reads 4 bytes from self.stream: two little-endian unsigned 16bit
        values, type first and size second.

        returns (type, size, other) where other is None
        """
        rec_type, rec_size = unpack('<HH', self.stream.read(4))
        return rec_type, rec_size, None

    @classmethod
    def record_class_for_type(cls, rec_type):
        """
        determine a class for given record type

        This generic stream maps every type to XlsRecord.

        returns (clz, force_read)
        """
        return XlsRecord, False


class WorkbookStream(XlsStream):
    """ Stream in excel file that holds most info """

    @classmethod
    def record_class_for_type(cls, rec_type):
        """
        determine a class for given record type

        BOF and SupBook records are returned with force_read=True, all others
        with force_read=False.
        NOTE(review): force_read is interpreted by record_base -- presumably
        it requests that record data is always read; confirm there.

        returns (clz, force_read)
        """
        if rec_type == XlsRecordBof.TYPE:
            return XlsRecordBof, True
        elif rec_type == XlsRecordEof.TYPE:
            return XlsRecordEof, False
        elif rec_type == XlsRecordSupBook.TYPE:
            return XlsRecordSupBook, True
        else:
            return XlsRecord, False


class XlsbStream(record_base.OleRecordStream):
    """ binary stream of an xlsb file, usually have a record structure """

    #: bit that marks "another byte follows" in type and size fields
    HIGH_BIT_MASK = 0b10000000
    #: the 7 payload bits of each type/size byte
    LOW7_BIT_MASK = 0b01111111

    def read_record_head(self):
        """
        read first few bytes of record to determine size and type

        Type takes 1 or 2 bytes, size takes 1 to 4 bytes. In both fields each
        byte contributes its 7 low bits (least significant group first) and a
        set high bit signals that one more byte follows.

        returns (type, size, other) where other is None
        """
        val = ord(self.stream.read(1))
        if val & self.HIGH_BIT_MASK:    # high bit of the low byte is 1
            val2 = ord(self.stream.read(1))         # need another byte
            # combine 7 low bits of each byte
            rec_type = (val & self.LOW7_BIT_MASK) + \
                       ((val2 & self.LOW7_BIT_MASK) << 7)
        else:
            rec_type = val

        rec_size = 0
        shift = 0
        for _ in range(4):      # rec_size needs up to 4 byte
            val = ord(self.stream.read(1))
            rec_size += (val & self.LOW7_BIT_MASK) << shift
            shift += 7
            if (val & self.HIGH_BIT_MASK) == 0:   # high-bit is 0 --> done
                break
        return rec_type, rec_size, None

    @classmethod
    def record_class_for_type(cls, rec_type):
        """
        determine a class for given record type

        Only the BeginSupBook record gets its own class (with
        force_read=True); everything else is a plain XlsbRecord.

        returns (clz, force_read)
        """
        if rec_type == XlsbBeginSupBook.TYPE:
            return XlsbBeginSupBook, True
        else:
            return XlsbRecord, False


###############################################################################
# RECORDS
###############################################################################

# records that appear often but do not need their own XlsRecord subclass (yet)
FREQUENT_RECORDS = {
    156: 'BuiltInFnGroupCount',
    2147: 'BookExt',
    442: 'CodeName',
    66: 'CodePage',
    4195: 'Dat',
    2154: 'DataLabExt',
    2155: 'DataLabExtContents',
    215: 'DBCell',
    220: 'DbOrParmQry',
    2051: 'DBQueryExt',
    2166: 'DConn',
    35: 'ExternName',
    23: 'ExternSheet',
    255: 'ExtSST',
    2052: 'ExtString',
    2151: 'FeatHdr',
    91: 'FileSharing',
    1054: 'Format',
    49: 'Font',
    2199: 'GUIDTypeLib',
    440: 'HLink',
    225: 'InterfaceHdr',
    226: 'InterfaceEnd',
    523: 'Index',
    24: 'Lbl',
    193: 'Mms',
    93: 'Obj',
    4135: 'ObjectLink',
    2058: 'OleDbConn',
    222: 'OleObjectSize',
    2214: 'RichTextStream',
    2146: 'SheetExt',
    1212: 'ShrFmla',
    2060: 'SxViewExt',
    2136: 'SxViewLink',
    2049: 'WebPub',
    224: 'XF (formatting)',
    2173: 'XFExt (formatting)',
    659: 'Style',
    2194: 'StyleExt',
}

#: records found in xlsb binary parts
FREQUENT_RECORDS_XLSB = {
    588: 'BrtEndSupBook',
    667: 'BrtSupAddin',
    355: 'BrtSupBookSrc',
    586: 'BrtSupNameBits',
    584: 'BrtSupNameBool',
    587: 'BrtSupNameEnd',
    581: 'BrtSupNameErr',
    585: 'BrtSupNameFmla',
    583: 'BrtSupNameNil',
    580: 'BrtSupNameNum',
    582: 'BrtSupNameSt',
    577: 'BrtSupNameStart',
    579: 'BrtSupNameValueEnd',
    578: 'BrtSupNameValueStart',
    358: 'BrtSupSame',
    357: 'BrtSupSelf',
    359: 'BrtSupTabs',
}


class XlsRecord(record_base.OleRecordBase):
    """ basic building block of data in workbook stream """

    #: max size of a record in xls stream (does not apply to xlsb)
    MAX_SIZE = 8224

    def _type_str(self):
        """
        simplification for subclasses to create their own __str__

        returns the name from FREQUENT_RECORDS if the type is listed there,
        a generic text containing the numeric type otherwise
        """
        try:
            return FREQUENT_RECORDS[self.type]
        except KeyError:
            return 'XlsRecord type {0}'.format(self.type)


class XlsRecordBof(XlsRecord):
    """ record found at beginning of substreams """
    TYPE = 2057
    SIZE = 16
    # types of substreams
    DOCTYPES = {0x5: 'workbook', 0x10: 'dialog/worksheet', 0x20: 'chart',
                0x40: 'macro'}

    def finish_constructing(self, _):
        """
        set self.doctype from the record data

        doctype is None if no data was read, otherwise the little-endian
        unsigned 16bit value found in bytes 2-3 of the data.
        """
        if self.data is None:
            self.doctype = None
            return
        # parse data (only doctype, ignore rest)
        self.doctype = unpack('<H', self.data[2:4])[0]

    def _type_str(self):
        """ text for __str__: name of substream type or 'unknown' """
        return 'BOF Record ({0} substream)'.format(
            self.DOCTYPES.get(self.doctype, 'unknown'))


class XlsRecordEof(XlsRecord):
    """ record found at end of substreams """
    TYPE = 10
    SIZE = 0

    def _type_str(self):
        """ text for __str__ """
        return 'EOF Record'


class XlsRecordSupBook(XlsRecord):
    """
    The SupBook record specifies a supporting link

    "... The collection of records specifies the contents of an external
    workbook, DDE data source, or OLE data source." (MS-XLS, paragraph 2.4.271)
    """

    TYPE = 430

    LINK_TYPE_UNKNOWN = 'unknown'
    LINK_TYPE_SELF = 'self-referencing'
    LINK_TYPE_ADDIN = 'addin-referencing'
    LINK_TYPE_UNUSED = 'unused'
    LINK_TYPE_SAMESHEET = 'same-sheet'
    LINK_TYPE_OLE_DDE = 'ole/dde data source'
    LINK_TYPE_EXTERNAL = 'external workbook'

    def finish_constructing(self, _):
        """
        Finish constructing this record; called at end of constructor.

        Sets attributes ctab, cch, virt_path and support_link_type. If no
        data was read, they keep their defaults (None / LINK_TYPE_UNKNOWN).

        :raises ValueError: if data was read but record size is below 4
        """
        # set defaults
        self.ctab = None
        self.cch = None
        self.virt_path = None
        self.support_link_type = self.LINK_TYPE_UNKNOWN
        if self.data is None:
            return

        # parse data
        if self.size < 4:
            raise ValueError('not enough data (size is {0} but need >= 4)'
                             .format(self.size))
        self.ctab, self.cch = unpack('<HH', self.data[:4])
        if 0 < self.cch <= 0xff:
            # this is the length of virt_path
            self.virt_path, _ = read_unicode(self.data, 4, self.cch)
        else:
            # cch is 0 or one of the special markers checked below
            self.virt_path, _ = u'', 4
        # ignore variable rgst

        if self.cch == 0x401:    # ctab is undefined and to be ignored
            self.support_link_type = self.LINK_TYPE_SELF
        elif self.ctab == 0x1 and self.cch == 0x3A01:
            self.support_link_type = self.LINK_TYPE_ADDIN
            # next records must be ExternName with all add-in functions
        elif self.virt_path == u'\u0020':   # space ; ctab can be anything
            self.support_link_type = self.LINK_TYPE_UNUSED
        elif self.virt_path == u'\u0000':
            self.support_link_type = self.LINK_TYPE_SAMESHEET
        elif self.ctab == 0x0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_OLE_DDE
        elif self.ctab > 0 and self.virt_path:
            self.support_link_type = self.LINK_TYPE_EXTERNAL

    def _type_str(self):
        """ text for __str__, includes the link type """
        return 'SupBook Record ({0})'.format(self.support_link_type)


class XlsbRecord(record_base.OleRecordBase):
    """ like an xls record, but from binary part of xlsb file

    has no MAX_SIZE and types have different meanings
    """

    MAX_SIZE = None

    def _type_str(self):
        """
        simplification for subclasses to create their own __str__

        returns the name from FREQUENT_RECORDS_XLSB if the type is listed
        there, a generic text containing the numeric type otherwise
        """
        try:
            return FREQUENT_RECORDS_XLSB[self.type]
        except KeyError:
            return 'XlsbRecord type {0}'.format(self.type)


class XlsbBeginSupBook(XlsbRecord):
    """
    Record beginning an external link in xlsb file

    contains information about the link itself (e.g. for DDE the link is
    string1 + ' ' + string2)
    """

    TYPE = 360
    LINK_TYPE_WORKBOOK = 'workbook'
    LINK_TYPE_DDE = 'DDE'
    LINK_TYPE_OLE = 'OLE'
    LINK_TYPE_UNEXPECTED = 'unexpected'
    LINK_TYPE_UNKNOWN = 'unknown'

    #: length value that is rejected for the strings in this record
    _INVALID_STRING_LENGTH = 0xFFFFFFFF

    def finish_constructing(self, _):
        """
        parse link type and the two strings from the record data

        Sets attributes sbt (raw link type value or None), link_type,
        string1 and string2. If no data was read they keep their defaults.
        Strings with an invalid length only trigger a warning and stay empty.
        """
        # set defaults; sbt included so the attribute exists on every instance
        self.sbt = None
        self.link_type = self.LINK_TYPE_UNKNOWN
        self.string1 = ''
        self.string2 = ''
        if self.data is None:
            return

        self.sbt = unpack('<H', self.data[0:2])[0]
        if self.sbt == 0:
            self.link_type = self.LINK_TYPE_WORKBOOK
        elif self.sbt == 1:
            self.link_type = self.LINK_TYPE_DDE
        elif self.sbt == 2:
            self.link_type = self.LINK_TYPE_OLE
        else:
            # report the complete 16bit value, not just the first data byte
            logging.warning('Unexpected link type {0} encountered'
                            .format(self.sbt))
            self.link_type = self.LINK_TYPE_UNEXPECTED

        string1, next_idx = self._read_wide_string(2)
        if string1 is None:
            # Without a valid string1 the position of string2 is unknown.
            # Re-parsing from the same offset could only fail the same way and
            # repeat the warning, so string2 keeps its default.
            return
        self.string1 = string1

        string2, _ = self._read_wide_string(next_idx, ' for string2')
        if string2 is not None:
            self.string2 = string2

    def _read_wide_string(self, start_idx, msg_suffix=''):
        """
        read a string preceded by a 4-byte little-endian character count

        :param start_idx: index of the length field within self.data
        :param msg_suffix: text appended to the "impossible length" warning
        :returns: (text, end_idx) on success; (None, start_idx) after logging
                  a warning if the length is invalid or exceeds record size
        """
        n_chars = unpack('<I', self.data[start_idx:start_idx+4])[0]
        if n_chars == self._INVALID_STRING_LENGTH:
            logging.warning('Max string length 0xFFFFFFFF is not allowed')
            return None, start_idx
        if self.size < n_chars*2 + start_idx+4:
            logging.warning('Impossible string length {0} for data length {1}'
                            .format(n_chars, self.size) + msg_suffix)
            return None, start_idx
        return read_unicode_2byte(self.data, start_idx+4, n_chars)

    def _type_str(self):
        """ text for __str__, includes link type and both strings """
        return 'XlsbBeginSupBook Record ({0}, "{1}", "{2}")' \
               .format(self.link_type, self.string1, self.string2)


###############################################################################
# XLSB Binary Parts
###############################################################################

def parse_xlsb_part(file_stream, _, filename):
    """
    Excel xlsb files also have bin files with record structure. iter!

    Generator that wraps file_stream in a XlsbStream and yields its records.
    The stream is closed when iteration ends, fails or the generator is
    closed; exceptions propagate to the caller unchanged.

    :param file_stream: stream of the binary part; must have attribute size
    :param _: ignored
    :param filename: name handed on to XlsbStream
    """
    xlsb_stream = None
    try:
        xlsb_stream = XlsbStream(file_stream, file_stream.size, filename,
                                 record_base.STGTY_STREAM)
        for record in xlsb_stream.iter_records():
            yield record
    finally:
        if xlsb_stream is not None:
            xlsb_stream.close()


###############################################################################
# TESTING
###############################################################################

if __name__ == '__main__':
    sys.exit(record_base.test(sys.argv[1:], XlsFile, WorkbookStream))
# Memory