"""
olefile (formerly OleFileIO_PL)
Module to read/write Microsoft OLE2 files (also called Structured Storage or
Microsoft Compound Document File Format), such as Microsoft Office 97-2003
documents, Image Composer and FlashPix files, Outlook messages, ...
This version is compatible with Python 2.7 and 3.4+
Project website: https://www.decalage.info/olefile
olefile is copyright (c) 2005-2018 Philippe Lagadec
(https://www.decalage.info)
olefile is based on the OleFileIO module from the PIL library v1.1.7
See: http://www.pythonware.com/products/pil/index.htm
and http://svn.effbot.org/public/tags/pil-1.1.7/PIL/OleFileIO.py
The Python Imaging Library (PIL) is
Copyright (c) 1997-2009 by Secret Labs AB
Copyright (c) 1995-2009 by Fredrik Lundh
See source code and LICENSE.txt for information on usage and redistribution.
"""
# Since OleFileIO_PL v0.45, only Python 2.7 and 3.4+ are supported
# This import enables print() as a function rather than a keyword
# (main requirement to be compatible with Python 3.x)
# The comment on the line below should be printed on Python 2.5 or older:
from __future__ import print_function # This version of olefile requires Python 2.7 or 3.4+.
#--- LICENSE ------------------------------------------------------------------
# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2018 Philippe Lagadec
# (https://www.decalage.info)
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ----------
# PIL License:
#
# olefile is based on source code from the OleFileIO module of the Python
# Imaging Library (PIL) published by Fredrik Lundh under the following license:
# The Python Imaging Library (PIL) is
# Copyright (c) 1997-2009 by Secret Labs AB
# Copyright (c) 1995-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its associated
# documentation, you agree that you have read, understood, and will comply with
# the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and its
# associated documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appears in all copies, and that both
# that copyright notice and this permission notice appear in supporting
# documentation, and that the name of Secret Labs AB or the author(s) not be used
# in advertising or publicity pertaining to distribution of the software
# without specific, written prior permission.
#
# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
__date__ = "2018-09-09"
__version__ = '0.46'
__author__ = "Philippe Lagadec"
__all__ = ['isOleFile', 'OleFileIO', 'OleMetadata', 'enable_logging',
'MAGIC', 'STGTY_EMPTY', 'KEEP_UNICODE_NAMES',
'STGTY_STREAM', 'STGTY_STORAGE', 'STGTY_ROOT', 'STGTY_PROPERTY',
'STGTY_LOCKBYTES', 'MINIMAL_OLEFILE_SIZE',
'DEFECT_UNSURE', 'DEFECT_POTENTIAL', 'DEFECT_INCORRECT',
'DEFECT_FATAL', 'DEFAULT_PATH_ENCODING',
'MAXREGSECT', 'DIFSECT', 'FATSECT', 'ENDOFCHAIN', 'FREESECT',
'MAXREGSID', 'NOSTREAM', 'UNKNOWN_SIZE', 'WORD_CLSID'
]
import io
import sys
import struct, array, os.path, datetime, logging
#=== COMPATIBILITY WORKAROUNDS ================================================
# For Python 3.x, need to redefine long as int:
if str is not bytes:
long = int
# Need to make sure we use xrange both on Python 2 and 3.x:
try:
# on Python 2 we need xrange:
iterrange = xrange
except:
# no xrange, for Python 3 it was renamed as range:
iterrange = range
#[PL] workaround to fix an issue with array item size on 64 bits systems:
if array.array('L').itemsize == 4:
# on 32 bits platforms, long integers in an array are 32 bits:
UINT32 = 'L'
elif array.array('I').itemsize == 4:
# on 64 bits platforms, integers in an array are 32 bits:
UINT32 = 'I'
elif array.array('i').itemsize == 4:
# On 64 bit Jython, signed integers ('i') are the only way to store our 32
# bit values in an array in a *somewhat* reasonable way, as the otherwise
# perfectly suited 'H' (unsigned int, 32 bits) results in a completely
# unusable behaviour. This is most likely caused by the fact that Java
# doesn't have unsigned values, and thus Jython's "array" implementation,
# which is based on "jarray", doesn't have them either.
# NOTE: to trick Jython into converting the values it would normally
# interpret as "signed" into "unsigned", a binary-and operation with
# 0xFFFFFFFF can be used. This way it is possible to use the same comparing
# operations on all platforms / implementations. The corresponding code
# lines are flagged with a 'JYTHON-WORKAROUND' tag below.
UINT32 = 'i'
else:
raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')
#[PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
try:
basestring
except NameError:
basestring = str
#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode
# if False (default PIL behaviour), all filenames are converted to Latin-1.
KEEP_UNICODE_NAMES = True
if sys.version_info[0] < 3:
# On Python 2.x, the default encoding for path names is UTF-8:
DEFAULT_PATH_ENCODING = 'utf-8'
else:
# On Python 3.x, the default encoding for path names is Unicode (None):
DEFAULT_PATH_ENCODING = None
# === LOGGING =================================================================
def get_logger(name, level=logging.CRITICAL+1):
"""
Create a suitable logger object for this module.
The goal is not to change settings of the root logger, to avoid getting
other modules' logs on the screen.
If a logger exists with same name, reuse it. (Else it would have duplicate
handlers and messages would be doubled.)
The level is set to CRITICAL+1 by default, to avoid any logging.
"""
# First, test if there is already a logger with the same name, else it
# will generate duplicate messages (due to duplicate handlers):
if name in logging.Logger.manager.loggerDict:
#NOTE: another less intrusive but more "hackish" solution would be to
# use getLogger then test if its effective level is not default.
logger = logging.getLogger(name)
# make sure level is OK:
logger.setLevel(level)
return logger
# get a new logger:
logger = logging.getLogger(name)
# only add a NullHandler for this logger, it is up to the application
# to configure its own logging:
logger.addHandler(logging.NullHandler())
logger.setLevel(level)
return logger
# a global logger object used for debugging:
log = get_logger('olefile')
def enable_logging():
"""
Enable logging for this module (disabled by default).
This will set the module-specific logger level to NOTSET, which
means the main application controls the actual logging level.
"""
log.setLevel(logging.NOTSET)
#=== CONSTANTS ===============================================================
#: magic bytes that should be at the beginning of every OLE file:
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'
#[PL]: added constants for Sector IDs (from AAF specifications)
MAXREGSECT = 0xFFFFFFFA #: (-6) maximum SECT
DIFSECT = 0xFFFFFFFC #: (-4) denotes a DIFAT sector in a FAT
FATSECT = 0xFFFFFFFD #: (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE #: (-2) end of a virtual stream chain
FREESECT = 0xFFFFFFFF #: (-1) unallocated sector
#[PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA #: (-6) maximum directory entry ID
NOSTREAM = 0xFFFFFFFF #: (-1) unallocated directory entry
#[PL] object types in storage (from AAF specifications)
STGTY_EMPTY = 0 #: empty directory entry
STGTY_STORAGE = 1 #: element is a storage object
STGTY_STREAM = 2 #: element is a stream object
STGTY_LOCKBYTES = 3 #: element is an ILockBytes object
STGTY_PROPERTY = 4 #: element is an IPropertyStorage object
STGTY_ROOT = 5 #: element is a root storage
# Unknown size for a stream (used by OleStream):
UNKNOWN_SIZE = 0x7FFFFFFF
#
# --------------------------------------------------------------------
# property types
VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6;
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11;
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17;
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23;
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28;
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64;
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68;
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72;
VT_VECTOR=0x1000;
# map property id to name (for debugging purposes)
# VT = {}
# for keyword, var in list(vars().items()):
# if keyword[:3] == "VT_":
# VT[var] = keyword
#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)
WORD_CLSID = "00020900-0000-0000-C000-000000000046"
#TODO: check Excel, PPT, ...
#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20 # a potential defect
DEFECT_INCORRECT = 30 # an error according to specifications, but parsing
# can go on
DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is
# impossible
# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFile.open)
MINIMAL_OLEFILE_SIZE = 1536
#=== FUNCTIONS ===============================================================
def isOleFile (filename):
"""
Test if a file is an OLE container (according to the magic bytes in its header).
.. note::
This function only checks the first 8 bytes of the file, not the
rest of the OLE structure.
.. versionadded:: 0.16
:param filename: filename, contents or file-like object of the OLE file (string-like or file-like object)
- if filename is a string smaller than 1536 bytes, it is the path
of the file to open. (bytes or unicode string)
- if filename is a string longer than 1535 bytes, it is parsed
as the content of an OLE file in memory. (bytes type only)
- if filename is a file-like object (with read and seek methods),
it is parsed as-is.
:type filename: bytes or str or unicode or file
:returns: True if OLE, False otherwise.
:rtype: bool
"""
# check if filename is a string-like or file-like object:
if hasattr(filename, 'read'):
# file-like object: use it directly
header = filename.read(len(MAGIC))
# just in case, seek back to start of file:
filename.seek(0)
elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE:
# filename is a bytes string containing the OLE file to be parsed:
header = filename[:len(MAGIC)]
else:
# string-like object: filename of file on disk
with open(filename, 'rb') as fp:
header = fp.read(len(MAGIC))
if header == MAGIC:
return True
else:
return False
if bytes is str:
# version for Python 2.x
def i8(c):
return ord(c)
else:
# version for Python 3.x
def i8(c):
return c if c.__class__ is int else c[0]
def i16(c, o = 0):
"""
Converts a 2-bytes (16 bits) string to an integer.
:param c: string containing bytes to convert
:param o: offset of bytes to convert in string
"""
return struct.unpack("<H", c[o:o+2])[0]
def i32(c, o = 0):
"""
Converts a 4-bytes (32 bits) string to an integer.
:param c: string containing bytes to convert
:param o: offset of bytes to convert in string
"""
return struct.unpack("<I", c[o:o+4])[0]
Loading ...