import codecs
import collections
import mmap
import os
import re
import zlib
from ._util import py3
try:
from UserDict import UserDict # Python 2.x
except ImportError:
UserDict = collections.UserDict # Python 3.x
if py3: # Python 3.x
def make_bytes(s):
return s.encode("us-ascii")
else: # Python 2.x
def make_bytes(s): # pragma: no cover
return s # pragma: no cover
# see 7.9.2.2 Text String Type on page 86 and D.3 PDFDocEncoding Character Set on page 656
def encode_text(s):
return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
PDFDocEncoding = {
0x16: u"\u0017",
0x18: u"\u02D8",
0x19: u"\u02C7",
0x1A: u"\u02C6",
0x1B: u"\u02D9",
0x1C: u"\u02DD",
0x1D: u"\u02DB",
0x1E: u"\u02DA",
0x1F: u"\u02DC",
0x80: u"\u2022",
0x81: u"\u2020",
0x82: u"\u2021",
0x83: u"\u2026",
0x84: u"\u2014",
0x85: u"\u2013",
0x86: u"\u0192",
0x87: u"\u2044",
0x88: u"\u2039",
0x89: u"\u203A",
0x8A: u"\u2212",
0x8B: u"\u2030",
0x8C: u"\u201E",
0x8D: u"\u201C",
0x8E: u"\u201D",
0x8F: u"\u2018",
0x90: u"\u2019",
0x91: u"\u201A",
0x92: u"\u2122",
0x93: u"\uFB01",
0x94: u"\uFB02",
0x95: u"\u0141",
0x96: u"\u0152",
0x97: u"\u0160",
0x98: u"\u0178",
0x99: u"\u017D",
0x9A: u"\u0131",
0x9B: u"\u0142",
0x9C: u"\u0153",
0x9D: u"\u0161",
0x9E: u"\u017E",
0xA0: u"\u20AC",
}
def decode_text(b):
if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be")
elif py3: # Python 3.x
return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b)
else: # Python 2.x
return u"".join(PDFDocEncoding.get(ord(byte), byte) for byte in b)
class PdfFormatError(RuntimeError):
"""An error that probably indicates a syntactic or semantic error in the PDF file structure"""
pass
def check_format_condition(condition, error_message):
if not condition:
raise PdfFormatError(error_message)
class IndirectReference(collections.namedtuple("IndirectReferenceTuple", ["object_id", "generation"])):
def __str__(self):
return "%s %s R" % self
def __bytes__(self):
return self.__str__().encode("us-ascii")
def __eq__(self, other):
return other.__class__ is self.__class__ and other.object_id == self.object_id and other.generation == self.generation
def __ne__(self, other):
return not (self == other)
def __hash__(self):
return hash((self.object_id, self.generation))
class IndirectObjectDef(IndirectReference):
def __str__(self):
return "%s %s obj" % self
class XrefTable:
def __init__(self):
self.existing_entries = {} # object ID => (offset, generation)
self.new_entries = {} # object ID => (offset, generation)
self.deleted_entries = {0: 65536} # object ID => generation
self.reading_finished = False
def __setitem__(self, key, value):
if self.reading_finished:
self.new_entries[key] = value
else:
self.existing_entries[key] = value
if key in self.deleted_entries:
del self.deleted_entries[key]
def __getitem__(self, key):
try:
return self.new_entries[key]
except KeyError:
return self.existing_entries[key]
def __delitem__(self, key):
if key in self.new_entries:
generation = self.new_entries[key][1] + 1
del self.new_entries[key]
self.deleted_entries[key] = generation
elif key in self.existing_entries:
generation = self.existing_entries[key][1] + 1
self.deleted_entries[key] = generation
elif key in self.deleted_entries:
generation = self.deleted_entries[key]
else:
raise IndexError("object ID " + str(key) + " cannot be deleted because it doesn't exist")
def __contains__(self, key):
return key in self.existing_entries or key in self.new_entries
def __len__(self):
return len(set(self.existing_entries.keys()) | set(self.new_entries.keys()) | set(self.deleted_entries.keys()))
def keys(self):
return (set(self.existing_entries.keys()) - set(self.deleted_entries.keys())) | set(self.new_entries.keys())
def write(self, f):
keys = sorted(set(self.new_entries.keys()) | set(self.deleted_entries.keys()))
deleted_keys = sorted(set(self.deleted_entries.keys()))
startxref = f.tell()
f.write(b"xref\n")
while keys:
# find a contiguous sequence of object IDs
prev = None
for index, key in enumerate(keys):
if prev is None or prev+1 == key:
prev = key
else:
contiguous_keys = keys[:index]
keys = keys[index:]
break
else:
contiguous_keys = keys
keys = None
f.write(make_bytes("%d %d\n" % (contiguous_keys[0], len(contiguous_keys))))
for object_id in contiguous_keys:
if object_id in self.new_entries:
f.write(make_bytes("%010d %05d n \n" % self.new_entries[object_id]))
else:
this_deleted_object_id = deleted_keys.pop(0)
check_format_condition(object_id == this_deleted_object_id,
"expected the next deleted object "
"ID to be %s, instead found %s" %
(object_id, this_deleted_object_id))
try:
next_in_linked_list = deleted_keys[0]
except IndexError:
next_in_linked_list = 0
f.write(make_bytes("%010d %05d f \n" % (next_in_linked_list, self.deleted_entries[object_id])))
return startxref
class PdfName:
def __init__(self, name):
if isinstance(name, PdfName):
self.name = name.name
elif isinstance(name, bytes):
self.name = name
else:
self.name = name.encode("us-ascii")
def name_as_str(self):
return self.name.decode("us-ascii")
def __eq__(self, other):
return (isinstance(other, PdfName) and other.name == self.name) or other == self.name
def __hash__(self):
return hash(self.name)
def __repr__(self):
return "PdfName(%s)" % repr(self.name)
@classmethod
def from_pdf_stream(cls, data):
return cls(PdfParser.interpret_name(data))
allowed_chars = set(range(33, 127)) - set(ord(c) for c in "#%/()<>[]{}")
def __bytes__(self):
result = bytearray(b"/")
for b in self.name:
if py3: # Python 3.x
if b in self.allowed_chars:
result.append(b)
else:
result.extend(make_bytes("#%02X" % b))
else: # Python 2.x
if ord(b) in self.allowed_chars:
result.append(b)
else:
result.extend(b"#%02X" % ord(b))
return bytes(result)
__str__ = __bytes__
class PdfArray(list):
def __bytes__(self):
return b"[ " + b" ".join(pdf_repr(x) for x in self) + b" ]"
__str__ = __bytes__
class PdfDict(UserDict):
def __setattr__(self, key, value):
if key == "data":
if hasattr(UserDict, "__setattr__"):
UserDict.__setattr__(self, key, value)
else:
self.__dict__[key] = value
else:
if isinstance(key, str):
key = key.encode("us-ascii")
self[key] = value
def __getattr__(self, key):
try:
value = self[key]
except KeyError:
try:
value = self[key.encode("us-ascii")]
except KeyError:
raise AttributeError(key)
if isinstance(value, bytes):
return decode_text(value)
else:
return value
def __bytes__(self):
out = bytearray(b"<<")
for key, value in self.items():
if value is None:
continue
value = pdf_repr(value)
out.extend(b"\n")
out.extend(bytes(PdfName(key)))
out.extend(b" ")
out.extend(value)
out.extend(b"\n>>")
return bytes(out)
if not py3:
__str__ = __bytes__
class PdfBinary:
def __init__(self, data):
self.data = data
if py3: # Python 3.x
def __bytes__(self):
return make_bytes("<%s>" % "".join("%02X" % b for b in self.data))
else: # Python 2.x
def __str__(self):
return "<%s>" % "".join("%02X" % ord(b) for b in self.data)
class PdfStream:
def __init__(self, dictionary, buf):
self.dictionary = dictionary
self.buf = buf
def decode(self):
try:
filter = self.dictionary.Filter
except AttributeError:
return self.buf
if filter == b"FlateDecode":
try:
expected_length = self.dictionary.DL
except AttributeError:
expected_length = self.dictionary.Length
return zlib.decompress(self.buf, bufsize=int(expected_length))
else:
raise NotImplementedError("stream filter %s unknown/unsupported" % repr(self.dictionary.Filter))
def pdf_repr(x):
if x is True:
return b"true"
elif x is False:
return b"false"
elif x is None:
return b"null"
elif isinstance(x, PdfName) or isinstance(x, PdfDict) or isinstance(x, PdfArray) or isinstance(x, PdfBinary):
return bytes(x)
elif isinstance(x, int):
return str(x).encode("us-ascii")
elif isinstance(x, dict):
return bytes(PdfDict(x))
elif isinstance(x, list):
return bytes(PdfArray(x))
elif (py3 and isinstance(x, str)) or (not py3 and isinstance(x, unicode)):
return pdf_repr(encode_text(x))
elif isinstance(x, bytes):
return b"(" + x.replace(b"\\", b"\\\\").replace(b"(", b"\\(").replace(b")", b"\\)") + b")" # XXX escape more chars? handle binary garbage
else:
return bytes(x)
class PdfParser:
"""Based on https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
Supports PDF up to 1.4
"""
Loading ...