Repository URL to install this package:
|
Version:
0.2.4 ▾
|
# -*- coding: utf-8 -*-
# Copyright 2010 Dirk Holtwick, holtwick.it
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, unicode_literals
import copy
import logging
import re
from xml.dom import Node
import xml.dom.minidom
from html5lib import treebuilders # , inputstream
import html5lib
from reportlab.platypus.doctemplate import NextPageTemplate, FrameBreak
from reportlab.platypus.flowables import PageBreak, KeepInFrame
import six
from xhtml2pdf.default import BOX, POS, MUST, FONT
from xhtml2pdf.default import TAGS, STRING, INT, BOOL, SIZE, COLOR, FILE
from xhtml2pdf.tables import * # TODO: Kill wild import!
from xhtml2pdf.tags import * # TODO: Kill wild import!
from xhtml2pdf.util import getBox, getPos, pisaTempFile, transform_attrs
from xhtml2pdf.util import getSize, getBool, toList, getColor, getAlign
import xhtml2pdf.w3c.cssDOMElementInterface as cssDOMElementInterface
from xhtml2pdf.xhtml2pdf_reportlab import PmlRightPageBreak, PmlLeftPageBreak
# Cache of computed CSS attribute dicts, keyed by getCSSAttrCacheKey().
# pisaParser() rebinds it to a fresh dict at the start of every run.
CSSAttrCache = {}
# Logger registered under the name "xhtml2pdf".
log = logging.getLogger("xhtml2pdf")
# Matches an http(s) URL; group 1 captures whatever follows the host part.
rxhttpstrip = re.compile("https?://[^/]+(.*)", re.M | re.I)
class _MissingAttributeError(AttributeError, KeyError):
    """Lookup failure on an AttrContainer.

    It is an ``AttributeError`` so that ``hasattr``, ``getattr`` with a
    default and the ``copy`` protocol behave normally, and a ``KeyError`` so
    that callers written against the previous behaviour (which leaked the raw
    ``KeyError``) keep working.
    """


class AttrContainer(dict):
    """Dictionary whose keys can also be read as attributes (``attr.href``)."""

    def __getattr__(self, name):
        """Return ``self[name]``; raise an AttributeError if *name* is unknown.

        Python only calls this hook after regular attribute lookup failed,
        so real ``dict`` methods are never shadowed by keys.
        """
        try:
            return self[name]
        except KeyError:
            raise _MissingAttributeError(name)
def pisaGetAttributes(c, tag, attributes):
    """
    Validate and convert the HTML attributes of one element.

    :param c: parsing context; used for ``warning()``, ``pageSize``,
        ``getFile()`` and ``getFontName()``.
    :param tag: lower-case tag name, looked up in ``TAGS``.
    :param attributes: mapping-like object with ``items()`` (e.g. the
        ``attributes`` of a minidom element); may be empty or None.
    :return: an ``AttrContainer`` holding one entry for every attribute
        declared for *tag* (plus ``id``). Attributes that are neither given
        nor have a declared default are ``None``. For a tag that is not in
        ``TAGS`` the container is empty.
    """
    attrs = {}
    if attributes:
        for k, v in attributes.items():
            try:
                # XXX no Unicode! Reportlab fails with template names
                attrs[str(k)] = str(v)
            except Exception:
                # str() failed (e.g. non-ASCII text on Python 2): keep the
                # original objects rather than losing the attribute.
                attrs[k] = v

    nattrs = {}
    if tag in TAGS:
        block, adef = TAGS[tag]
        # Every tag accepts an "id"; note this writes into the shared TAGS
        # definition (idempotent, kept for backward compatibility).
        adef["id"] = STRING

        for k, v in six.iteritems(adef):
            nattrs[k] = None

            # A tuple definition is (type, default); default may be MUST.
            if isinstance(v, tuple):
                if v[1] == MUST and k not in attrs:
                    log.warning(
                        c.warning("Attribute '%s' must be set!", k))
                    continue
                nv = attrs.get(k, v[1])
                dfl = v[1]
                v = v[0]
            else:
                nv = attrs.get(k, None)
                dfl = None

            if nv is None:
                continue

            if isinstance(v, list):
                # Enumeration: value must be one of the listed choices.
                nv = nv.strip().lower()
                if nv not in v:
                    log.warning(
                        c.warning("Attribute '%s' of wrong value, allowed is one of: %s", k, repr(v)))
                    nv = dfl
            elif v == BOOL:
                # HTML style booleans: checked="checked" counts as true.
                nv = nv.strip().lower()
                nv = nv in ("1", "y", "yes", "true", str(k))
            elif v == SIZE:
                try:
                    nv = getSize(nv)
                except Exception:
                    # Unparsable size: warn and keep the raw value.
                    log.warning(
                        c.warning("Attribute '%s' expects a size value", k))
            elif v == BOX:
                nv = getBox(nv, c.pageSize)
            elif v == POS:
                nv = getPos(nv, c.pageSize)
            elif v == INT:
                try:
                    nv = int(nv)
                except (TypeError, ValueError):
                    # Malformed number in the document must not abort the
                    # whole conversion; fall back to the declared default.
                    log.warning(
                        c.warning("Attribute '%s' expects an integer value", k))
                    nv = dfl
            elif v == COLOR:
                nv = getColor(nv)
            elif v == FILE:
                nv = c.getFile(nv)
            elif v == FONT:
                nv = c.getFontName(nv)

            nattrs[k] = nv

    return AttrContainer(nattrs)
# CSS property names that CSSCollect() resolves for every element node.
# Kept as a whitespace separated block and split into a list at import time.
attrNames = '''
color
font-family
font-size
font-weight
font-style
text-decoration
line-height
letter-spacing
background-color
display
margin-left
margin-right
margin-top
margin-bottom
padding-left
padding-right
padding-top
padding-bottom
border-top-color
border-top-style
border-top-width
border-bottom-color
border-bottom-style
border-bottom-width
border-left-color
border-left-style
border-left-width
border-right-color
border-right-style
border-right-width
text-align
vertical-align
width
height
zoom
page-break-after
page-break-before
list-style-type
list-style-image
white-space
text-indent
-pdf-page-break
-pdf-frame-break
-pdf-next-page
-pdf-keep-with-next
-pdf-outline
-pdf-outline-level
-pdf-outline-open
-pdf-line-spacing
-pdf-keep-in-frame-mode
-pdf-word-wrap
'''.strip().split()
def getCSSAttr(self, cssCascade, attrName, default=NotImplemented):
    """
    Resolve one CSS property for a DOM element.

    Lookup order: the per-node cache ``self.cssAttrs``, then the cascade
    (``cssCascade.findStyleFor``), then the element's inline ``style``
    attribute, which overrides the cascade result. A value of ``'inherit'``
    is resolved by asking the parent node.

    :param cssCascade: object providing ``findStyleFor()`` and
        ``parser.parseInline()``.
    :param attrName: CSS property name, e.g. ``"font-size"``.
    :param default: value handed to the cascade and returned when
        ``'inherit'`` cannot be resolved; ``NotImplemented`` means "no
        default".
    :return: the resolved value, or ``None`` if nothing was found.
    :raises LookupError: if the value is ``'inherit'``, the parent cannot be
        asked and no default was supplied.
    """
    if attrName in self.cssAttrs:
        return self.cssAttrs[attrName]

    try:
        result = cssCascade.findStyleFor(self.cssElement, attrName, default)
    except LookupError:
        result = None

    # XXX Workaround for inline styles
    try:
        style = self.cssStyle
    except AttributeError:
        # First access on this node: parse the inline style once and keep it.
        style = self.cssStyle = cssCascade.parser.parseInline(
            self.cssElement.getStyleAttr() or '')[0]

    if attrName in style:
        result = style[attrName]

    if result == 'inherit':
        if hasattr(self.parentNode, 'getCSSAttr'):
            result = self.parentNode.getCSSAttr(cssCascade, attrName, default)
        elif default is not NotImplemented:
            return default
        else:
            # Only an error when there is truly nowhere to inherit from;
            # raising unconditionally would discard the parent's value.
            raise LookupError(
                "Could not find inherited CSS attribute value for '%s'" % (attrName,))

    if result is not None:
        self.cssAttrs[attrName] = result
    return result
# TODO: Monkeypatching standard lib should go away.
# Until then every minidom Element gains getCSSAttr() as a method; CSSCollect()
# calls it on each node and getCSSAttr() itself calls it on the parent node.
xml.dom.minidom.Element.getCSSAttr = getCSSAttr
# Alias table for presentational HTML attributes that browsers tolerate:
# non-standard attribute name -> the CSS property it stands for.
nonStandardAttrNames = {
    'bgcolor': 'background-color',
}


def mapNonStandardAttrs(c, n, attrList):
    """
    Copy aliased HTML attributes into the CSS attribute mapping *c*.

    For every alias present in *attrList* the value is stored in *c* under
    the standard CSS name, unless *c* already has that name. *n* (the node)
    is accepted for signature compatibility and not used. *c* is modified in
    place and returned.
    """
    for alias, cssName in nonStandardAttrNames.items():
        if alias not in attrList:
            continue
        if cssName in c:
            # An explicit CSS value wins over the HTML attribute.
            continue
        c[cssName] = attrList[alias]
    return c
def getCSSAttrCacheKey(node):
    """
    Build the CSSAttrCache key for an element node.

    The key combines the identity of the parent node, the lower-cased tag
    name and the node's ``class``, ``id`` and ``style`` attribute values
    (empty string when an attribute is absent), joined with ``#``.
    """
    selectors = {'class': '', 'id': '', 'style': ''}
    for attrName, attrValue in node.attributes.items():
        if attrName in selectors:
            selectors[attrName] = attrValue
    return "%s#%s#%s#%s#%s" % (
        id(node.parentNode),
        node.tagName.lower(),
        selectors['class'],
        selectors['id'],
        selectors['style'],
    )
def CSSCollect(node, c):
    """
    Return the dict of resolved CSS attributes for *node*.

    When the context has CSS (``c.css``), every name in ``attrNames`` is
    resolved through ``node.getCSSAttr()``; errors for a single property are
    logged at debug level and skipped. The resulting ``node.cssAttrs`` dict
    is stored in ``CSSAttrCache`` and reused for later nodes that produce
    the same cache key, except for nodes whose parent is the ``html``
    element or has no ``tagName``. Without CSS the node's existing
    ``cssAttrs`` is returned unchanged.
    """
    if not c.css:
        return node.cssAttrs

    cacheKey = getCSSAttrCacheKey(node)

    parent = node.parentNode
    if hasattr(parent, "tagName") and parent.tagName.lower() != "html":
        cached = CSSAttrCache.get(cacheKey, None)
        if cached is not None:
            node.cssAttrs = cached
            return cached

    node.cssElement = cssDOMElementInterface.CSSDOMElementInterface(node)
    node.cssAttrs = {}
    for cssAttrName in attrNames:
        try:
            # Called for its side effect on node.cssAttrs; the return value
            # itself is not needed here.
            node.getCSSAttr(c.cssCascade, cssAttrName)
        except Exception:  # TODO: Kill this catch-all!
            log.debug("CSS error '%s'", cssAttrName, exc_info=1)

    CSSAttrCache[cacheKey] = node.cssAttrs
    return node.cssAttrs
def lower(sequence):
    """
    Lower-case a CSS value that is either a string or a sequence.

    A string is lower-cased as a whole; for anything else only the first
    item is taken and lower-cased.
    """
    text = sequence if isinstance(sequence, six.string_types) else sequence[0]
    return text.lower()
def CSS2Frag(c, kw, isBlock):
    """
    Copy the CSS values collected in ``c.cssAttr`` onto the current
    fragment ``c.frag``.

    :param c: parsing context; provides ``cssAttr``, ``frag``,
        ``baseFontSize``, ``getFontName()`` and ``getFile()``.
    :param kw: dict with the running ``"margin-left"`` / ``"margin-right"``
        totals; both entries are increased in place for block elements.
    :param isBlock: when true, margins, list settings, paddings and borders
        are applied in addition to the inline text properties.

    NOTE(review): the block nesting below was restored by hand because this
    file reached review with its indentation stripped - confirm the extent
    of the ``if isBlock:`` sections against version control.
    """
    # COLORS
    if "color" in c.cssAttr:
        c.frag.textColor = getColor(c.cssAttr["color"])
    if "background-color" in c.cssAttr:
        c.frag.backColor = getColor(c.cssAttr["background-color"])
    # FONT SIZE, STYLE, WEIGHT
    if "font-family" in c.cssAttr:
        c.frag.fontName = c.getFontName(c.cssAttr["font-family"])
    if "font-size" in c.cssAttr:
        # XXX inherit
        # The resulting font size is never allowed to drop below 1.0.
        c.frag.fontSize = max(
            getSize("".join(c.cssAttr["font-size"]), c.frag.fontSize, c.baseFontSize), 1.0)
    if "line-height" in c.cssAttr:
        leading = "".join(c.cssAttr["line-height"])
        c.frag.leading = getSize(leading, c.frag.fontSize)
        # Remember the textual value so it can be re-evaluated further down
        # the tree against a different font size (see the else branch).
        c.frag.leadingSource = leading
    else:
        c.frag.leading = getSize(c.frag.leadingSource, c.frag.fontSize)
    if "letter-spacing" in c.cssAttr:
        c.frag.letterSpacing = c.cssAttr["letter-spacing"]
    if "-pdf-line-spacing" in c.cssAttr:
        c.frag.leadingSpace = getSize("".join(c.cssAttr["-pdf-line-spacing"]))
        # print "line-spacing", c.cssAttr["-pdf-line-spacing"], c.frag.leading
    if "font-weight" in c.cssAttr:
        value = lower(c.cssAttr["font-weight"])
        if value in ("bold", "bolder", "500", "600", "700", "800", "900"):
            c.frag.bold = 1
        else:
            c.frag.bold = 0
    # Several decorations may be listed; "none" switches both flags off.
    for value in toList(c.cssAttr.get("text-decoration", "")):
        if "underline" in value:
            c.frag.underline = 1
        if "line-through" in value:
            c.frag.strike = 1
        if "none" in value:
            c.frag.underline = 0
            c.frag.strike = 0
    if "font-style" in c.cssAttr:
        value = lower(c.cssAttr["font-style"])
        if value in ("italic", "oblique"):
            c.frag.italic = 1
        else:
            c.frag.italic = 0
    if "white-space" in c.cssAttr:
        # normal | pre | nowrap
        c.frag.whiteSpace = str(c.cssAttr["white-space"]).lower()
    # ALIGN & VALIGN
    if "text-align" in c.cssAttr:
        c.frag.alignment = getAlign(c.cssAttr["text-align"])
    if "vertical-align" in c.cssAttr:
        c.frag.vAlign = c.cssAttr["vertical-align"]
    # HEIGHT & WIDTH
    # Both are stored as joined strings; "auto" is normalised to None.
    if "height" in c.cssAttr:
        try:
            # XXX Relative is not correct!
            c.frag.height = "".join(toList(c.cssAttr["height"]))
        except TypeError:
            # sequence item 0: expected string, tuple found
            c.frag.height = "".join(toList(c.cssAttr["height"][0]))
        if c.frag.height in ("auto",):
            c.frag.height = None
    if "width" in c.cssAttr:
        try:
            # XXX Relative is not correct!
            c.frag.width = "".join(toList(c.cssAttr["width"]))
        except TypeError:
            c.frag.width = "".join(toList(c.cssAttr["width"][0]))
        if c.frag.width in ("auto",):
            c.frag.width = None
    # ZOOM
    if "zoom" in c.cssAttr:
        # XXX Relative is not correct!
        zoom = "".join(toList(c.cssAttr["zoom"]))
        if zoom.endswith("%"):
            # Percentages become a factor, e.g. "150%" -> 1.5.
            zoom = float(zoom[: - 1]) / 100.0
        c.frag.zoom = float(zoom)
    # MARGINS & LIST INDENT, STYLE
    if isBlock:
        transform_attrs(c.frag,
                        (("spaceBefore", "margin-top"),
                         ("spaceAfter", "margin-bottom"),
                         ("firstLineIndent", "text-indent"),
                         ),
                        c.cssAttr,
                        getSize,
                        extras=c.frag.fontSize
                        )
        if "margin-left" in c.cssAttr:
            # The bullet sits at the indent in effect before this element's
            # own margin is added.
            c.frag.bulletIndent = kw["margin-left"]  # For lists
            kw["margin-left"] += getSize(c.cssAttr["margin-left"],
                                         c.frag.fontSize)
            c.frag.leftIndent = kw["margin-left"]
        if "margin-right" in c.cssAttr:
            kw["margin-right"] += getSize(
                c.cssAttr["margin-right"], c.frag.fontSize)
            c.frag.rightIndent = kw["margin-right"]
        if "list-style-type" in c.cssAttr:
            c.frag.listStyleType = str(c.cssAttr["list-style-type"]).lower()
        if "list-style-image" in c.cssAttr:
            c.frag.listStyleImage = c.getFile(c.cssAttr["list-style-image"])
    # PADDINGS
    if isBlock:
        transform_attrs(c.frag,
                        (("paddingTop", "padding-top"),
                         ("paddingBottom", "padding-bottom"),
                         ("paddingLeft", "padding-left"),
                         ("paddingRight", "padding-right"),
                         ),
                        c.cssAttr,
                        getSize,
                        extras=c.frag.fontSize
                        )
    # BORDERS
    if isBlock:
        transform_attrs(c.frag,
                        (("borderTopWidth", "border-top-width"),
                         ("borderBottomWidth", "border-bottom-width"),
                         ("borderLeftWidth", "border-left-width"),
                         ("borderRightWidth", "border-right-width"),
                         ),
                        c.cssAttr,
                        getSize,
                        extras=c.frag.fontSize
                        )
        # Border styles are taken over verbatim (identity conversion).
        transform_attrs(c.frag,
                        (
                            ("borderTopStyle", "border-top-style"),
                            ("borderBottomStyle", "border-bottom-style"),
                            ("borderLeftStyle", "border-left-style"),
                            ("borderRightStyle", "border-right-style")
                        ),
                        c.cssAttr,
                        lambda x: x
                        )
        transform_attrs(c.frag,
                        (
                            ("borderTopColor", "border-top-color"),
                            ("borderBottomColor", "border-bottom-color"),
                            ("borderLeftColor", "border-left-color"),
                            ("borderRightColor", "border-right-color")
                        ),
                        c.cssAttr,
                        getColor
                        )
def pisaPreLoop(node, context, collect=False):
    """
    Collect all CSS definitions

    Walks the DOM below *node*. The text of every ``<style>`` element and an
    ``@import`` rule for every ``<link rel="stylesheet">`` element is handed
    to ``context.addCSS()``, provided the element's type is empty or
    ``text/css`` and its media list is empty or names ``all``, ``print`` or
    ``pdf``.

    :param node: DOM node to start from.
    :param context: parsing context receiving the CSS via ``addCSS()``.
    :param collect: internal flag; when true the text content below *node*
        is gathered and returned instead of being discarded.
    :return: the collected text (empty unless *collect* is true).
    """
    data = u""
    if node.nodeType == Node.TEXT_NODE and collect:
        data = node.data

    elif node.nodeType == Node.ELEMENT_NODE:
        name = node.tagName.lower()

        if name in ("style", "link"):
            attr = pisaGetAttributes(context, name, node.attributes)
            # Attributes that are declared but absent come back as None, so
            # substitute "" before calling string methods on them.
            media = [x.strip()
                     for x in (attr.get("media") or "").lower().split(",") if x.strip()]
            cssType = (attr.get("type") or "").lower()

            if cssType in ("", "text/css") and \
                    (not media or "all" in media or "print" in media or "pdf" in media):

                if name == "style":
                    for child in node.childNodes:
                        data += pisaPreLoop(child, context, collect=True)
                    context.addCSS(data)
                    return u""

                href = attr.get("href")
                rel = (attr.get("rel") or "").lower()
                if name == "link" and href and rel == "stylesheet":
                    # print "CSS LINK", attr
                    context.addCSS('\n@import "%s" %s;' %
                                   (href, ",".join(media)))

    for child in node.childNodes:
        result = pisaPreLoop(child, context, collect=collect)
        if collect:
            data += result

    return data
def pisaLoop(node, context, path=None, **kw):
    """
    Recursively translate the DOM below *node* into story content on
    *context*.

    Text nodes are added as fragments. For element nodes the attributes and
    CSS are resolved, page/frame breaks are emitted for block elements, the
    fragment style is pushed, an optional ``pisaTag<NAME>`` handler found in
    this module's globals is run around the children, and the fragment style
    is restored afterwards. ``<style>`` and ``<script>`` elements and
    elements with ``display: none`` are skipped together with their
    children. Any other node type just has its children visited.

    :param node: DOM node to process.
    :param context: parsing context that receives fragments and flowables.
    :param path: list of tag names leading to *node* (built up internally).
    :param kw: running ``margin-*`` totals handed down to child elements.

    NOTE(review): block nesting was restored by hand because this file
    reached review with its indentation stripped - confirm against version
    control.
    """
    if path is None:
        path = []

    # Initialize KW
    if not kw:
        kw = {
            "margin-top": 0,
            "margin-bottom": 0,
            "margin-left": 0,
            "margin-right": 0,
        }
    else:
        kw = copy.copy(kw)

    # TEXT
    if node.nodeType == Node.TEXT_NODE:
        context.addFrag(node.data)

    # ELEMENT
    elif node.nodeType == Node.ELEMENT_NODE:
        node.tagName = node.tagName.replace(":", "").lower()

        if node.tagName in ("style", "script"):
            return

        path = copy.copy(path) + [node.tagName]

        # Prepare attributes
        attr = pisaGetAttributes(context, node.tagName, node.attributes)

        # Calculate styles
        context.cssAttr = CSSCollect(node, context)
        context.cssAttr = mapNonStandardAttrs(context.cssAttr, node, attr)
        context.node = node

        # Block?
        PAGE_BREAK = 1
        PAGE_BREAK_RIGHT = 2
        PAGE_BREAK_LEFT = 3

        pageBreakAfter = False
        frameBreakAfter = False
        display = lower(context.cssAttr.get("display", "inline"))
        isBlock = (display == "block")

        if isBlock:
            context.addPara()

            # Page break by CSS
            if "-pdf-next-page" in context.cssAttr:
                context.addStory(
                    NextPageTemplate(str(context.cssAttr["-pdf-next-page"])))
            if "-pdf-page-break" in context.cssAttr:
                if str(context.cssAttr["-pdf-page-break"]).lower() == "before":
                    context.addStory(PageBreak())
            if "-pdf-frame-break" in context.cssAttr:
                frameBreak = str(context.cssAttr["-pdf-frame-break"]).lower()
                if frameBreak == "before":
                    context.addStory(FrameBreak())
                if frameBreak == "after":
                    frameBreakAfter = True
            if "page-break-before" in context.cssAttr:
                breakBefore = str(context.cssAttr["page-break-before"]).lower()
                if breakBefore == "always":
                    context.addStory(PageBreak())
                if breakBefore == "right":
                    context.addStory(PageBreak())
                    context.addStory(PmlRightPageBreak())
                if breakBefore == "left":
                    context.addStory(PageBreak())
                    context.addStory(PmlLeftPageBreak())
            if "page-break-after" in context.cssAttr:
                # Only remembered here; emitted once the children are done.
                breakAfter = str(context.cssAttr["page-break-after"]).lower()
                if breakAfter == "always":
                    pageBreakAfter = PAGE_BREAK
                if breakAfter == "right":
                    pageBreakAfter = PAGE_BREAK_RIGHT
                if breakAfter == "left":
                    pageBreakAfter = PAGE_BREAK_LEFT

        if display == "none":
            return

        # Translate CSS to frags

        # Save previous frag styles
        context.pushFrag()

        # Map styles to Reportlab fragment properties
        CSS2Frag(context, kw, isBlock)

        # EXTRAS
        transform_attrs(context.frag,
                        (
                            ("keepWithNext", "-pdf-keep-with-next"),
                            ("outline", "-pdf-outline"),
                            # Was mapped onto "borderLeftColor", which
                            # replaced the border colour with a boolean.
                            # NOTE(review): "outlineOpen" is the fragment
                            # attribute this flag is presumed to feed -
                            # confirm against the fragment defaults.
                            ("outlineOpen", "-pdf-outline-open"),
                        ),
                        context.cssAttr,
                        getBool
                        )

        if "-pdf-outline-level" in context.cssAttr:
            context.frag.outlineLevel = int(
                context.cssAttr["-pdf-outline-level"])

        if "-pdf-word-wrap" in context.cssAttr:
            context.frag.wordWrap = context.cssAttr["-pdf-word-wrap"]

        # handle keep-in-frame
        keepInFrameMode = None
        keepInFrameMaxWidth = 0
        keepInFrameMaxHeight = 0
        if "-pdf-keep-in-frame-mode" in context.cssAttr:
            value = str(
                context.cssAttr["-pdf-keep-in-frame-mode"]).strip().lower()
            if value in ("shrink", "error", "overflow", "truncate"):
                keepInFrameMode = value
            else:
                # Added because we need a default value.
                keepInFrameMode = "shrink"
        if "-pdf-keep-in-frame-max-width" in context.cssAttr:
            keepInFrameMaxWidth = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-width"]))
        if "-pdf-keep-in-frame-max-height" in context.cssAttr:
            keepInFrameMaxHeight = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-height"]))

        # ignore nested keep-in-frames, tables have their own KIF handling
        keepInFrame = keepInFrameMode is not None and context.keepInFrameIndex is None
        if keepInFrame:
            # keep track of current story index, so we can wrap everything
            # added after this point in a KeepInFrame
            context.keepInFrameIndex = len(context.story)

        # BEGIN tag
        klass = globals().get("pisaTag%s" %
                              node.tagName.replace(":", "").upper(), None)
        obj = None

        # Static block
        elementId = attr.get("id", None)
        staticFrame = context.frameStatic.get(elementId, None)
        if staticFrame:
            context.frag.insideStaticFrame += 1
            oldStory = context.swapStory()

        # Tag specific operations
        if klass is not None:
            obj = klass(node, attr)
            obj.start(context)

        # Visit child nodes
        context.fragBlock = fragBlock = copy.copy(context.frag)
        for nnode in node.childNodes:
            pisaLoop(nnode, context, path, **kw)
            # Children may have replaced fragBlock; restore ours.
            context.fragBlock = fragBlock

        # END tag
        if obj:
            obj.end(context)

        # Block?
        if isBlock:
            context.addPara()

            # XXX Buggy!

            # Page break by CSS
            if pageBreakAfter:
                context.addStory(PageBreak())
                if pageBreakAfter == PAGE_BREAK_RIGHT:
                    context.addStory(PmlRightPageBreak())
                if pageBreakAfter == PAGE_BREAK_LEFT:
                    context.addStory(PmlLeftPageBreak())
            if frameBreakAfter:
                context.addStory(FrameBreak())

        if keepInFrame:
            # get all content added after start of -pdf-keep-in-frame and wrap
            # it in a KeepInFrame
            substory = context.story[context.keepInFrameIndex:]
            context.story = context.story[:context.keepInFrameIndex]
            context.story.append(
                KeepInFrame(
                    content=substory,
                    maxWidth=keepInFrameMaxWidth,
                    maxHeight=keepInFrameMaxHeight,
                    mode=keepInFrameMode))
            # mode wasn't being used; it is necessary for tables or images at
            # end of page.
            context.keepInFrameIndex = None

        # Static block, END
        if staticFrame:
            context.addPara()
            for frame in staticFrame:
                frame.pisaStaticStory = context.story
            context.swapStory(oldStory)
            context.frag.insideStaticFrame -= 1

        # Reset frag style
        context.pullFrag()

    # Unknown or not handled
    else:
        # Loop over children
        for child in node.childNodes:
            pisaLoop(child, context, path, **kw)
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS informations, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    :param src: the markup, as text or as something ``html5lib`` can parse.
        Text is encoded (UTF-8 unless *encoding* is given) and wrapped in a
        ``pisaTempFile`` first.
    :param context: parsing context that is filled and returned.
    :param default_css: CSS text registered before the document's own CSS.
    :param xhtml: request the XHTML parser; if the installed html5lib has no
        ``XHTMLParser`` the regular ``HTMLParser`` is used instead.
    :param encoding: encoding for text input and for *xml_output*.
    :param xml_output: optional writable object that receives the parsed
        document as pretty-printed XML.
    :return: *context*
    """
    global CSSAttrCache
    # Cache entries embed id() values of DOM nodes, so never reuse them
    # across documents.
    CSSAttrCache = {}

    parserClass = html5lib.HTMLParser
    if xhtml:
        # XHTMLParser is not available in every html5lib release; asking for
        # it unconditionally raised AttributeError.
        parserClass = getattr(html5lib, "XHTMLParser", html5lib.HTMLParser)
    parser = parserClass(tree=treebuilders.getTreeBuilder("dom"))

    if isinstance(src, six.text_type):
        # If an encoding was provided, do not change it.
        if not encoding:
            encoding = "utf-8"
        src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    document = parser.parse(
        src,
    )  # encoding=encoding)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    # First pass gathers the CSS, second pass builds the story with it.
    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)
    return context
# Shortcuts: alternative public names for the parser entry point.
HTML2PDF = pisaParser


def XHTML2PDF(*a, **kw):
    """Call HTML2PDF with the same arguments but with ``xhtml`` forced on."""
    kw.update(xhtml=True)
    return HTML2PDF(*a, **kw)


XML2PDF = XHTML2PDF