Gemfury

doctorondemand / xhtml2pdf python

Repository URL to install this package:
Details
xhtml2pdf / xhtml2pdf / parser.py
# -*- coding: utf-8 -*-

# Copyright 2010 Dirk Holtwick, holtwick.it
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, unicode_literals

import copy
import logging
import re
from xml.dom import Node
import xml.dom.minidom

from html5lib import treebuilders  # , inputstream
import html5lib
from reportlab.platypus.doctemplate import NextPageTemplate, FrameBreak
from reportlab.platypus.flowables import PageBreak, KeepInFrame
import six

from xhtml2pdf.default import BOX, POS, MUST, FONT
from xhtml2pdf.default import TAGS, STRING, INT, BOOL, SIZE, COLOR, FILE
from xhtml2pdf.tables import *  # TODO: Kill wild import!
from xhtml2pdf.tags import *  # TODO: Kill wild import!
from xhtml2pdf.util import getBox, getPos, pisaTempFile, transform_attrs
from xhtml2pdf.util import getSize, getBool, toList, getColor, getAlign
import xhtml2pdf.w3c.cssDOMElementInterface as cssDOMElementInterface
from xhtml2pdf.xhtml2pdf_reportlab import PmlRightPageBreak, PmlLeftPageBreak


CSSAttrCache = {}

log = logging.getLogger("xhtml2pdf")

rxhttpstrip = re.compile("https?://[^/]+(.*)", re.M | re.I)


class AttrContainer(dict):

    def __getattr__(self, name):
        try:
            return dict.__getattr__(self, name)
        except:
            return self[name]


def pisaGetAttributes(c, tag, attributes):
    global TAGS

    attrs = {}
    if attributes:
        for k, v in attributes.items():
            try:
                # XXX no Unicode! Reportlab fails with template names
                attrs[str(k)] = str(v)
            except:
                attrs[k] = v

    nattrs = {}
    if tag in TAGS:
        block, adef = TAGS[tag]
        adef["id"] = STRING

        for k, v in six.iteritems(adef):
            nattrs[k] = None
            # print k, v
            # defaults, wenn vorhanden
            if type(v) == tuple:
                if v[1] == MUST:
                    if k not in attrs:
                        log.warning(
                            c.warning("Attribute '%s' must be set!", k))
                        nattrs[k] = None
                        continue
                nv = attrs.get(k, v[1])
                dfl = v[1]
                v = v[0]
            else:
                nv = attrs.get(k, None)
                dfl = None

            if nv is not None:
                if type(v) == list:
                    nv = nv.strip().lower()
                    if nv not in v:
                        #~ raise PML_EXCEPTION, "attribute '%s' of wrong value, allowed is one of: %s" % (k, repr(v))
                        log.warning(
                            c.warning("Attribute '%s' of wrong value, allowed is one of: %s", k, repr(v)))
                        nv = dfl

                elif v == BOOL:
                    nv = nv.strip().lower()
                    nv = nv in ("1", "y", "yes", "true", str(k))

                elif v == SIZE:
                    try:
                        nv = getSize(nv)
                    except:
                        log.warning(
                            c.warning("Attribute '%s' expects a size value", k))

                elif v == BOX:
                    nv = getBox(nv, c.pageSize)

                elif v == POS:
                    nv = getPos(nv, c.pageSize)

                elif v == INT:
                    nv = int(nv)

                elif v == COLOR:
                    nv = getColor(nv)

                elif v == FILE:
                    nv = c.getFile(nv)

                elif v == FONT:
                    nv = c.getFontName(nv)

                nattrs[k] = nv

    return AttrContainer(nattrs)


attrNames = '''
    color
    font-family
    font-size
    font-weight
    font-style
    text-decoration
    line-height
    letter-spacing
    background-color
    display
    margin-left
    margin-right
    margin-top
    margin-bottom
    padding-left
    padding-right
    padding-top
    padding-bottom
    border-top-color
    border-top-style
    border-top-width
    border-bottom-color
    border-bottom-style
    border-bottom-width
    border-left-color
    border-left-style
    border-left-width
    border-right-color
    border-right-style
    border-right-width
    text-align
    vertical-align
    width
    height
    zoom
    page-break-after
    page-break-before
    list-style-type
    list-style-image
    white-space
    text-indent
    -pdf-page-break
    -pdf-frame-break
    -pdf-next-page
    -pdf-keep-with-next
    -pdf-outline
    -pdf-outline-level
    -pdf-outline-open
    -pdf-line-spacing
    -pdf-keep-in-frame-mode
    -pdf-word-wrap
    '''.strip().split()


def getCSSAttr(self, cssCascade, attrName, default=NotImplemented):
    if attrName in self.cssAttrs:
        return self.cssAttrs[attrName]

    try:
        result = cssCascade.findStyleFor(self.cssElement, attrName, default)
    except LookupError:
        result = None

    # XXX Workaround for inline styles
    try:
        style = self.cssStyle
    except:
        style = self.cssStyle = cssCascade.parser.parseInline(
            self.cssElement.getStyleAttr() or '')[0]
    if attrName in style:
        result = style[attrName]

    if result == 'inherit':
        if hasattr(self.parentNode, 'getCSSAttr'):
            result = self.parentNode.getCSSAttr(cssCascade, attrName, default)
        elif default is not NotImplemented:
            return default
        raise LookupError(
            "Could not find inherited CSS attribute value for '%s'" % (attrName,))

    if result is not None:
        self.cssAttrs[attrName] = result
    return result


# TODO: Monkeypatching standard lib should go away.
xml.dom.minidom.Element.getCSSAttr = getCSSAttr

# Create an aliasing system.  Many sources use non-standard tags, because browsers allow
# them to.  This allows us to map a nonstandard name to the standard one.
nonStandardAttrNames = {
    'bgcolor': 'background-color',
}


def mapNonStandardAttrs(c, n, attrList):
    for attr in nonStandardAttrNames:
        if attr in attrList and nonStandardAttrNames[attr] not in c:
            c[nonStandardAttrNames[attr]] = attrList[attr]
    return c


def getCSSAttrCacheKey(node):
    _cl = _id = _st = ''
    for k, v in node.attributes.items():
        if k == 'class':
            _cl = v
        elif k == 'id':
            _id = v
        elif k == 'style':
            _st = v
    return "%s#%s#%s#%s#%s" % (id(node.parentNode), node.tagName.lower(), _cl, _id, _st)


def CSSCollect(node, c):
    #node.cssAttrs = {}
    # return node.cssAttrs

    if c.css:

        _key = getCSSAttrCacheKey(node)

        if hasattr(node.parentNode, "tagName"):
            if node.parentNode.tagName.lower() != "html":
                CachedCSSAttr = CSSAttrCache.get(_key, None)
                if CachedCSSAttr is not None:
                    node.cssAttrs = CachedCSSAttr
                    return CachedCSSAttr

        node.cssElement = cssDOMElementInterface.CSSDOMElementInterface(node)
        node.cssAttrs = {}
        # node.cssElement.onCSSParserVisit(c.cssCascade.parser)
        cssAttrMap = {}
        for cssAttrName in attrNames:
            try:
                cssAttrMap[cssAttrName] = node.getCSSAttr(
                    c.cssCascade, cssAttrName)
            # except LookupError:
            #    pass
            except Exception:  # TODO: Kill this catch-all!
                log.debug("CSS error '%s'", cssAttrName, exc_info=1)

        CSSAttrCache[_key] = node.cssAttrs

    return node.cssAttrs


def lower(sequence):
    if isinstance(sequence, six.string_types):
        return sequence.lower()
    else:
        return sequence[0].lower()


def CSS2Frag(c, kw, isBlock):
    # COLORS
    if "color" in c.cssAttr:
        c.frag.textColor = getColor(c.cssAttr["color"])
    if "background-color" in c.cssAttr:
        c.frag.backColor = getColor(c.cssAttr["background-color"])
        # FONT SIZE, STYLE, WEIGHT
    if "font-family" in c.cssAttr:
        c.frag.fontName = c.getFontName(c.cssAttr["font-family"])
    if "font-size" in c.cssAttr:
        # XXX inherit
        c.frag.fontSize = max(
            getSize("".join(c.cssAttr["font-size"]), c.frag.fontSize, c.baseFontSize), 1.0)
    if "line-height" in c.cssAttr:
        leading = "".join(c.cssAttr["line-height"])
        c.frag.leading = getSize(leading, c.frag.fontSize)
        c.frag.leadingSource = leading
    else:
        c.frag.leading = getSize(c.frag.leadingSource, c.frag.fontSize)
    if "letter-spacing" in c.cssAttr:
        c.frag.letterSpacing = c.cssAttr["letter-spacing"]
    if "-pdf-line-spacing" in c.cssAttr:
        c.frag.leadingSpace = getSize("".join(c.cssAttr["-pdf-line-spacing"]))
        # print "line-spacing", c.cssAttr["-pdf-line-spacing"], c.frag.leading
    if "font-weight" in c.cssAttr:
        value = lower(c.cssAttr["font-weight"])
        if value in ("bold", "bolder", "500", "600", "700", "800", "900"):
            c.frag.bold = 1
        else:
            c.frag.bold = 0
    for value in toList(c.cssAttr.get("text-decoration", "")):
        if "underline" in value:
            c.frag.underline = 1
        if "line-through" in value:
            c.frag.strike = 1
        if "none" in value:
            c.frag.underline = 0
            c.frag.strike = 0
    if "font-style" in c.cssAttr:
        value = lower(c.cssAttr["font-style"])
        if value in ("italic", "oblique"):
            c.frag.italic = 1
        else:
            c.frag.italic = 0
    if "white-space" in c.cssAttr:
        # normal | pre | nowrap
        c.frag.whiteSpace = str(c.cssAttr["white-space"]).lower()
        # ALIGN & VALIGN
    if "text-align" in c.cssAttr:
        c.frag.alignment = getAlign(c.cssAttr["text-align"])
    if "vertical-align" in c.cssAttr:
        c.frag.vAlign = c.cssAttr["vertical-align"]
        # HEIGHT & WIDTH
    if "height" in c.cssAttr:
        try:
            # XXX Relative is not correct!
            c.frag.height = "".join(toList(c.cssAttr["height"]))
        except TypeError:
            # sequence item 0: expected string, tuple found
            c.frag.height = "".join(toList(c.cssAttr["height"][0]))
        if c.frag.height in ("auto",):
            c.frag.height = None
    if "width" in c.cssAttr:
        try:
            # XXX Relative is not correct!
            c.frag.width = "".join(toList(c.cssAttr["width"]))
        except TypeError:
            c.frag.width = "".join(toList(c.cssAttr["width"][0]))
        if c.frag.width in ("auto",):
            c.frag.width = None
        # ZOOM
    if "zoom" in c.cssAttr:
        # XXX Relative is not correct!
        zoom = "".join(toList(c.cssAttr["zoom"]))
        if zoom.endswith("%"):
            zoom = float(zoom[: - 1]) / 100.0
        c.frag.zoom = float(zoom)
        # MARGINS & LIST INDENT, STYLE
    if isBlock:
        transform_attrs(c.frag,
                        (("spaceBefore", "margin-top"),
                         ("spaceAfter", "margin-bottom"),
                         ("firstLineIndent", "text-indent"),
                         ),
                        c.cssAttr,
                        getSize,
                        extras=c.frag.fontSize
                        )

        if "margin-left" in c.cssAttr:
            c.frag.bulletIndent = kw["margin-left"]  # For lists
            kw["margin-left"] += getSize(c.cssAttr["margin-left"],
                                         c.frag.fontSize)
            c.frag.leftIndent = kw["margin-left"]
        if "margin-right" in c.cssAttr:
            kw["margin-right"] += getSize(
                c.cssAttr["margin-right"], c.frag.fontSize)
            c.frag.rightIndent = kw["margin-right"]

        if "list-style-type" in c.cssAttr:
            c.frag.listStyleType = str(c.cssAttr["list-style-type"]).lower()
        if "list-style-image" in c.cssAttr:
            c.frag.listStyleImage = c.getFile(c.cssAttr["list-style-image"])
        # PADDINGS
    if isBlock:
        transform_attrs(c.frag,
                        (("paddingTop", "padding-top"),
                         ("paddingBottom", "padding-bottom"),
                         ("paddingLeft", "padding-left"),
                         ("paddingRight", "padding-right"),
                         ),
                        c.cssAttr,
                        getSize,
                        extras=c.frag.fontSize
                        )

        # BORDERS
    if isBlock:
        transform_attrs(c.frag,
                        (("borderTopWidth", "border-top-width"),
                         ("borderBottomWidth", "border-bottom-width"),
                         ("borderLeftWidth", "border-left-width"),
                         ("borderRightWidth", "border-right-width"),
                         ),
                        c.cssAttr,
                        getSize,
                        extras=c.frag.fontSize
                        )
        transform_attrs(c.frag,
                        (
                            ("borderTopStyle", "border-top-style"),
                            ("borderBottomStyle", "border-bottom-style"),
                            ("borderLeftStyle", "border-left-style"),
                            ("borderRightStyle", "border-right-style")
                        ),
                        c.cssAttr,
                        lambda x: x
                        )

        transform_attrs(c.frag,
                        (
                            ("borderTopColor", "border-top-color"),
                            ("borderBottomColor", "border-bottom-color"),
                            ("borderLeftColor", "border-left-color"),
                            ("borderRightColor",  "border-right-color")
                        ),
                        c.cssAttr,
                        getColor
                        )


def pisaPreLoop(node, context, collect=False):
    """
    Collect all CSS definitions
    """

    data = u""
    if node.nodeType == Node.TEXT_NODE and collect:
        data = node.data

    elif node.nodeType == Node.ELEMENT_NODE:
        name = node.tagName.lower()

        if name in ("style", "link"):
            attr = pisaGetAttributes(context, name, node.attributes)
            media = [x.strip()
                     for x in attr.media.lower().split(",") if x.strip()]

            if attr.get("type", "").lower() in ("", "text/css") and \
                    (not media or "all" in media or "print" in media or "pdf" in media):

                if name == "style":
                    for node in node.childNodes:
                        data += pisaPreLoop(node, context, collect=True)
                    context.addCSS(data)
                    return u""

                if name == "link" and attr.href and attr.rel.lower() == "stylesheet":
                    # print "CSS LINK", attr
                    context.addCSS('\n@import "%s" %s;' %
                                   (attr.href, ",".join(media)))

    for node in node.childNodes:
        result = pisaPreLoop(node, context, collect=collect)
        if collect:
            data += result

    return data


def pisaLoop(node, context, path=None, **kw):
    if path is None:
        path = []

    # Initialize KW
    if not kw:
        kw = {
            "margin-top": 0,
            "margin-bottom": 0,
            "margin-left": 0,
            "margin-right": 0,
        }
    else:
        kw = copy.copy(kw)

    # indent = len(path) * "  " # only used for debug print statements

    # TEXT
    if node.nodeType == Node.TEXT_NODE:
        # print indent, "#", repr(node.data) #, context.frag
        context.addFrag(node.data)

        # context.text.append(node.value)

    # ELEMENT
    elif node.nodeType == Node.ELEMENT_NODE:

        node.tagName = node.tagName.replace(":", "").lower()

        if node.tagName in ("style", "script"):
            return

        path = copy.copy(path) + [node.tagName]

        # Prepare attributes
        attr = pisaGetAttributes(context, node.tagName, node.attributes)
        # log.debug(indent + "<%s %s>" % (node.tagName, attr) +
        # repr(node.attributes.items())) #, path

        # Calculate styles
        context.cssAttr = CSSCollect(node, context)
        context.cssAttr = mapNonStandardAttrs(context.cssAttr, node, attr)
        context.node = node

        # Block?
        PAGE_BREAK = 1
        PAGE_BREAK_RIGHT = 2
        PAGE_BREAK_LEFT = 3

        pageBreakAfter = False
        frameBreakAfter = False
        display = lower(context.cssAttr.get("display", "inline"))
        # print indent, node.tagName, display,
        # context.cssAttr.get("background-color", None), attr
        isBlock = (display == "block")

        if isBlock:
            context.addPara()

            # Page break by CSS
            if "-pdf-next-page" in context.cssAttr:
                context.addStory(
                    NextPageTemplate(str(context.cssAttr["-pdf-next-page"])))
            if "-pdf-page-break" in context.cssAttr:
                if str(context.cssAttr["-pdf-page-break"]).lower() == "before":
                    context.addStory(PageBreak())
            if "-pdf-frame-break" in context.cssAttr:
                if str(context.cssAttr["-pdf-frame-break"]).lower() == "before":
                    context.addStory(FrameBreak())
                if str(context.cssAttr["-pdf-frame-break"]).lower() == "after":
                    frameBreakAfter = True
            if "page-break-before" in context.cssAttr:
                if str(context.cssAttr["page-break-before"]).lower() == "always":
                    context.addStory(PageBreak())
                if str(context.cssAttr["page-break-before"]).lower() == "right":
                    context.addStory(PageBreak())
                    context.addStory(PmlRightPageBreak())
                if str(context.cssAttr["page-break-before"]).lower() == "left":
                    context.addStory(PageBreak())
                    context.addStory(PmlLeftPageBreak())
            if "page-break-after" in context.cssAttr:
                if str(context.cssAttr["page-break-after"]).lower() == "always":
                    pageBreakAfter = PAGE_BREAK
                if str(context.cssAttr["page-break-after"]).lower() == "right":
                    pageBreakAfter = PAGE_BREAK_RIGHT
                if str(context.cssAttr["page-break-after"]).lower() == "left":
                    pageBreakAfter = PAGE_BREAK_LEFT

        if display == "none":
            # print "none!"
            return

        # Translate CSS to frags

        # Save previous frag styles
        context.pushFrag()

        # Map styles to Reportlab fragment properties
        CSS2Frag(context, kw, isBlock)

        # EXTRAS
        transform_attrs(context.frag,
                        (
                            ("keepWithNext", "-pdf-keep-with-next"),
                            ("outline", "-pdf-outline"),
                            ("borderLeftColor", "-pdf-outline-open"),
                        ),
                        context.cssAttr,
                        getBool
                        )

        if "-pdf-outline-level" in context.cssAttr:
            context.frag.outlineLevel = int(
                context.cssAttr["-pdf-outline-level"])

        if "-pdf-word-wrap" in context.cssAttr:
            context.frag.wordWrap = context.cssAttr["-pdf-word-wrap"]

        # handle keep-in-frame
        keepInFrameMode = None
        keepInFrameMaxWidth = 0
        keepInFrameMaxHeight = 0
        if "-pdf-keep-in-frame-mode" in context.cssAttr:
            value = str(
                context.cssAttr["-pdf-keep-in-frame-mode"]).strip().lower()
            if value in ("shrink", "error", "overflow", "truncate"):
                keepInFrameMode = value
            else:
                keepInFrameMode = "shrink"
            # Added because we need a default value.

        if "-pdf-keep-in-frame-max-width" in context.cssAttr:
            keepInFrameMaxWidth = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-width"]))
        if "-pdf-keep-in-frame-max-height" in context.cssAttr:
            keepInFrameMaxHeight = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-height"]))

        # ignore nested keep-in-frames, tables have their own KIF handling
        keepInFrame = keepInFrameMode is not None and context.keepInFrameIndex is None
        if keepInFrame:
            # keep track of current story index, so we can wrap everythink
            # added after this point in a KeepInFrame
            context.keepInFrameIndex = len(context.story)

        # BEGIN tag
        klass = globals().get("pisaTag%s" %
                              node.tagName.replace(":", "").upper(), None)
        obj = None

        # Static block
        elementId = attr.get("id", None)
        staticFrame = context.frameStatic.get(elementId, None)
        if staticFrame:
            context.frag.insideStaticFrame += 1
            oldStory = context.swapStory()

        # Tag specific operations
        if klass is not None:
            obj = klass(node, attr)
            obj.start(context)

        # Visit child nodes
        context.fragBlock = fragBlock = copy.copy(context.frag)
        for nnode in node.childNodes:
            pisaLoop(nnode, context, path, **kw)
        context.fragBlock = fragBlock

        # END tag
        if obj:
            obj.end(context)

        # Block?
        if isBlock:
            context.addPara()

            # XXX Buggy!

            # Page break by CSS
            if pageBreakAfter:
                context.addStory(PageBreak())
                if pageBreakAfter == PAGE_BREAK_RIGHT:
                    context.addStory(PmlRightPageBreak())
                if pageBreakAfter == PAGE_BREAK_LEFT:
                    context.addStory(PmlLeftPageBreak())
            if frameBreakAfter:
                context.addStory(FrameBreak())

        if keepInFrame:
            # get all content added after start of -pdf-keep-in-frame and wrap
            # it in a KeepInFrame
            substory = context.story[context.keepInFrameIndex:]
            context.story = context.story[:context.keepInFrameIndex]
            context.story.append(
                KeepInFrame(
                    content=substory,
                    maxWidth=keepInFrameMaxWidth,
                    maxHeight=keepInFrameMaxHeight,
                    mode=keepInFrameMode))
            # mode wasn't being used; it is necessary for tables or images at
            # end of page.
            context.keepInFrameIndex = None

        # Static block, END
        if staticFrame:
            context.addPara()
            for frame in staticFrame:
                frame.pisaStaticStory = context.story
            context.swapStory(oldStory)
            context.frag.insideStaticFrame -= 1

        # context.debug(1, indent, "</%s>" % (node.tagName))

        # Reset frag style
        context.pullFrag()

    # Unknown or not handled
    else:
        # context.debug(1, indent, "???", node, node.nodeType, repr(node))
        # Loop over children
        for node in node.childNodes:
            pisaLoop(node, context, path, **kw)


def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS informations, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object
    """

    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        # TODO: XHTMLParser doesn't see to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if isinstance(src, six.text_type):
        # If an encoding was provided, do not change it.
        if not encoding:
            encoding = "utf-8"
        src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # # Test for the restrictions of html5lib
    # if encoding:
    #     # Workaround for html5lib<0.11.1
    #     if hasattr(inputstream, "isValidEncoding"):
    #         if encoding.strip().lower() == "utf8":
    #             encoding = "utf-8"
    #         if not inputstream.isValidEncoding(encoding):
    #             log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
    #     else:
    #         if inputstream.codecName(encoding) is None:
    #             log.error("%r is not a valid encoding", encoding)
    document = parser.parse(
        src,
    )  # encoding=encoding)

    if xml_output:
        if encoding:
            xml_output.write(document.toprettyxml(encoding=encoding))
        else:
            xml_output.write(document.toprettyxml(encoding="utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    # try:
    context.parseCSS()
    # except:
    #    context.cssText = DEFAULT_CSS
    #    context.parseCSS()
    # context.debug(9, pprint.pformat(context.css))

    pisaLoop(document, context)
    return context


# Shortcuts

HTML2PDF = pisaParser


def XHTML2PDF(*a, **kw):
    kw["xhtml"] = True
    return HTML2PDF(*a, **kw)


XML2PDF = XHTML2PDF
doctorondemand / xhtml2pdf python

Products

About

Resources

Contact Gemfury