# 3stack-rfc6266, version 0.0.6: rfc6266.py
"""Implements RFC 6266, the Content-Disposition HTTP header.
parse_headers handles the receiver side.
It has shortcuts for some http libraries:
parse_httplib2_response and parse_requests_response.
It returns a ContentDisposition object with attributes like is_inline,
filename_unsafe, filename_sanitized.
build_header handles the sender side.
"""
import logging
from string import ascii_letters, digits
from urllib.parse import quote
# Library-level logger. A NullHandler is attached, as recommended for
# libraries, so that nothing is emitted unless the application configures
# logging itself.
LOGGER = logging.getLogger("rfc6266")
# logging.NullHandler is always available on Python 3 (which this module
# already requires through urllib.parse), so no AttributeError guard is needed.
LOGGER.addHandler(logging.NullHandler())
# Names exported by a star-import of this module; build_header is the only one.
__all__ = ("build_header",)
def percent_encode(string, safe, encoding):
    """Percent-encode *string* with ``urllib.parse.quote``.

    *safe* lists the extra characters to leave unescaped and *encoding* is
    the codec handed to ``quote``; encoding errors are always strict.
    """
    return quote(string, safe=safe, encoding=encoding, errors="strict")
# RFC 2616: separator and control characters, i.e. everything in the ASCII
# range that is excluded from a token.
separator_chars = '()<>@,;:\\"/[]?={} \t'
ctl_chars = "".join(map(chr, range(32))) + "\x7f"
nontoken_chars = separator_chars + ctl_chars

# RFC 5987: attr-char is the alphanumerics plus this punctuation.
attr_chars_nonalnum = "!#$&+-.^_`|~"
attr_chars = "".join((ascii_letters, digits, attr_chars_nonalnum))
# RFC 5987 gives this alternative construction of the token character class
token_chars = "".join((attr_chars, "*'%"))
def is_token_char(ch):
    """Tell whether the single character *ch* may appear in a token.

    It must be ASCII, and neither a control character nor one of
    ``separator_chars``.
    """
    code = ord(ch)
    # Control characters sit at 0-31 and 127; 128 and above is not ASCII.
    if code <= 31 or code >= 127:
        return False
    return ch not in separator_chars
def usesonlycharsfrom(candidate, chars):
    """Tell whether *candidate* is made up solely of characters in *chars*.

    Shortcut borrowed from urllib.quote: strip the allowed characters off
    the right-hand end and see whether anything is left over.
    """
    leftover = candidate.rstrip(chars)
    return leftover == ""
def is_token(candidate):
    """Tell whether *candidate* is a valid RFC 2616 token.

    A token consists of one or more token characters, so the empty string
    does not qualify: written unquoted it would yield a malformed
    parameter such as ``filename=``.
    """
    if not candidate:
        return False
    return all(is_token_char(ch) for ch in candidate)
def is_ascii(text):
    """Tell whether every character of *text* has a code point below 128."""
    return not any(ord(ch) >= 128 for ch in text)
def fits_inside_codec(text, codec):
    """Tell whether *text* can be encoded with *codec* without an error."""
    try:
        text.encode(codec)
        return True
    except UnicodeEncodeError:
        return False
def is_lws_safe(text):
    """Tell whether *text* is left unchanged by ``normalize_ws``.

    That is the case when it has no leading or trailing whitespace and
    every internal whitespace run is exactly one space character.
    """
    collapsed = normalize_ws(text)
    return collapsed == text
def normalize_ws(text):
    """Collapse each whitespace run in *text* to one space and trim the ends."""
    words = text.split()
    return " ".join(words)
def qd_quote(text):
    """Backslash-escape *text* for use inside a quoted-string.

    Every backslash and every double quote gets a backslash in front of it;
    all other characters are passed through untouched.
    """
    escapes = str.maketrans({"\\": "\\\\", '"': '\\"'})
    return text.translate(escapes)
def build_header(filename, disposition="attachment", filename_compat=None):
    """Generate a Content-Disposition header for a given filename.

    For legacy clients that don't understand the filename* parameter,
    a filename_compat value may be given.
    It should either be ascii-only (recommended) or iso-8859-1 only.
    In the latter case it should still be a character string (``str``),
    not already-encoded bytes.

    Options for generating filename_compat (only useful for legacy clients):
    - ignore (will only send filename*);
    - strip accents using unicode's decomposing normalisations,
      which can be done from unicode data (stdlib), and keep only ascii;
    - use the ascii transliteration tables from Unidecode (PyPI);
    - use iso-8859-1

    Ignore is the safest, and can be used to trigger a fallback
    to the document location (which can be percent-encoded utf-8
    if you control the URLs).

    See https://tools.ietf.org/html/rfc6266#appendix-D

    Parameters:
    - filename: the file name to advertise, as a character string.
    - disposition: the disposition type; anything other than the default
      "attachment" must be a non-empty token.
    - filename_compat: optional fallback name; only consulted when
      filename is not ascii or not safe with regard to whitespace.

    Returns the header value. Note that the type depends on the branch
    taken: a ``str`` when a plain ``filename`` parameter is enough (the
    name is a token, or an ascii name that needs no backslash-escaping),
    and iso-8859-1 encoded ``bytes`` whenever ``filename*`` is emitted.

    Raises AssertionError when disposition is not a non-empty token or when
    a non-token filename_compat has leading, trailing or irregular
    whitespace, and UnicodeEncodeError when filename_compat cannot be
    encoded as iso-8859-1.
    """
    # While this method exists, it could also sanitize the filename
    # by rejecting slashes or other weirdness that might upset a receiver.
    if disposition != "attachment":
        # Raised explicitly instead of through ``assert`` so the check is not
        # stripped under ``python -O``. AssertionError is kept so callers see
        # the same exception type as before. An empty value is rejected too,
        # since a token needs at least one character.
        if not disposition or not is_token(disposition):
            raise AssertionError(
                "disposition must be a non-empty token: %r" % (disposition,)
            )
    rv = disposition
    # An empty filename cannot be written as a bare token; it falls through
    # to the quoted-string branch below, which emits filename="".
    if filename and is_token(filename):
        rv += "; filename=%s" % (filename,)
        return rv
    elif is_ascii(filename) and is_lws_safe(filename):
        qd_filename = qd_quote(filename)
        rv += '; filename="%s"' % (qd_filename,)
        if qd_filename == filename:
            # Nothing needed escaping, so the quoted form is sufficient.
            return rv
        # RFC 6266 claims some implementations are iffy on qdtext's
        # backslash-escaping, so filename* is included below in that case.
    elif filename_compat:
        if is_token(filename_compat):
            rv += "; filename=%s" % (filename_compat,)
        else:
            # Explicit raise (not ``assert``): without this check a value
            # containing CR/LF would be copied verbatim into the header.
            if not is_lws_safe(filename_compat):
                raise AssertionError(
                    "filename_compat has unsafe whitespace: %r"
                    % (filename_compat,)
                )
            rv += '; filename="%s"' % (qd_quote(filename_compat),)
    # alnum are already considered always-safe, but the rest isn't.
    # Python encodes ~ when it shouldn't, for example.
    rv += "; filename*=utf-8''%s" % (
        percent_encode(filename, safe=attr_chars_nonalnum, encoding="utf-8"),
    )
    # This will only encode filename_compat, if it used non-ascii iso-8859-1.
    return rv.encode("iso-8859-1")