Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

hemamaps / Scrapy   python

Repository URL to install this package:

Version: 1.1.1 

/ xlib / tx / _newclient.py

# -*- test-case-name: twisted.web.test.test_newclient -*-
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.

"""
An U{HTTP 1.1<http://www.w3.org/Protocols/rfc2616/rfc2616.html>} client.

The way to use the functionality provided by this module is to:

  - Connect a L{HTTP11ClientProtocol} to an HTTP server
  - Create a L{Request} with the appropriate data
  - Pass the request to L{HTTP11ClientProtocol.request}
  - The returned Deferred will fire with a L{Response} object
  - Create a L{IProtocol} provider which can handle the response body
  - Connect it to the response with L{Response.deliverBody}
  - When the protocol's C{connectionLost} method is called, the response is
    complete.  See L{Response.deliverBody} for details.

Various other classes in this module support this usage:

  - HTTPParser is the basic HTTP parser.  It can handle the parts of HTTP which
    are symmetric between requests and responses.

  - HTTPClientParser extends HTTPParser to handle response-specific parts of
    HTTP.  One instance is created for each request to parse the corresponding
    response.
"""

__metaclass__ = type

from zope.interface import implements

from twisted.python import log
from twisted.python.reflect import fullyQualifiedName
from twisted.python.failure import Failure
from twisted.internet.interfaces import IConsumer, IPushProducer
from twisted.internet.error import ConnectionDone
from twisted.internet.defer import Deferred, succeed, fail, maybeDeferred
from twisted.internet.defer import CancelledError
from twisted.internet.protocol import Protocol
from twisted.protocols.basic import LineReceiver
from twisted.web.http_headers import Headers
from twisted.web.http import NO_CONTENT, NOT_MODIFIED
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.http import _IdentityTransferDecoder, _ChunkedTransferDecoder

from .iweb import IResponse, UNKNOWN_LENGTH

# States HTTPParser can be in
STATUS = 'STATUS'
HEADER = 'HEADER'
BODY = 'BODY'
DONE = 'DONE'


class BadHeaders(Exception):
    """
    Headers passed to L{Request} were in some way invalid.
    """



class ExcessWrite(Exception):
    """
    The body L{IBodyProducer} for a request tried to write data after
    indicating it had finished writing data.
    """


class ParseError(Exception):
    """
    Some received data could not be parsed.

    @ivar data: The string which could not be parsed.
    """
    def __init__(self, reason, data):
        Exception.__init__(self, reason, data)
        self.data = data



class BadResponseVersion(ParseError):
    """
    The version string in a status line was unparsable.
    """



class _WrapperException(Exception):
    """
    L{_WrapperException} is the base exception type for exceptions which
    include one or more other exceptions as the low-level causes.

    @ivar reasons: A list of exceptions.  See subclass documentation for more
        details.
    """
    def __init__(self, reasons):
        Exception.__init__(self, reasons)
        self.reasons = reasons



class RequestGenerationFailed(_WrapperException):
    """
    There was an error while creating the bytes which make up a request.

    @ivar reasons: A C{list} of one or more L{Failure} instances giving the
        reasons the request generation was considered to have failed.
    """



class RequestTransmissionFailed(_WrapperException):
    """
    There was an error while sending the bytes which make up a request.

    @ivar reasons: A C{list} of one or more L{Failure} instances giving the
        reasons the request transmission was considered to have failed.
    """



class ConnectionAborted(Exception):
    """
    The connection was explicitly aborted by application code.
    """



class WrongBodyLength(Exception):
    """
    An L{IBodyProducer} declared the number of bytes it was going to
    produce (via its C{length} attribute) and then produced a different number
    of bytes.
    """



class ResponseDone(Exception):
    """
    L{ResponseDone} may be passed to L{IProtocol.connectionLost} on the
    protocol passed to L{Response.deliverBody} and indicates that the entire
    response has been delivered.
    """



class ResponseFailed(_WrapperException):
    """
    L{ResponseFailed} indicates that all of the response to a request was not
    received for some reason.

    @ivar reasons: A C{list} of one or more L{Failure} instances giving the
        reasons the response was considered to have failed.

    @ivar response: If specified, the L{Response} received from the server (and
        in particular the status code and the headers).
    """

    def __init__(self, reasons, response=None):
        _WrapperException.__init__(self, reasons)
        self.response = response



class ResponseNeverReceived(ResponseFailed):
    """
    A L{ResponseFailed} that knows no response bytes at all have been received.
    """



class RequestNotSent(Exception):
    """
    L{RequestNotSent} indicates that an attempt was made to issue a request but
    for reasons unrelated to the details of the request itself, the request
    could not be sent.  For example, this may indicate that an attempt was made
    to send a request using a protocol which is no longer connected to a
    server.
    """



def _callAppFunction(function):
    """
    Call C{function}.  If it raises an exception, log it with a minimal
    description of the source.

    @return: C{None}
    """
    try:
        function()
    except:
        log.err(None, "Unexpected exception from %s" % (
                fullyQualifiedName(function),))



class HTTPParser(LineReceiver):
    """
    L{HTTPParser} handles the parsing side of HTTP processing. With a suitable
    subclass, it can parse either the client side or the server side of the
    connection.

    @ivar headers: All of the non-connection control message headers yet
        received.

    @ivar state: State indicator for the response parsing state machine.  One
        of C{STATUS}, C{HEADER}, C{BODY}, C{DONE}.

    @ivar _partialHeader: C{None} or a C{list} of the lines of a multiline
        header while that header is being received.
    """

    # NOTE: According to HTTP spec, we're supposed to eat the
    # 'Proxy-Authenticate' and 'Proxy-Authorization' headers also, but that
    # doesn't sound like a good idea to me, because it makes it impossible to
    # have a non-authenticating transparent proxy in front of an authenticating
    # proxy. An authenticating proxy can eat them itself. -jknight
    #
    # Further, quoting
    # http://homepages.tesco.net/J.deBoynePollard/FGA/web-proxy-connection-header.html
    # regarding the 'Proxy-Connection' header:
    #
    #    The Proxy-Connection: header is a mistake in how some web browsers
    #    use HTTP. Its name is the result of a false analogy. It is not a
    #    standard part of the protocol. There is a different standard
    #    protocol mechanism for doing what it does. And its existence
    #    imposes a requirement upon HTTP servers such that no proxy HTTP
    #    server can be standards-conforming in practice.
    #
    # -exarkun

    # Some servers (like http://news.ycombinator.com/) return status lines and
    # HTTP headers delimited by \n instead of \r\n.
    delimiter = '\n'

    CONNECTION_CONTROL_HEADERS = set([
            'content-length', 'connection', 'keep-alive', 'te', 'trailers',
            'transfer-encoding', 'upgrade', 'proxy-connection'])

    def connectionMade(self):
        self.headers = Headers()
        self.connHeaders = Headers()
        self.state = STATUS
        self._partialHeader = None


    def switchToBodyMode(self, decoder):
        """
        Switch to body parsing mode - interpret any more bytes delivered as
        part of the message body and deliver them to the given decoder.
        """
        if self.state == BODY:
            raise RuntimeError("already in body mode")

        self.bodyDecoder = decoder
        self.state = BODY
        self.setRawMode()


    def lineReceived(self, line):
        """
        Handle one line from a response.
        """
        # Handle the normal CR LF case.
        if line[-1:] == '\r':
            line = line[:-1]

        if self.state == STATUS:
            self.statusReceived(line)
            self.state = HEADER
        elif self.state == HEADER:
            if not line or line[0] not in ' \t':
                if self._partialHeader is not None:
                    header = ''.join(self._partialHeader)
                    name, value = header.split(':', 1)
                    value = value.strip()
                    self.headerReceived(name, value)
                if not line:
                    # Empty line means the header section is over.
                    self.allHeadersReceived()
                else:
                    # Line not beginning with LWS is another header.
                    self._partialHeader = [line]
            else:
                # A line beginning with LWS is a continuation of a header
                # begun on a previous line.
                self._partialHeader.append(line)


    def rawDataReceived(self, data):
        """
        Pass data from the message body to the body decoder object.
        """
        self.bodyDecoder.dataReceived(data)


    def isConnectionControlHeader(self, name):
        """
        Return C{True} if the given lower-cased name is the name of a
        connection control header (rather than an entity header).

        According to RFC 2616, section 14.10, the tokens in the Connection
        header are probably relevant here.  However, I am not sure what the
        practical consequences of either implementing or ignoring that are.
        So I leave it unimplemented for the time being.
        """
        return name in self.CONNECTION_CONTROL_HEADERS


    def statusReceived(self, status):
        """
        Callback invoked whenever the first line of a new message is received.
        Override this.

        @param status: The first line of an HTTP request or response message
            without trailing I{CR LF}.
        @type status: C{str}
        """


    def headerReceived(self, name, value):
        """
        Store the given header in C{self.headers}.
        """
        name = name.lower()
        if self.isConnectionControlHeader(name):
            headers = self.connHeaders
        else:
            headers = self.headers
        headers.addRawHeader(name, value)


    def allHeadersReceived(self):
        """
        Callback invoked after the last header is passed to C{headerReceived}.
        Override this to change to the C{BODY} or C{DONE} state.
        """
        self.switchToBodyMode(None)



class HTTPClientParser(HTTPParser):
    """
Loading ...