Python Web Crawler (21): Request and Response

Request

class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None, cb_kwargs=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        if not isinstance(priority, int):
            raise TypeError("Request priority not an integer: %r" % priority)
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
        if errback is not None and not callable(errback):
            raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = [] if flags is None else list(flags)

The Request class represents the request a spider hands to the Engine. You usually do not construct every piece of it yourself, since Scrapy has the machinery wrapped up already. Its common parameters (illustrated in the sketch after the list) are:

  • url: the URL the request is sent to
  • callback: the function invoked with the response once the Downloader has fetched it
  • method: the HTTP method of the request, GET by default
  • headers: the request headers; headers that never change can be set in settings.py, per-request ones can be passed when the request is created
  • meta: a dict used to pass data between requests
  • encoding: the encoding of the request, utf-8 by default
  • dont_filter: tells the Scheduler's duplicate filter to let this request through unfiltered
  • errback: the function invoked when an error occurs
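
As a quick illustration, here is a minimal sketch of how these parameters are usually combined inside a spider; all URLs and header values below are placeholders, not part of the original example:

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com/list']

    def parse(self, response):
        # Pass data to the next callback through meta, and let the
        # request past the Scheduler's duplicate filter with dont_filter.
        yield scrapy.Request(
            url='https://example.com/detail/1',
            callback=self.parse_detail,
            method='GET',
            headers={'Referer': response.url},
            meta={'page': 1},
            dont_filter=True,
            errback=self.handle_error,
        )

    def parse_detail(self, response):
        # meta set on the Request is available again on the Response.
        self.logger.info('came from page %s', response.meta['page'])

    def handle_error(self, failure):
        self.logger.error(repr(failure))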

Response

class Response(object_ref):

    def __init__(self, url, status=200, headers=None, body=b'', flags=None,
                 request=None, certificate=None, ip_address=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)
        self.certificate = certificate
        self.ip_address = ip_address

    @property
    def cb_kwargs(self):
        try:
            return self.request.cb_kwargs
        except AttributeError:
            raise AttributeError(
                "Response.cb_kwargs not available, this response "
                "is not tied to any request"
            )

    @property
    def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError(
                "Response.meta not available, this response "
                "is not tied to any request"
            )

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        if isinstance(url, str):
            self._url = url
        else:
            raise TypeError('%s url must be str, got %s:' %
                            (type(self).__name__, type(url).__name__))

    url = property(_get_url, obsolete_setter(_set_url, 'url'))

    def _get_body(self):
        return self._body

    def _set_body(self, body):
        if body is None:
            self._body = b''
        elif not isinstance(body, bytes):
            raise TypeError(
                "Response body must be bytes. "
                "If you want to pass unicode body use TextResponse "
                "or HtmlResponse.")
        else:
            self._body = body

    body = property(_get_body, obsolete_setter(_set_body, 'body'))

    def __str__(self):
        return "<%d %s>" % (self.status, self.url)

    __repr__ = __str__

    def copy(self):
        """Return a copy of this Response"""
        return self.replace()

    def replace(self, *args, **kwargs):
        """Create a new Response with the same attributes except for those
        given new values.
        """
        for x in ['url', 'status', 'headers', 'body',
                  'request', 'flags', 'certificate', 'ip_address']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)

    def urljoin(self, url):
        """Join this Response's url with a possible relative url to form an
        absolute interpretation of the latter."""
        return urljoin(self.url, url)

    @property
    def text(self):
        """For subclasses of TextResponse, this will return the body
        as str
        """
        raise AttributeError("Response content isn't text")

    def css(self, *a, **kw):
        """Shortcut method implemented only by responses whose content
        is text (subclasses of TextResponse).
        """
        raise NotSupported("Response content isn't text")

    def xpath(self, *a, **kw):
        """Shortcut method implemented only by responses whose content
        is text (subclasses of TextResponse).
        """
        raise NotSupported("Response content isn't text")

    def follow(self, url, callback=None, method='GET', headers=None, body=None,
               cookies=None, meta=None, encoding='utf-8', priority=0,
               dont_filter=False, errback=None, cb_kwargs=None, flags=None):
        # type: (...) -> Request
        """
        Return a :class:`~.Request` instance to follow a link ``url``.
        It accepts the same arguments as ``Request.__init__`` method,
        but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
        not only an absolute URL.

        :class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
        method which supports selectors in addition to absolute/relative URLs
        and Link objects.

        .. versionadded:: 2.0
           The *flags* parameter.
        """
        if isinstance(url, Link):
            url = url.url
        elif url is None:
            raise ValueError("url can't be None")
        url = self.urljoin(url)

        return Request(
            url=url,
            callback=callback,
            method=method,
            headers=headers,
            body=body,
            cookies=cookies,
            meta=meta,
            encoding=encoding,
            priority=priority,
            dont_filter=dont_filter,
            errback=errback,
            cb_kwargs=cb_kwargs,
            flags=flags,
        )

    def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
                   cookies=None, meta=None, encoding='utf-8', priority=0,
                   dont_filter=False, errback=None, cb_kwargs=None, flags=None):
        # type: (...) -> Generator[Request, None, None]
        """
        .. versionadded:: 2.0

        Return an iterable of :class:`~.Request` instances to follow all links
        in ``urls``. It accepts the same arguments as ``Request.__init__`` method,
        but elements of ``urls`` can be relative URLs or :class:`~scrapy.link.Link` objects,
        not only absolute URLs.

        :class:`~.TextResponse` provides a :meth:`~.TextResponse.follow_all`
        method which supports selectors in addition to absolute/relative URLs
        and Link objects.
        """
        if not hasattr(urls, '__iter__'):
            raise TypeError("'urls' argument must be an iterable")
        return (
            self.follow(
                url=url,
                callback=callback,
                method=method,
                headers=headers,
                body=body,
                cookies=cookies,
                meta=meta,
                encoding=encoding,
                priority=priority,
                dont_filter=dont_filter,
                errback=errback,
                cb_kwargs=cb_kwargs,
                flags=flags,
            )
            for url in urls
        )

Response objects are likewise built by Scrapy itself: while a Scrapy project runs you normally never construct one explicitly, and can go straight to parsing the response handed to the spider. The main attributes of a Response (illustrated in the sketch after the list) are:

  • meta: carries data across a chain of requests; it is taken from the Request that produced the response
  • encoding: the encoding used to encode and decode the body
  • text: the body decoded to a unicode str (available on TextResponse subclasses)
  • body: the raw body as bytes
  • xpath: shortcut for running XPath selectors over the body
  • css: shortcut for running CSS selectors over the body
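
A minimal sketch of these attributes and the follow shortcut in action; the selectors assume the page structure of the quotes.toscrape.com scraping sandbox:

import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # text is the decoded body; body is the raw bytes.
        self.logger.info('fetched %s (%d bytes)', response.url, len(response.body))

        # css/xpath are shortcuts into the response's selector.
        for quote in response.css('span.text::text').getall():
            yield {'quote': quote}

        # follow resolves relative URLs against response.url,
        # so urljoin does not have to be called by hand.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)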

POST

Most of the information we scrape sits on pages that can be visited without logging in, but some sites only expose their resources after a login, and logging in to such a site means sending a POST request to a specific URL. In that case the spider has to be rewritten:

  • To send a POST request while crawling, use FormRequest, a subclass of Request
  • To make the spider send a POST request right at the start, override start_requests(self); the URLs in start_urls are then no longer used

Example

settings.py

As before, a couple of settings still need to be configured (a sample snippet follows the list):

  • ROBOTSTXT_OBEY: set it to False here. When True, the crawler obeys the robots exclusion protocol: it fetches the site's robots.txt first and skips any URL that file disallows
  • DEFAULT_REQUEST_HEADERS: the default request headers. Adding a User-Agent here makes requests look like they come from a browser rather than a crawler
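
A matching settings.py fragment might look like this (the header values are only examples):

# settings.py
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}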

spider

# -*- coding: utf-8 -*-
import scrapy


class PeopleSpider(scrapy.Spider):
    name = 'people'
    allowed_domains = ['www.renren.com']
    # start_urls is ignored once start_requests is overridden.
    start_urls = ['http://www.renren.com/']

    def start_requests(self):
        # Log in by POSTing the credentials as form data.
        url = "http://www.renren.com/PLogin.do"
        user_info = {"email": "your-username", "password": "your-password"}
        yield scrapy.FormRequest(url=url, formdata=user_info, callback=self.parse)

    def parse(self, response):
        # Save the page returned after login; the with block closes
        # the file, so no explicit close() is needed.
        with open('people.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)
        # Requests made after the login carry the session cookies.
        yield scrapy.Request(
            url='http://www.renren.com/880151247/profile',
            callback=self.parse_profile,
        )

    def parse_profile(self, response):
        with open('profile.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)

By overriding start_requests and constructing a FormRequest object, the spider sends a POST request that mimics the browser's login request.
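
When the login page itself contains the form (hidden inputs, tokens and the like), FormRequest.from_response is often more convenient than assembling the POST body by hand, because it copies the form's existing fields and only overrides the ones passed in formdata. A sketch under that assumption; the login-page URL and field names here are illustrative, not verified:

import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login'
    # Assumed URL of a page that contains the login form.
    start_urls = ['http://www.renren.com/SysHome.do']

    def parse(self, response):
        # from_response copies the form's hidden fields automatically
        # and only overrides the fields given in formdata.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'email': 'your-username', 'password': 'your-password'},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('logged in, landed on %s', response.url)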
