Request
class Request(object_ref):
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, flags=None, cb_kwargs=None):
self._encoding = encoding # this one has to be set first
self.method = str(method).upper()
self._set_url(url)
self._set_body(body)
if not isinstance(priority, int):
raise TypeError("Request priority not an integer: %r" % priority)
self.priority = priority
if callback is not None and not callable(callback):
raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
if errback is not None and not callable(errback):
raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
self.callback = callback
self.errback = errback
self.cookies = cookies or {}
self.headers = Headers(headers or {}, encoding=encoding)
self.dont_filter = dont_filter
self._meta = dict(meta) if meta else None
self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
self.flags = [] if flags is None else list(flags)
The Request class is the request our spider sends to the Engine. It usually does not have to be constructed explicitly, because Scrapy already wraps its creation for us; a usage sketch follows the parameter list below. Its common parameters are:
- url: the URL the Request is sent to
- callback: the callback invoked on the response once the Downloader has finished downloading it
- method: the HTTP method of the request; GET by default
- headers: the request headers; headers that stay fixed can be set in settings.py, while per-request headers can be passed when the request is sent
- meta: a dict used to pass data between different requests
- encoding: the encoding of the request; utf-8 by default
- dont_filter: when True, the request is not filtered (deduplicated) by the Scheduler
- errback: the callback invoked when an error occurs while handling the request
- cb_kwargs: a dict of keyword arguments passed on to the callback
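Below is a minimal sketch of constructing a Request by hand inside a spider callback; the URLs, the meta key, and the callback name are illustrative, not taken from the source above.

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # meta carries data to the next callback; dont_filter=True
        # lets this request bypass the Scheduler's duplicate filter.
        yield scrapy.Request(
            url='http://quotes.toscrape.com/page/2/',
            callback=self.parse_page,
            meta={'page': 2},
            headers={'Referer': response.url},
            dont_filter=True,
        )

    def parse_page(self, response):
        self.logger.info('page %s: %s', response.meta['page'], response.url)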
Response
class Response(object_ref):
def __init__(self, url, status=200, headers=None, body=b'', flags=None,
request=None, certificate=None, ip_address=None):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
self._set_url(url)
self.request = request
self.flags = [] if flags is None else list(flags)
self.certificate = certificate
self.ip_address = ip_address
@property
def cb_kwargs(self):
try:
return self.request.cb_kwargs
except AttributeError:
raise AttributeError(
"Response.cb_kwargs not available, this response "
"is not tied to any request"
)
@property
def meta(self):
try:
return self.request.meta
except AttributeError:
raise AttributeError(
"Response.meta not available, this response "
"is not tied to any request"
)
def _get_url(self):
return self._url
def _set_url(self, url):
if isinstance(url, str):
self._url = url
else:
raise TypeError('%s url must be str, got %s:' %
(type(self).__name__, type(url).__name__))
url = property(_get_url, obsolete_setter(_set_url, 'url'))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
elif not isinstance(body, bytes):
raise TypeError(
"Response body must be bytes. "
"If you want to pass unicode body use TextResponse "
"or HtmlResponse.")
else:
self._body = body
body = property(_get_body, obsolete_setter(_set_body, 'body'))
def __str__(self):
return "<%d %s>" % (self.status, self.url)
__repr__ = __str__
def copy(self):
"""Return a copy of this Response"""
return self.replace()
def replace(self, *args, **kwargs):
"""Create a new Response with the same attributes except for those
given new values.
"""
for x in ['url', 'status', 'headers', 'body',
'request', 'flags', 'certificate', 'ip_address']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
def urljoin(self, url):
"""Join this Response's url with a possible relative url to form an
absolute interpretation of the latter."""
return urljoin(self.url, url)
@property
def text(self):
"""For subclasses of TextResponse, this will return the body
as str
"""
raise AttributeError("Response content isn't text")
def css(self, *a, **kw):
"""Shortcut method implemented only by responses whose content
is text (subclasses of TextResponse).
"""
raise NotSupported("Response content isn't text")
def xpath(self, *a, **kw):
"""Shortcut method implemented only by responses whose content
is text (subclasses of TextResponse).
"""
raise NotSupported("Response content isn't text")
def follow(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Request
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
not only an absolute URL.
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
.. versionadded:: 2.0
The *flags* parameter.
"""
if isinstance(url, Link):
url = url.url
elif url is None:
raise ValueError("url can't be None")
url = self.urljoin(url)
return Request(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Generator[Request, None, None]
"""
.. versionadded:: 2.0
Return an iterable of :class:`~.Request` instances to follow all links
in ``urls``. It accepts the same arguments as ``Request.__init__`` method,
but elements of ``urls`` can be relative URLs or :class:`~scrapy.link.Link` objects,
not only absolute URLs.
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow_all`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
"""
if not hasattr(urls, '__iter__'):
raise TypeError("'urls' argument must be an iterable")
return (
self.follow(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
for url in urls
)
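As a usage note for the follow and follow_all shortcuts defined above, here is a minimal sketch of how they are typically called from a spider callback. The site, spider name, and CSS selector are illustrative, and the css() call assumes the response is a TextResponse (an HTML page).

import scrapy

class FollowSpider(scrapy.Spider):
    name = 'follow_example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # follow() urljoin()s a relative URL and returns a single Request.
        yield response.follow('/page/2/', callback=self.parse)
        # follow_all() returns a generator with one Request per URL.
        yield from response.follow_all(
            response.css('li.next a::attr(href)').getall(),
            callback=self.parse,
        )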
Response objects are likewise constructed by Scrapy itself: while a Scrapy project is running, we normally do not need to build a Response explicitly and can parse the response directly in the spider; see the sketch after this list. The main attributes of a Response object are:
- meta: carries data across a chain of requests (a shortcut for self.request.meta)
- encoding: the encoding used to decode and encode the response text
- text: the response body returned as a unicode string
- body: the response body returned as bytes
- xpath: shortcut for XPath selectors
- css: shortcut for CSS selectors
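A short sketch of reading these attributes inside a parse callback; it assumes an HTML response (so text, xpath, and css are available), and the selectors are illustrative.

import scrapy

class AttrSpider(scrapy.Spider):
    name = 'attr_example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        raw = response.body        # bytes
        html = response.text       # str, decoded according to response.encoding
        title = response.xpath('//title/text()').get()
        links = response.css('a::attr(href)').getall()
        self.logger.info('%s (%s): %d bytes, %d links',
                         title, response.encoding, len(raw), len(links))
        yield {'title': title, 'size': len(html)}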
POST
Usually the pages we crawl can be accessed without logging in, but some resources are only available after a login, and logging in to a site generally means sending a POST request to its login URL, so the spider has to be adapted:
- to send a POST request while crawling, use FormRequest, a subclass of Request
- to make the spider send a POST request right at startup, override the start_requests(self) method so that the URLs in start_urls are no longer used
Example
settings.py
The following still needs to be configured (a minimal snippet follows the list):
- ROBOTSTXT_OBEY: set it to False here. When True, Scrapy first fetches the site's robots.txt and drops any request that file disallows, which would block a login crawl like this one
- DEFAULT_REQUEST_HEADERS: the default request headers; add a User-Agent here so the requests appear to come from a browser rather than a crawler
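A minimal settings.py sketch covering the two items above; the User-Agent string is just an illustrative value, and any real browser UA works.

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/91.0.4472.124 Safari/537.36'),
}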
spider
# -*- coding: utf-8 -*-
import scrapy


class PeopleSpider(scrapy.Spider):
    name = 'people'
    allowed_domains = ['www.renren.com']
    start_urls = ['http://www.renren.com/']

    def start_requests(self):
        # Send the login POST request first, instead of the GET requests
        # that would otherwise be generated from start_urls.
        url = "http://www.renren.com/PLogin.do"
        user_info = {"email": "username", "password": "password"}
        yield scrapy.FormRequest(url=url, formdata=user_info, callback=self.parse)

    def parse(self, response):
        # Save the logged-in landing page; the session cookies set by the
        # login response are reused automatically for later requests.
        with open('people.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)
        yield scrapy.Request(url='http://www.renren.com/880151247/profile',
                             callback=self.parse_profile)

    def parse_profile(self, response):
        with open('profile.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)
By overriding start_requests and constructing a FormRequest, the spider sends a POST request that simulates the browser's login request.