Request
class Request(object_ref):
    """An HTTP request created by a spider and handed to the engine for download."""

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None, cb_kwargs=None):
        # The encoding must be stored before the URL/body setters run,
        # because they rely on it.
        self._encoding = encoding
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)

        if not isinstance(priority, int):
            raise TypeError("Request priority not an integer: %r" % priority)
        self.priority = priority

        # Both hooks must be callables (or None); reject anything else early.
        for hook_name, hook in (('callback', callback), ('errback', errback)):
            if hook is not None and not callable(hook):
                raise TypeError('%s must be a callable, got %s'
                                % (hook_name, type(hook).__name__))
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        # meta/cb_kwargs are copied defensively; kept as None until first use.
        self._meta = dict(meta) if meta else None
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = list(flags) if flags is not None else []
Request 類就是 spider 發送給 Engine 的請求。通常情況下它並不需要被顯式構造,因爲 Scrapy 已經封裝好了。常見的參數爲:
- url:即 request 對象發送請求的 url
- callback:Downloader 下載完響應的數據後執行的回調函數
- method:請求的方法,默認爲 GET 方法
- headers:請求頭,固定的設置可以在 settings.py 中指定,非固定的可以在發送請求時指定
- meta:用於在不同的請求之間傳遞數據
- encoding:編碼方式,默認爲 utf-8
- dont_filter:表示不經過 Scheduler 過濾
- errback:在發生錯誤的時候執行的函數
Response
class Response(object_ref):
    """The result of downloading a Request: status, headers, body and metadata."""

    def __init__(self, url, status=200, headers=None, body=b'', flags=None,
                 request=None, certificate=None, ip_address=None):
        self._set_url(url)
        self._set_body(body)
        self.status = int(status)
        self.headers = Headers(headers or {})
        self.request = request
        self.flags = list(flags) if flags is not None else []
        self.certificate = certificate
        self.ip_address = ip_address

    @property
    def cb_kwargs(self):
        # Delegates to the originating request; fails if none is attached.
        try:
            return self.request.cb_kwargs
        except AttributeError:
            raise AttributeError(
                "Response.cb_kwargs not available, this response is not tied to any request"
            )

    @property
    def meta(self):
        # Delegates to the originating request; fails if none is attached.
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError(
                "Response.meta not available, this response is not tied to any request"
            )

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        if not isinstance(url, str):
            raise TypeError('%s url must be str, got %s:' %
                            (type(self).__name__, type(url).__name__))
        self._url = url

    url = property(_get_url, obsolete_setter(_set_url, 'url'))

    def _get_body(self):
        return self._body

    def _set_body(self, body):
        # A missing body is normalized to empty bytes; anything else must be bytes.
        if body is None:
            self._body = b''
            return
        if not isinstance(body, bytes):
            raise TypeError(
                "Response body must be bytes. "
                "If you want to pass unicode body use TextResponse "
                "or HtmlResponse.")
        self._body = body

    body = property(_get_body, obsolete_setter(_set_body, 'body'))

    def __str__(self):
        return "<%d %s>" % (self.status, self.url)

    __repr__ = __str__

    def copy(self):
        """Return a copy of this Response."""
        return self.replace()

    def replace(self, *args, **kwargs):
        """Create a new Response, reusing current attribute values for any
        that were not explicitly overridden.
        """
        for attr in ('url', 'status', 'headers', 'body',
                     'request', 'flags', 'certificate', 'ip_address'):
            kwargs.setdefault(attr, getattr(self, attr))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)

    def urljoin(self, url):
        """Resolve a possibly-relative ``url`` against this Response's URL."""
        return urljoin(self.url, url)

    @property
    def text(self):
        """Body as str — only implemented by TextResponse subclasses."""
        raise AttributeError("Response content isn't text")

    def css(self, *a, **kw):
        """CSS selector shortcut — only implemented by TextResponse subclasses."""
        raise NotSupported("Response content isn't text")

    def xpath(self, *a, **kw):
        """XPath selector shortcut — only implemented by TextResponse subclasses."""
        raise NotSupported("Response content isn't text")

    def follow(self, url, callback=None, method='GET', headers=None, body=None,
               cookies=None, meta=None, encoding='utf-8', priority=0,
               dont_filter=False, errback=None, cb_kwargs=None, flags=None):
        # type: (...) -> Request
        """
        Return a :class:`~.Request` instance to follow a link ``url``.

        Accepts the same arguments as ``Request.__init__``, except that
        ``url`` may also be a relative URL or a ``scrapy.link.Link`` object.
        :class:`~.TextResponse` additionally supports selectors here.

        .. versionadded:: 2.0
           The *flags* parameter.
        """
        if url is None:
            raise ValueError("url can't be None")
        if isinstance(url, Link):
            url = url.url
        return Request(
            url=self.urljoin(url),
            callback=callback,
            method=method,
            headers=headers,
            body=body,
            cookies=cookies,
            meta=meta,
            encoding=encoding,
            priority=priority,
            dont_filter=dont_filter,
            errback=errback,
            cb_kwargs=cb_kwargs,
            flags=flags,
        )

    def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
                   cookies=None, meta=None, encoding='utf-8', priority=0,
                   dont_filter=False, errback=None, cb_kwargs=None, flags=None):
        # type: (...) -> Generator[Request, None, None]
        """
        .. versionadded:: 2.0

        Return an iterable of :class:`~.Request` instances following every
        link in ``urls``. Accepts the same arguments as ``Request.__init__``,
        except that elements of ``urls`` may be relative URLs or
        :class:`~scrapy.link.Link` objects. :class:`~.TextResponse`
        additionally supports selectors here.
        """
        # Validate eagerly so a bad argument raises here, not on first iteration.
        if not hasattr(urls, '__iter__'):
            raise TypeError("'urls' argument must be an iterable")
        return (
            self.follow(
                url=link,
                callback=callback,
                method=method,
                headers=headers,
                body=body,
                cookies=cookies,
                meta=meta,
                encoding=encoding,
                priority=priority,
                dont_filter=dont_filter,
                errback=errback,
                cb_kwargs=cb_kwargs,
                flags=flags,
            )
            for link in urls
        )
Response 對象通常也是由 Scrapy 構建的。在 Scrapy 項目運行的時候,通常並不需要我們進行 Response 對象的顯式構建,而可以直接在 spider 中進行 response 的解析。Response 對象主要的屬性有:
- meta:用來保持多個請求之間的數據連接
- encoding:返回當前字符串編碼和解碼格式
- text:將返回的數據作爲 unicode 字符串返回
- body:將返回的數據作爲 bytes 數據返回
- xpath:xpath 選擇器
- css:css 選擇器
POST
通常我們爬取的都是頁面上不需要登陸就能夠直接訪問的信息,但是有些網站的資源需要登陸之後才能夠訪問;要登陸這類網站,就需要向其登陸接口的 url 發送 POST 請求,因此此時需要對爬蟲進行重寫。
- 如果在請求數據的時候發送 POST 請求的話,此時需要使用 Request 的子類 FormRequest
- 如果需要爬蟲在一開始就發送 POST 請求,此時就需要重寫 start_requests(self) 方法,並且不再調用 start_urls 中的 url
實例
settings.py
仍舊需要設置:
- ROBOTSTXT_OBEY:默認爲 True,通常設置爲 False。True 表示遵守機器人協議,此時爬蟲會首先請求 robots.txt 文件,並遵守其中的抓取規則
- DEFAULT_REQUEST_HEADERS:默認請求頭,可以在其中添加 User-Agent,表示該請求是從瀏覽器發出的,而不是爬蟲
spider
# -*- coding: utf-8 -*-
import scrapy
class PeopleSpider(scrapy.Spider):
    """Spider that logs in to renren.com via a POST request and saves
    the resulting pages to local HTML files."""

    name = 'people'
    allowed_domains = ['www.renren.com']
    # Unused in practice: start_requests() below overrides the default
    # behavior of GETting each entry in start_urls.
    start_urls = ['http://www.renren.com/']

    def start_requests(self):
        """Issue the initial login POST instead of the default GETs.

        FormRequest sends ``formdata`` as an urlencoded POST body,
        simulating a browser login form submission.
        """
        url = "http://www.renren.com/PLogin.do"
        user_info = {"email": "用戶名", "password": "密碼"}
        yield scrapy.FormRequest(url=url, formdata=user_info, callback=self.parse)

    def parse(self, response):
        """Save the post-login page, then request a profile page.

        The ``with`` statement closes the file on exit, so no explicit
        close() call is needed (the original had a redundant one).
        """
        with open('people.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)
        yield scrapy.Request(
            url='http://www.renren.com/880151247/profile',
            callback=self.parse_profile,
        )

    def parse_profile(self, response):
        """Save the profile page fetched with the logged-in session."""
        with open('profile.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)
通過重寫 start_requests 函數,並構造 FormRequest 類對象,發送了 POST 請求,模擬了瀏覽器的登陸請求。