Python Crawler Study Notes 4.7 (Request/Response)
Request
Partial source of Request:
# partial code
class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None):
        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority
        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback
        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter
        self._meta = dict(meta) if meta else None

    @property
    def meta(self):
        if self._meta is None:
            self._meta = {}
        return self._meta
The most commonly used parameters:
url: the URL to request and then process in the next step
callback: specifies which function handles the Response returned by this request
method: usually not specified; defaults to GET. It can be set to "GET", "POST", "PUT", etc., and the string is uppercased automatically
headers: the headers sent with the request. Usually not needed. Typical content looks like:
# anyone who has written a crawler will recognize these
Host: media.readthedocs.org
User-Agent: Mozilla/5.0 (Windows NT 6.2; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
Accept: text/css,*/*;q=0.1
Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate
Referer: http://scrapy-chs.readthedocs.org/zh_CN/0.24/
Cookie: _ga=GA1.2.1612165614.1415584110;
Connection: keep-alive
If-Modified-Since: Mon, 25 Aug 2014 21:59:35 GMT
Cache-Control: max-age=0
meta: commonly used; a dict for passing data between different requests. For example:
request_with_cookies = Request(
    url="http://www.example.com",
    cookies={'currency': 'USD', 'country': 'UY'},
    meta={'dont_merge_cookies': True}
)
encoding: the default 'utf-8' is fine.
dont_filter: indicates that this request should not be filtered out by the scheduler. Use it when you want to issue the same request multiple times, bypassing the duplicate filter. Defaults to False.
errback: specifies the error-handling function. A short sketch combining meta and errback follows this list.
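To make meta and errback concrete, here is a minimal sketch; the spider name, URLs, and callback names below are hypothetical illustrations, not part of the Scrapy API:

import scrapy

class MetaDemoSpider(scrapy.Spider):
    name = "meta_demo"  # hypothetical spider name
    start_urls = ["http://www.example.com/list"]

    def parse(self, response):
        # stash a value in meta so the next callback can read it
        yield scrapy.Request(
            "http://www.example.com/detail",
            callback=self.parse_detail,
            errback=self.on_error,  # invoked on download/HTTP errors
            meta={'page_title': response.css('title::text').get()},
        )

    def parse_detail(self, response):
        # the dict travels with the request and shows up on the response
        self.logger.info("came from: %s", response.meta['page_title'])

    def on_error(self, failure):
        # failure is a twisted.python.failure.Failure
        self.logger.error("request failed: %r", failure)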
Response
Partial code:
class Response(object_ref):

    def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError("Response.meta not available, this response "
                                 "is not tied to any request")
Most parameters are similar to those of Request above:
status: the HTTP status code of the response
_set_body(body): sets the response body
_set_url(url): sets the response URL
request: the Request object that produced this response, stored as self.request (a short usage sketch follows)
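As a minimal sketch (assuming it runs inside a spider; the 'depth' key is populated by Scrapy's DepthMiddleware), these attributes are typically read in a callback like this:

def parse(self, response):
    # status, headers, url, and the originating request are exposed on the response
    if response.status == 200:
        self.logger.info("fetched %s (Content-Type: %s)",
                         response.url,
                         response.headers.get('Content-Type'))
    # response.meta is a shortcut for response.request.meta
    depth = response.meta.get('depth', 0)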
Sending POST Requests
- You can send a POST request with the yield scrapy.FormRequest(url, formdata, callback) method.
- If you want the program to send a POST request as soon as it starts, override the Spider class's start_requests(self) method, so the URLs in start_urls are no longer requested.
import scrapy

class mySpider(scrapy.Spider):
    name = "renren"  # a spider name is required to run; this one is hypothetical
    # start_urls = ["http://www.example.com/"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest is how Scrapy sends POST requests
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "[email protected]", "password": "axxxxxxxe"},
            callback=self.parse_page
        )

    def parse_page(self, response):
        # do something
        pass
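Note that FormRequest serializes formdata into an application/x-www-form-urlencoded body and sets the request method to POST, which matches what a browser sends when submitting an HTML form.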
Simulating Login
Use the FormRequest.from_response() method to simulate a user login.
Websites usually pre-populate certain form fields through <input type="hidden"> elements, such as session data or authentication tokens on login pages.
When scraping with Scrapy, if you want to pre-fill or override form fields such as the username and password, you can do it with the FormRequest.from_response() method.
Here is a spider example using this approach:
import scrapy

class LoginSpider(scrapy.Spider):
    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.after_login
        )

    def after_login(self, response):
        # check that the login succeeded before going on
        if "authentication failed" in response.text:
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...
Zhihu crawler example for reference:
zhihuSpider.py spider code
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy import Request, FormRequest
from zhihu.items import ZhihuItem

class ZhihuSpider(CrawlSpider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = [
        "http://www.zhihu.com"
    ]
    rules = (
        Rule(LinkExtractor(allow=(r'/question/\d+#.*?', )), callback='parse_page', follow=True),
        Rule(LinkExtractor(allow=(r'/question/\d+', )), callback='parse_page', follow=True),
    )
    headers = {
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2125.111 Safari/537.36",
        "Referer": "http://www.zhihu.com/"
    }

    # Override the spider's start_requests to issue a custom request;
    # once it succeeds, the callback is invoked
    def start_requests(self):
        return [Request("https://www.zhihu.com/login", meta={'cookiejar': 1}, callback=self.post_login)]

    def post_login(self, response):
        print('Preparing login')
        # Grab the _xsrf field from the returned page;
        # it is required for the form submission to succeed
        xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract()[0]
        print(xsrf)
        # FormRequest.from_response is a Scrapy helper for posting forms
        # After a successful login, the after_login callback is invoked
        return [FormRequest.from_response(response,  # "http://www.zhihu.com/login",
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          headers=self.headers,  # note the headers here
                                          formdata={
                                              '_xsrf': xsrf,
                                              'email': '[email protected]',
                                              'password': '123456'
                                          },
                                          callback=self.after_login,
                                          dont_filter=True
                                          )]

    def after_login(self, response):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_page(self, response):
        problem = Selector(response)
        item = ZhihuItem()
        item['url'] = response.url
        item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
        print(item['name'])
        item['title'] = problem.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
        item['description'] = problem.xpath('//div[@class="zm-editable-content"]/text()').extract()
        item['answer'] = problem.xpath('//div[@class=" zm-editable-content clearfix"]/text()').extract()
        return item
Item class definition
from scrapy.item import Item, Field

class ZhihuItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = Field()          # URL of the scraped question
    title = Field()        # title of the question
    description = Field()  # description of the question
    answer = Field()       # answers to the question
    name = Field()         # name of the user
settings.py: set the crawl delay
BOT_NAME = 'zhihu'
SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'
DOWNLOAD_DELAY = 0.25  # set the download delay to 250 ms
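As a closing sketch, the crawl can also be launched programmatically instead of via the scrapy crawl command; this uses Scrapy's CrawlerProcess and assumes it is run from the project root so settings.py is picked up:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads the settings.py above
process.crawl('zhihu')  # spider name defined in ZhihuSpider
process.start()         # blocks until the crawl finishes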