# Manual sending of requests (手動發送請求)
class SecondPipeline(object):
    """Item pipeline that appends each scraped item to a local text file.

    The output file is opened once when the spider starts and closed when
    it finishes, so each ``process_item`` call is a single cheap write.
    """

    f = None  # output file handle; set in open_spider, closed in close_spider

    def open_spider(self, spider):
        """Open the output file once at spider startup."""
        print('start')
        self.f = open('./qiubai.text', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one item as ``author:content`` and pass the item on.

        ``extract_first()`` upstream may return None for either field;
        coerce None to '' so one malformed item cannot crash the pipeline
        with a TypeError on string concatenation.
        """
        author = item['author'] or ''
        content = item['content'] or ''
        self.f.write(author + ':' + content)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.f.close()
        print('end')
import scrapy
from second.items import SecondItem
class QiubaiSpider(scrapy.Spider):
    """Spider that crawls qiushibaike text jokes across multiple pages.

    Pagination is driven manually: after parsing one page, a Request for
    the next page is yielded back to the engine with ``parse`` as its
    callback, recursing until ``pageNum`` passes the limit.
    """

    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/']
    url = 'https://www.qiushibaike.com/text/page/%d/'  # page-URL template
    pageNum = 1  # current page; mutated via self, shadowing the class attribute

    def parse(self, response):
        """Extract author/content pairs from a page, then schedule the next page."""
        print('正在爬蟲')
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath(".//div[@class='content']/span/text()").extract_first()
            items = SecondItem()
            items['author'] = author
            items['content'] = content
            yield items
        # Manually schedule the follow-up request: pass self.parse as the
        # callback WITHOUT parentheses (hand over the function, don't call
        # it). The pageNum bound is the recursion's termination condition.
        if self.pageNum <= 13:
            self.pageNum += 1
            # '%' interpolation already yields a str; the original
            # format(...) wrapper around it was a redundant no-op.
            new_url = self.url % self.pageNum
            yield scrapy.Request(url=new_url, callback=self.parse)
# Summary (小結):
# - Multiple URLs are added manually; the callback function handles each response.
# - When the page layout is identical across pages, the parse callback can be
#   reused recursively — remember a termination condition for the recursion.
# - yield scrapy.Request schedules the follow-up request.
# - Note how a class-level data attribute (pageNum) interacts with the
#   instance when it is mutated through self.
# - Usage of format() / '%' interpolation to build the paginated URL.