起步
完成對爬蟲基礎知識的回顧,我們就正式進入了頁面的爬取,這是我們第一個要爬取的圖片頁面:
第一個scrapy工程
打開cmd,新建工程目錄PictureSpider
scrapy用basic模板自動創建的mmonly.py文件
MmonlySpider類繼承的scrapy.Spider類,裏面的response和request與Django裏的功能類似:
pycharm調試scrapy的技巧:
修改settings文件,不遵循robots協議:
在mmonly.py中設置斷點,debug main.py啓動項目,按住Ctrl查看response內容:
xpath
用xpath選擇器提取頁面信息:
換種更爲高效的調試模式,利用cmd:在項目根目錄,輸入scrapy shell +你需要爬取的頁面url
css選擇器
用css選擇器提取頁面信息:
把上面複製的內容去除空格放入response.css(' ')中:
點開第一張圖片,我們來用xpath和css提取更多的頁面信息:
我們需要提取的字段名稱:
# --- Extract the detail-page fields via XPath selectors ---
# NOTE(review): assumes `response` is a scrapy Response for a picture
# detail page and `re` is imported — confirm against the spider module.

# Image title: the <h1> inside the title wrapper div.
title = response.xpath('//div[@class="wrapper clearfix imgtitle"]/h1/text()').extract()[0]

# Creation date: raw text looks like "…時間:YYYY-MM-DD …"; pull out the
# date portion. Raw strings (r"...") avoid the invalid-escape warnings
# that "\d" triggers in modern Python.
create_date = response.xpath("//span[@class='tip']/text()").extract()[0].strip()
match_cd = re.match(r".*時間:?(\d{4}[-]\d{2}[-]\d{2}.*\d{2})?.*", create_date)
if match_cd:
    create_date = match_cd.group(1)

# Praise (hits) count: keep only the first run of digits.
praise_num = response.xpath("//span[@id='hits']/text()").extract()[0]
match_pn = re.match(r".*?(\d+).*", praise_num)
if match_pn:
    praise_num = match_pn.group(1)

# Total page count of the gallery: first <li> of the pager.
pages_num = response.xpath("//div[@class='pages']/ul/li[1]/a/text()").extract()[0]
match_pg = re.match(r".*?(\d+).*", pages_num)
if match_pg:
    pages_num = match_pg.group(1)

# URL of the large image shown on the current page.
imgs = response.xpath('//*[@id="big-pic"]/p/a/img/@src').extract()[0]
# --- Extract the same fields via CSS selectors ---
# NOTE(review): assumes `response` is a scrapy Response for a picture
# detail page and `re` is imported — confirm against the spider module.

title = response.css(".wrapper h1::text").extract()[0]  # image title
create_date = response.css(".tip ::text").extract()[0]  # creation time
# Raw regex strings avoid the invalid-escape warnings raised by "\d".
match_cd = re.match(r".*時間:?(\d{4}[-]\d{2}[-]\d{2}.*\d{2})?.*", create_date)
if match_cd:
    create_date = match_cd.group(1)

# Praise (hits) count, coerced to int with a 0 fallback.
praise_num = response.css("#hits ::text").extract()[0]
match_pn = re.match(r".*?(\d+).*", praise_num)
if match_pn:
    praise_num = int(match_pn.group(1))
else:
    praise_num = 0

# Total page count of the gallery, coerced to int with a 0 fallback.
pages_num = response.css(".pages ::text").extract()[2]
match_pg = re.match(r".*?(\d+).*", pages_num)
if match_pg:
    pages_num = int(match_pg.group(1))
else:
    pages_num = 0

# Second matching <img> is the large picture on this page.
imgs = response.css("a[href] img::attr(src)").extract()[1]
爬取首頁圖片列表中的全部圖片
一級首頁圖片列表共24張圖,對應24個url
獲取一級圖片列表頁中的下一頁url:
next_url = response.css(".pages li:nth-last-child(2) a ::attr(href)").extract()[0] # next-page link of the first-level list page
尋找二級詳情頁中url與頁碼的規律:
根據規律遍歷詳情頁的url:
# Build every detail-page URL from the gallery's page count and record
# the Referer each request must carry.
# NOTE(review): assumes `response` and `item` are in scope — confirm
# against the surrounding spider code.
pag_num = response.css(".pages li:nth-child(1) a::text").extract()[0]
match_pn = re.match(r".*?(\d+).*", pag_num)
if match_pn:
    pag_num = int(match_pn.group(1))
else:
    # No digits found: fall back to a single page so range() below
    # never receives a str (the original would raise TypeError here).
    pag_num = 1
base_url = response.url.replace('.html', '')
for i in ["{}_{}.html".format(base_url, x) for x in range(1, pag_num + 1)]:
    if i == base_url + "_1.html":
        # Page 1 carries no "_1" suffix on this site.
        item['Referer'] = i.replace('_1', '')
    else:
        item['Referer'] = i
完整步驟
mmonly.py
def parse(self, response):
    '''
    Handle a first-level list page.

    Yields one Request per picture detail page (delegated to
    parse_image) and, when a next list page exists, a Request for it
    back into this method.
    '''
    # All anchors of the image grid on the list page.
    post_nodes = response.css('.item_t .img .ABox a')
    for post_node in post_nodes:  # walk the list, pull each detail-page URL
        post_url = post_node.css("::attr(href)").extract_first("")
        if post_url:
            yield Request(url=post_url, callback=self.parse_image)
    # Second-to-last pager item is the "next page" link. extract_first("")
    # yields "" on the final page instead of raising IndexError as the
    # original extract()[0] did.
    next_url = response.css(".pages li:nth-last-child(2) a ::attr(href)").extract_first("")
    if next_url:  # non-empty: feed the next list page back into parse
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse_image(self, response):
    '''
    Handle the first page of a second-level detail gallery.

    Reads the gallery's page count, derives the URL of every page,
    and yields one Request per page carrying its Referer on a FRESH
    item. (The original mutated one shared item inside the loop and
    passed it by reference in meta, so every downstream response saw
    only the last Referer written.)
    '''
    pag_num = response.css(".pages li:nth-child(1) a::text").extract()[0]
    match_pn = re.match(r".*?(\d+).*", pag_num)
    if match_pn:
        pag_num = int(match_pn.group(1))
    else:
        # No digits found: treat as a single page so range() below
        # never receives a str (original would raise TypeError).
        pag_num = 1
    base_url = response.url.replace('.html', '')
    for x in range(1, pag_num + 1):
        page_url = "{}_{}.html".format(base_url, x)
        item = PicturespiderItem()  # one item per request, not shared
        if x == 1:
            # Page 1 carries no "_1" suffix on this site.
            item['Referer'] = page_url.replace('_1', '')
        else:
            item['Referer'] = page_url
        # dont_filter=True: these generated URLs must not be deduped.
        yield Request(url=item['Referer'], meta={'meta_1': item},
                      callback=self.parse_detail, dont_filter=True)
def parse_detail(self, response):
    '''
    Extract the fields of a second-level detail page.

    Completes the item handed over via request meta with the large
    image's URL and the gallery title, then yields it onward.
    '''
    item = response.meta['meta_1']
    item["image_url"] = response.css("a[href] img::attr(src)").extract()[1]
    item["title"] = response.css(".wrapper h1::text").extract()[0]
    yield item
items.py
class PicturespiderItem(scrapy.Item):
'''
Fields carried by each scraped picture item.
'''
title = scrapy.Field()       # gallery title (pipeline uses it as the folder name)
image_url = scrapy.Field()   # URL of the large image to download
image_path = scrapy.Field()  # final on-disk path, filled in by the pipeline
Referer = scrapy.Field()     # Referer header the image request must carry
pipelines.py
# 移動圖片
import shutil
import scrapy
# 導入項目設置
from scrapy.utils.project import get_project_settings
# 導入scrapy框架的圖片下載類
from scrapy.pipelines.images import ImagesPipeline
import os
class PicturespiderPipeline(ImagesPipeline):
    '''
    Image pipeline that relocates each downloaded picture into a
    directory named after its gallery title.
    '''
    # Download root directory, read from the project settings.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        '''
        Yield the download Request for this item's image URL.

        The Referer header is included because the image host checks
        it (anti-hotlinking); without it the download is refused.
        '''
        image_url = item["image_url"]
        yield scrapy.Request(image_url, headers={'Referer': item['Referer']})

    def item_completed(self, result, item, info):
        '''
        Move the downloaded file into a per-title folder.

        ImagesPipeline stores files under the default download path;
        this moves the file to <IMAGES_STORE>/<title>/ and records the
        final location in item['image_path'].
        '''
        image_path = [x["path"] for ok, x in result if ok]
        if not image_path:
            # Download failed: pass the item through untouched instead
            # of crashing on an empty result list.
            return item
        # os.path.join is portable; the original hard-coded "\\" and
        # only worked on Windows.
        img_dir = os.path.join(self.IMAGES_STORE, item['title'])
        # makedirs(exist_ok=True) replaces the exists()/mkdir pair
        # (race-free, and creates intermediate directories).
        os.makedirs(img_dir, exist_ok=True)
        # basename() replaces the fragile find("full\\")+6 slicing,
        # which silently depended on which path separator scrapy used.
        filename = os.path.basename(image_path[0])
        dest = os.path.join(img_dir, filename)
        shutil.move(os.path.join(self.IMAGES_STORE, image_path[0]), dest)
        item['image_path'] = dest
        return item
settings.py