python爬蟲框架scrapy學習第四課
任務:爬取鳳凰網導航下所有一級、二級和具體新聞數據
鳳凰網導航
一級標題:
二級標題:
新聞鏈接:
具體新聞標題:
執行:爬蟲實例
1、items.py文件:明確要爬取的數據字段
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class IfengprojectItem(scrapy.Item):
    """Container for one crawled ifeng.com news record.

    Carries the category hierarchy (first- and second-level title/URL),
    the article link and target directory, and the article payload.
    """

    # First-level (top) category: title and link
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # Second-level category: title and link
    secondTitle = scrapy.Field()
    secondUrls = scrapy.Field()

    # Article link and the directory path the article file is saved under
    newsUrls = scrapy.Field()
    newsFileName = scrapy.Field()

    # Article headline, body text and publication time
    newsHead = scrapy.Field()
    newsContent = scrapy.Field()
    newsPublicTime = scrapy.Field()
2、ifeng.py文件:執行具體爬蟲過程
# -*- coding: utf-8 -*-
import scrapy
import os
from iFengProject.items import IfengprojectItem
class IfengSpider(scrapy.Spider):
    """Crawl the ifeng.com navigation page.

    Three stages: ``parse`` extracts first-/second-level categories and
    creates the matching directory tree, ``second_parse`` collects article
    links on each second-level page, ``news_parse`` extracts one article.
    """

    name = 'ifeng'
    allowed_domains = ['ifeng.com']
    start_urls = ['http://www.ifeng.com/daohang/']

    def parse(self, response):
        """Extract category titles/URLs and schedule one request per
        second-level URL that belongs under a first-level URL."""
        # First-level category links and titles
        parentUrls = response.xpath('//div[@class = "col3"]/h2/a/@href').extract()
        parentTitle = response.xpath('//div[@class = "col3"]/h2/a/text()').extract()
        # Second-level category links and titles
        secondUrls = response.xpath('//ul[@class = "clearfix"]/li/a/@href').extract()
        secondTitle = response.xpath('//ul[@class = "clearfix"]/li/a/text()').extract()

        for p_title, p_url in zip(parentTitle, parentUrls):
            # Directory for this first-level category; create it if missing.
            parentFileName = "./數據/" + p_title
            if not os.path.exists(parentFileName):
                os.makedirs(parentFileName)

            for s_title, s_url in zip(secondTitle, secondUrls):
                # Keep only second-level links nested under the first-level
                # URL; links that don't match are deliberately dropped.
                if not s_url.startswith(p_url):
                    continue

                secondFileName = parentFileName + '/' + s_title
                if not os.path.exists(secondFileName):
                    os.makedirs(secondFileName)

                item = IfengprojectItem()
                item['parentTitle'] = p_title
                item['parentUrls'] = p_url
                item['secondUrls'] = s_url
                item['secondTitle'] = s_title
                item['newsFileName'] = secondFileName

                # Debug log of every directory assigned to an item.
                # NOTE(review): the literal file name 'secondFileName.html'
                # looks like it was meant to be the secondFileName variable
                # — confirm intent before changing it.
                with open('secondFileName.html', 'a+', encoding='utf-8') as f:
                    f.write(item['newsFileName'] + " ** ")

                yield scrapy.Request(url=item['secondUrls'],
                                     meta={'meta_1': item},
                                     callback=self.second_parse)

    def second_parse(self, response):
        """Collect article links on a second-level page and schedule one
        request per qualifying article URL."""
        # Item built in parse(), carried through the request meta.
        meta_1 = response.meta['meta_1']
        # Two page layouts are in use on ifeng.com; match both link patterns.
        xpathStr = "//div[@class='juti_list']/h3/a/@href"
        xpathStr += " | " + "//div[@class='box_list clearfix']/h2/a/@href"
        newsUrls = response.xpath(xpathStr).extract()

        for url in newsUrls:
            # Keep only .shtml articles that live under the first-level URL.
            if not (url.endswith('.shtml') and url.startswith(meta_1['parentUrls'])):
                continue
            item = IfengprojectItem()
            for key in ('parentUrls', 'parentTitle', 'secondUrls',
                        'secondTitle', 'newsFileName'):
                item[key] = meta_1[key]
            item['newsUrls'] = url
            yield scrapy.Request(url=item['newsUrls'],
                                 meta={'meta_2': item},
                                 callback=self.news_parse)

    def news_parse(self, response):
        """Extract headline, body text and publication time of one article
        and yield the completed item."""
        item = response.meta['meta_2']
        head = response.xpath("//title/text()")[0].extract()
        content_list = response.xpath('//div[@id="main_content"]/p/text() | //div[@id="yc_con_txt"]/p/text()').extract()

        # Some layouts carry no recognizable timestamp; fall back to a marker.
        time_nodes = response.xpath("//span[@class='ss01']/text() | //div[@class='yc_tit']/p/span/text()")
        if time_nodes:
            newsPublicTime = time_nodes[0].extract()
        else:
            newsPublicTime = "時間未統計出來"

        item['newsHead'] = head
        # str.join is linear; the original += loop was quadratic.
        item['newsContent'] = "".join(content_list)
        item['newsPublicTime'] = newsPublicTime
        yield item
3、pipelines.py文件:執行爬取數據的存儲操作
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
class IfengprojectPipeline(object):
    """Write each crawled article's body text to
    ``<newsFileName>/<publish-time><headline>.txt``."""

    # Characters illegal in file names (plus CR/LF); each run is collapsed
    # to a single '-' when building the output file name.
    _ILLEGAL = re.compile(r'[\\/:*?"<>|\r\n]+')

    def process_item(self, item, spider):
        """Persist one item to disk and pass it to the next pipeline stage.

        Expects ``newsHead``, ``newsContent``, ``newsPublicTime`` and
        ``newsFileName`` (an existing directory) on the item.
        """
        # Build a filesystem-safe name from publish time + headline.
        # (The original also computed an unused `headStr` — dead code removed.)
        filename = item['newsPublicTime'] + item['newsHead'].rstrip()
        filename = self._ILLEGAL.sub('-', filename) + ".txt"
        # Context manager guarantees the handle is closed even on error.
        with open(item['newsFileName'] + '/' + filename, "w", encoding='utf-8') as fp:
            fp.write(item['newsContent'])
        # Scrapy convention: return the item so later pipelines receive it.
        return item
4、settings.py文件:配置文件,打開管道處理爬蟲數據的開關
# Enable the item pipeline; 300 is its order value (0-1000, lower runs first).
ITEM_PIPELINES = {
    'iFengProject.pipelines.IfengprojectPipeline': 300,
}
結果:爬取結果展示
補充:由於每一個新聞鏈接的源碼格式不統一,在爬取過程中,設置的規則有限,並不能覆蓋所有新聞鏈接,因此有些文件夾/文件爲空。