This article walks through using Python's Scrapy framework to crawl the Sunshine Hotline politics Q&A platform (陽光熱線問政平臺). The example code is explained in detail, and should be a useful reference for study or work.
Goal: for every post in the platform's problem-report section, crawl the title, content, number, and post URL.
The CrawlSpider version works as follows:
Create the crawler project dongguan
scrapy startproject dongguan
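For reference, startproject generates a skeleton roughly like this (newer Scrapy versions add a middlewares.py as well):
dongguan/
    scrapy.cfg
    dongguan/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py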
Set up the items.py file
# -*- coding: utf-8 -*-
import scrapy

class DongguanItem(scrapy.Item):
    # link of each post on the page
    url = scrapy.Field()
    # post title
    title = scrapy.Field()
    # post number
    number = scrapy.Field()
    # post content
    content = scrapy.Field()
In the spiders directory, create and write the spider file sun.py
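Optionally, Scrapy can scaffold the CrawlSpider skeleton for you before you fill it in (the -t crawl template generates the Rule/LinkExtractor boilerplate):
scrapy genspider -t crawl dg wz.sun0769.com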
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem

class SunSpider(CrawlSpider):
    name = 'dg'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/html/top/report.shtml']

    # rules is a collection of Rule objects, and every rule is applied to each
    # response. If the web server has an anti-crawler trick such as returning
    # fake URLs, the process_links argument of Rule can point at a custom
    # function that repairs each URL before it is requested (see the sketch
    # after this file).
    rules = (
        # Every URL gets a unique fingerprint and each crawl keeps a dedup
        # queue, so matched links are only requested once.
        # A Rule without a callback follows matched links by default: each
        # matched link is requested, and links matched inside that response
        # are followed in turn, just without a callback to process the data.
        # With follow=False the rule only extracts links from the current
        # page; with follow=True (the default when there is no callback) it
        # keeps following matches until none are left.
        Rule(LinkExtractor(allow="page=")),
        Rule(LinkExtractor(allow=r'http://wz.sun0769.com/html/question/\d+/\d+\.shtml'), callback='parse_item')
    )

    def parse_item(self, response):
        print(response.url)
        item = DongguanItem()
        item['url'] = response.url
        # the <strong> text holds both the title and the post number, with the
        # number after the last colon
        item['title'] = response.xpath('//div[@class="pagecenter p3"]//strong/text()').extract()[0]
        item['number'] = item['title'].split(' ')[-1].split(':')[-1]
        # posts with images carry an extra div with class="contentext"; when it
        # is absent, the content sits directly in the "c1 text14_2" div
        if len(response.xpath('//div[@class="contentext"]')) == 0:
            item['content'] = ''.join(response.xpath('//div[@class="c1 text14_2"]/text()').extract())
        else:
            item['content'] = ''.join(response.xpath('//div[@class="contentext"]/text()').extract())
        yield item
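As mentioned in the rules comment, a process_links handler can repair links before they are requested. A minimal sketch: the Rule wiring is standard Scrapy, but the replace() calls are placeholders for whatever site-specific fix the target server actually requires:
# attach the handler to a rule by name, e.g.:
# Rule(LinkExtractor(allow="page="), process_links='deal_links', follow=True),

def deal_links(self, links):
    # links is the list of Link objects the LinkExtractor matched;
    # repair each URL here before Scrapy requests it (the replacement
    # below is a hypothetical site-specific fix)
    for link in links:
        link.url = link.url.replace('?', '&').replace('Type&', 'Type?')
    return links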
Write the pipeline file pipelines.py
# -*- coding: utf-8 -*-
import json

class DongguanPipeline(object):
    def __init__(self):
        # binary mode, so every item has to be encoded by hand before writing
        self.file = open('dongguan.json', 'wb')

    def process_item(self, item, spider):
        content = (json.dumps(dict(item), ensure_ascii=False) + '\n').encode('utf-8')
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
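As an aside, Scrapy also bundles a line-oriented JSON exporter that does the same job as the hand-rolled json.dumps above. A minimal sketch (the class name DongguanExporterPipeline is ours, for illustration only):
# Alternative pipeline built on Scrapy's bundled exporter
from scrapy.exporters import JsonLinesItemExporter

class DongguanExporterPipeline(object):
    def open_spider(self, spider):
        # the exporter expects a binary file handle
        self.file = open('dongguan.jl', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()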
Write the settings.py file
# -*- coding: utf-8 -*-
BOT_NAME = 'dongguan'
SPIDER_MODULES = ['dongguan.spiders']
NEWSPIDER_MODULE = 'dongguan.spiders'
# The log file is saved in the current directory by default; the level below
# means messages at INFO and above are recorded
LOG_FILE = 'dongguan.log'
LOG_LEVEL = 'INFO'
# crawl depth limit
# DEPTH_LIMIT = 1
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dongguan (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'dongguan.pipelines.DongguanPipeline': 300,
}
Test-run the crawler by executing the command in a terminal (anywhere inside the project directory works):
scrapy crawl dg
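For a quick check you can also bypass the pipeline entirely and let Scrapy's built-in feed export dump the items:
scrapy crawl dg -o items.json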
The plain Spider version works as follows:
Create the crawler project newdongguan
scrapy startproject newdongguan
Set up the items.py file
# -*- coding: utf-8 -*-
import scrapy

class NewdongguanItem(scrapy.Item):
    # link of each post on the page
    url = scrapy.Field()
    # post title
    title = scrapy.Field()
    # post number
    number = scrapy.Field()
    # post content
    content = scrapy.Field()
In the spiders directory, create and write the spider file newsun.py
# -*- coding: utf-8 -*-
import scrapy
from newdongguan.items import NewdongguanItem

class NewsunSpider(scrapy.Spider):
    name = 'ndg'
    # Restricts which domains may be crawled. Optional: without it the crawl
    # is not limited by domain, which can let the spider run out of control.
    allowed_domains = ['wz.sun0769.com']
    offset = 0
    url = 'http://wz.sun0769.com/index.php/question/report?page=' + str(offset)
    start_urls = [url]

    def parse(self, response):
        link_list = response.xpath("//a[@class='news14']/@href").extract()
        for each in link_list:
            # request each post on the page and hand the response to
            # deal_link, which extracts the fields for the pipeline
            yield scrapy.Request(each, callback=self.deal_link)
        self.offset += 30
        if self.offset <= 124260:
            url = 'http://wz.sun0769.com/index.php/question/report?page=' + str(self.offset)
            # request the next listing page and let parse handle it again
            # (see the alternative pagination sketch after this file)
            yield scrapy.Request(url, callback=self.parse)

    # extract the fields from each post page and send them to the pipeline
    def deal_link(self, response):
        item = NewdongguanItem()
        item['url'] = response.url
        item['title'] = response.xpath("//div[@class='pagecenter p3']//strong[@class='tgray14']/text()").extract()[0]
        item['number'] = item['title'].split(' ')[-1].split(':')[-1]
        if len(response.xpath("//div[@class='contentext']")) == 0:
            item['content'] = ''.join(response.xpath("//div[@class='c1 text14_2']/text()").extract())
        else:
            item['content'] = ''.join(response.xpath("//div[@class='contentext']/text()").extract())
        yield item
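One caveat with the spider above: the offset ceiling (124260) is hard-coded, so the crawl silently stops once the site grows past it. A common alternative is to follow the pager's own next link instead. A sketch, where the XPath for the next-page anchor is an assumption that must be checked against the real listing markup:
# inside parse(), instead of computing the next offset by hand:
next_page = response.xpath("//a[contains(text(), '>')]/@href").extract_first()
if next_page:
    # urljoin resolves a relative href against the current page URL
    yield scrapy.Request(response.urljoin(next_page), callback=self.parse)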
Write the pipeline file pipelines.py
# -*- coding: utf-8 -*-
import codecs
import json

class NewdongguanPipeline(object):
    def __init__(self):
        # codecs.open writes with a fixed encoding, so each item no longer
        # has to be encoded by hand before writing
        self.file = codecs.open('newdongguan.json', 'w', encoding='utf-8')
        # previous approach:
        # self.file = open('newdongguan.json', 'w')

    def process_item(self, item, spider):
        print(item['title'])
        content = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # previous approach:
        # self.file.write(content.encode('utf-8'))
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
Write the settings.py file
# -*- coding: utf-8 -*-
BOT_NAME = 'newdongguan'
SPIDER_MODULES = ['newdongguan.spiders']
NEWSPIDER_MODULE = 'newdongguan.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'newdongguan (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'newdongguan.pipelines.NewdongguanPipeline': 300,
}
Test-run the crawler by executing the command in a terminal:
scrapy crawl ndg