I. Task Requirements
1. Crawl articles and their comments from a news website.
2. Cover no fewer than 100,000 news pages.
3. Each news page and its comments should be refreshed within one day.
II. Functional Design
1. Design a web crawler that can crawl all pages of a specified website and extract the article and comment content from them.
2. Run the crawler on a schedule so that the data is updated daily.
III. System Architecture
First, a brief introduction to Scrapy, the crawling framework used here.
In its architecture diagram, the green lines show the data flow:
(1) Starting from the initial URLs, the Scheduler hands requests to the Downloader to fetch;
(2) the downloaded responses are passed to the Spider for parsing; the Spider is where the crawler's core logic lives;
(3) the Spider produces two kinds of results: links that need further crawling, which are passed back (through the middleware) to the Scheduler, and data to be saved, which is sent to the Item Pipeline for processing and storage;
(4) finally, all the data is exported and saved to a file.
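As a concrete illustration of this data flow, a minimal toy spider (not part of this project; the URL is only an example) could look like the following: yielded dicts travel to the Item Pipeline, while yielded Requests go back to the Scheduler.
import scrapy

class MinimalSpider(scrapy.Spider):
    """Toy spider used only to illustrate the Scrapy data flow."""
    name = "minimal"
    start_urls = ["http://news.163.com/"]

    def parse(self, response):
        # data to be saved: handed to the Item Pipeline
        yield {"url": response.url,
               "title": response.xpath("//title/text()").extract_first()}
        # links to crawl next: handed back to the Scheduler
        for href in response.xpath("//a/@href").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)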
IV. Code Implementation
Spider
# -*- coding:utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from ..items import newsItem
from scrapy.linkextractors import LinkExtractor
import re, requests, json
from scrapy.selector import Selector
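# module-level counter used to assign an incremental id to each crawled article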
count = 0
class news163_Spider(CrawlSpider):
    # name of the NetEase news spider
name = "163news"
    # masquerade as a regular browser
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
}
    # allow the whole 163.com domain
allowed_domains = [
"163.com"
]
    # the news channel front page
start_urls = [
'http://news.163.com/'
]
    # URL patterns the crawler may follow, e.g. http://news.163.com/\d\d\d\d\d(/([\w\._+-])*)*$
rules = [
Rule(LinkExtractor(
allow=(
('http://news\.163\.com/.*$'),
('http://ent\.163\.com/.*$'),
('http://money\.163\.com/.*$'),
('http://war\.163\.com/.*$'),
('http://sport\.163\.com/.*$'),
('http://tech\.163\.com/.*$'),
('http://fashion\.163\.com/.*$'),
('http://auto\.163\.com/.*$'),
('http://jiankang\.163\.com/.*$')
),
deny = ('http://.*.163.com/photo.*$')
),
callback="parse_item",
follow=True)
]
def parse_item(self, response):
        # response is the HTTP response for the current URL
article = Selector(response)
article_url = response.url
global count
        # determine the page layout
        # newer NetEase article pages, e.g. http://news.163.com/05-17/
if get_category(article) == 1:
articleXpath = '//*[@id="epContentLeft"]'
if article.xpath(articleXpath):
titleXpath = '//*[@id="epContentLeft"]/h1/text()'
dateXpath = '//*[@id="epContentLeft"]/div[1]/text()'
contentXpath = '//*[@id="endText"]'
news_infoXpath ='//*[@id="post_comment_area"]/script[3]/text()'
                # title
if article.xpath(titleXpath):
news_item = newsItem()
news_item['url'] = article_url
get_title(article, titleXpath, news_item)
                    # date
if article.xpath(dateXpath):
get_date(article, dateXpath, news_item)
                    # body text
if article.xpath(contentXpath):
get_content(article, contentXpath, news_item)
count = count + 1
news_item['id'] = count
                        # comments
try:
comment_url = get_comment_url(article, news_infoXpath)
                            # fetch and parse the comments
comments = get_comment(comment_url, news_item)[1]
news_item['comments'] = comments
except:
news_item['comments'] = ' '
news_item['heat'] = 0
yield news_item
        # older NetEase article pages, e.g. http://news.163.com/40706/
if get_category(article) == 2:
articleXpath = '/html/body/table[9]/tr/td[1]'
if article.xpath(articleXpath):
titleXpath = '/html/body/table[9]/tr/td[1]/table[1]/tr[1]/td/text()'
dateXpath = '/html/body/table[9]/tr/td[1]/table[1]/tr[2]/td[2]/table/tbody/tr[2]/td[1]/text()[1]'
contentXpath = '//*[@id="content"]'
news_item = newsItem()
news_item['url'] = article_url
                # title
if article.xpath(titleXpath):
get_title(article, titleXpath, news_item)
                # date
if article.xpath(dateXpath):
get_date(article, dateXpath, news_item)
                # body text
if article.xpath(contentXpath):
get_content(article, contentXpath, news_item)
count = count + 1
news_item['id'] = count
news_item['heat'] = 0
news_item['comments'] = ' '
yield news_item
'''Generic title extraction helper'''
def get_title(article, titleXpath, news_item):
    # title
try:
article_title = article.xpath(titleXpath).extract()[0]
article_title = article_title.replace('\n', '')
article_title = article_title.replace('\r', '')
article_title = article_title.replace('\t', '')
article_title = article_title.replace(' ', '')
news_item['title'] = article_title
except:
news_item['title'] = ' '
'''Generic date extraction helper'''
def get_date(article, dateXpath, news_item):
    # publication time
try:
article_date = article.xpath(dateXpath).extract()[0]
        pattern = re.compile("(\d.*\d)")  # regex to pick out the news timestamp
article_datetime = pattern.findall(article_date)[0]
#article_datetime = datetime.datetime.strptime(article_datetime, "%Y-%m-%d %H:%M:%S")
news_item['date'] = article_datetime
except:
news_item['date'] = '2010-10-01 17:00:00'
'''Page-layout classification helper'''
def get_category(article):
if article.xpath('//*[@id="epContentLeft"]'):
        case = 1  # newer NetEase article layout
return case
elif article.xpath('/html/body/table[9]/tr/td[1]'):
        case = 2  # early-2000s NetEase article layout
return case
'''Character filtering helper'''
def str_replace(content):
# article_content = ' '.join(content)
# rule = re.compile('\w')
try:
article_content = re.sub('[\sa-zA-Z\[\]!/*(^)$%~@#…&¥—+=_<>.{}\'\-:;"‘’|]', '', content)
return article_content
except:
return content
'''Generic body-text extraction helper'''
def get_content(article, contentXpath, news_item):
try:
content_data = article.xpath(contentXpath )
article_content = content_data.xpath('string(.)').extract()[0]
article_content = str_replace(article_content)
news_item['content'] = article_content
        # abstract: the first 100 characters of the body
try:
abstract = article_content[0:100]
news_item['abstract'] = abstract
        except Exception:
news_item['abstract'] = article_content
# except 2:
# index = article_content.find('。')
# abstract = article_content[0:index]
# news_item['abstract'] = abstract
except:
news_item['content'] = ' '
news_item['abstract'] = ' '
'''Comment-URL construction helper'''
def get_comment_url(article,news_infoXpath):
news_info = article.xpath(news_infoXpath)
news_info_text = news_info.extract()[0]
pattern_productKey = re.compile("\"productKey\" :.*")
productKey_text = pattern_productKey.findall(news_info_text)[0]
productKey = re.findall(r"\"productKey\".*\"(.*)\"", productKey_text)
pattern_docId = re.compile("\"docId\" :.*")
docId_text = pattern_docId.findall(news_info_text)[0]
docId = re.findall(r"\"docId\".*\"(.*)\"", docId_text)
comment_url = 'http://comment.news.163.com/api/v1/products/' + productKey[0] + '/threads/' + docId[0] + '/comments/newList?offset=0'
return comment_url
'''Comment fetching and parsing helper'''
def get_comment(comment_url, news_item):
comments = []
comment_id = 0
try:
comment_data = requests.get(comment_url).text
js_comment = json.loads(comment_data)
try:
heat = js_comment['newListSize']
news_item['heat'] = heat
js_comments = js_comment['comments']
for each,value in js_comments.items():
comment_id += 1
comments_dict = {}
                # comment id
comments_dict['id'] = comment_id
                # commenter's username
try:
comments_dict['username'] = value['user']['nickname']
except:
comments_dict['username'] = '匿名用戶'
try:
                    # comment timestamp (string in datetime format)
date_time = value['createTime']
#date_time = datetime.datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
comments_dict['date_time'] = date_time
except:
comments_dict['date_time'] = news_item['date']
                # comment content
ori_content = value['content']
content = str_replace(ori_content)
comments_dict['content'] = content
comments.append(comments_dict)
if comments:
return heat, comments
else:
return 0,''
except:
return 0, ''
except:
return 0, ''
The code is explained part by part below.
rules = [
Rule(LinkExtractor(
allow=(
('http://news\.163\.com/.*$'),
('http://ent\.163\.com/.*$'),
('http://money\.163\.com/.*$'),
('http://war\.163\.com/.*$'),
('http://sport\.163\.com/.*$'),
('http://tech\.163\.com/.*$'),
('http://fashion\.163\.com/.*$'),
('http://auto\.163\.com/.*$'),
('http://jiankang\.163\.com/.*$')
),
deny = ('http://.*.163.com/photo.*$')
),
callback="parse_item",
follow=True)
]
These are the spider's link-following rules: the LinkExtractor finds and visits links according to the URL patterns given in allow and deny (only certain channels are allowed, and photo-gallery URLs are excluded). callback="parse_item" means the response of each matched URL is handed to the parse_item method for processing, and follow=True means that, after the start pages, the spider keeps following the links found on every crawled page until the stop condition is met; with follow=False it would only follow links on the start pages and go no further.
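As a side note, the allow/deny behaviour can be checked outside the spider. The following is only a small sketch (the HTML snippet and URLs are invented for illustration): the first link matches an allow pattern, while the second is rejected by the deny pattern.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = (b'<a href="http://news.163.com/18/0101/00/TEST0001.html">article</a>'
        b'<a href="http://news.163.com/photoview/00AO0001/1.html">photo</a>')
response = HtmlResponse(url='http://news.163.com/', body=html, encoding='utf-8')

extractor = LinkExtractor(allow=('http://news\.163\.com/.*$',),
                          deny=('http://.*.163.com/photo.*$',))
for link in extractor.extract_links(response):
    print(link.url)   # only the first, non-photo link should be printed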
def parse_item(self, response):
        # response is the HTTP response for the current URL
article = Selector(response)
article_url = response.url
global count
        # determine the page layout
        # newer NetEase article pages, e.g. http://news.163.com/05-17/
if get_category(article) == 1:
articleXpath = '//*[@id="epContentLeft"]'
if article.xpath(articleXpath):
titleXpath = '//*[@id="epContentLeft"]/h1/text()'
dateXpath = '//*[@id="epContentLeft"]/div[1]/text()'
contentXpath = '//*[@id="endText"]'
news_infoXpath ='//*[@id="post_comment_area"]/script[3]/text()'
                # title
if article.xpath(titleXpath):
news_item = newsItem()
news_item['url'] = article_url
get_title(article, titleXpath, news_item)
                    # date
if article.xpath(dateXpath):
get_date(article, dateXpath, news_item)
                    # body text
if article.xpath(contentXpath):
get_content(article, contentXpath, news_item)
count = count + 1
news_item['id'] = count
                        # comments
try:
comment_url = get_comment_url(article, news_infoXpath)
                            # fetch and parse the comments
comments = get_comment(comment_url, news_item)[1]
news_item['comments'] = comments
except:
news_item['comments'] = ' '
news_item['heat'] = 0
yield news_item
NetEase article pages do not all share one structure, so the crawl has to handle them case by case. Based on the XPath hierarchy of a page, the site can be divided into several types; only the two main ones are handled here. The XPath of each piece of content is found with the browser's F12 developer tools and is then used to locate that content.
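The XPaths can be verified before running the full crawl, either interactively with the scrapy shell command or with a small script. A minimal sketch (the article URL is a placeholder to be replaced with a real page; the XPaths are the ones used above for the newer layout):
import requests
from scrapy.selector import Selector

html = requests.get('http://news.163.com/<some-article-page>',  # placeholder URL
                    headers={'User-Agent': 'Mozilla/5.0'}).text
page = Selector(text=html)
print(page.xpath('//*[@id="epContentLeft"]/h1/text()').extract_first())      # title
print(page.xpath('//*[@id="epContentLeft"]/div[1]/text()').extract_first())  # date line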
'''Comment-URL construction helper'''
def get_comment_url(article,news_infoXpath):
news_info = article.xpath(news_infoXpath)
news_info_text = news_info.extract()[0]
pattern_productKey = re.compile("\"productKey\" :.*")
productKey_text = pattern_productKey.findall(news_info_text)[0]
productKey = re.findall(r"\"productKey\".*\"(.*)\"", productKey_text)
pattern_docId = re.compile("\"docId\" :.*")
docId_text = pattern_docId.findall(news_info_text)[0]
docId = re.findall(r"\"docId\".*\"(.*)\"", docId_text)
comment_url = 'http://comment.news.163.com/api/v1/products/' + productKey[0] + '/threads/' + docId[0] + '/comments/newList?offset=0'
return comment_url
'''Comment fetching and parsing helper'''
def get_comment(comment_url, news_item):
comments = []
comment_id = 0
try:
comment_data = requests.get(comment_url).text
js_comment = json.loads(comment_data)
try:
heat = js_comment['newListSize']
news_item['heat'] = heat
js_comments = js_comment['comments']
for each,value in js_comments.items():
comment_id += 1
comments_dict = {}
                # comment id
comments_dict['id'] = comment_id
                # commenter's username
try:
comments_dict['username'] = value['user']['nickname']
except:
comments_dict['username'] = '匿名用戶'
try:
                    # comment timestamp (string in datetime format)
date_time = value['createTime']
#date_time = datetime.datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
comments_dict['date_time'] = date_time
except:
comments_dict['date_time'] = news_item['date']
                # comment content
ori_content = value['content']
content = str_replace(ori_content)
comments_dict['content'] = content
comments.append(comments_dict)
if comments:
return heat, comments
else:
return 0,''
except:
return 0, ''
except:
return 0, ''
The comments are generated dynamically with JavaScript and do not appear in the page source, so they cannot be extracted with XPath. The trick used here is to use the browser's developer tools to find the network response that loads the comments, work out how that comment-API URL is composed, find the corresponding keys in the article page source, and assemble the URL from them.
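For reference, the assembled comment URL can be tried on its own. A minimal sketch (productKey and docId are placeholders that would normally be pulled out of the article page source, as get_comment_url does above; the field names match those read by get_comment):
import json
import requests

product_key = '<productKey-from-page-source>'   # placeholder
doc_id = '<docId-from-page-source>'             # placeholder
url = ('http://comment.news.163.com/api/v1/products/' + product_key +
       '/threads/' + doc_id + '/comments/newList?offset=0')

data = json.loads(requests.get(url).text)
print(data.get('newListSize'))                       # total comment count ("heat")
for cid, comment in data.get('comments', {}).items():
    print(comment['user'].get('nickname', '匿名用戶'), comment['content'])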
Item
from scrapy import Item, Field
class newsItem(Item):  # one news article
    # article title
    title = Field()
    # publication time
    date = Field()
    # body text
    content = Field()
    # abstract (the first 100 characters of the body)
    abstract = Field()
    # article heat (number of comment participants)
    heat = Field()
    # ID
    id = Field()
    # URL
    url = Field()
    # comments (a list of comment dicts)
    comments = Field()
The Item holds the different parts of a crawled news article; it is passed into the pipeline, which formats it and writes it out to a file.
Pipeline
import json
import codecs
class ScrapyspiderPipeline(object):
def __init__(self):
self.file = codecs.open('xxxx.json', 'w', encoding='utf-8')
def process_item(self, item, spider):
line = json.dumps(dict(item), ensure_ascii=False) + "\n"
self.file.write(line)
return item
    def close_spider(self, spider):
self.file.close()
The pipeline writes the data stored in each Item out to a file, as JSON or CSV. By default nothing is written: the pipeline has to be enabled in the project settings (see the note after the settings explanations below).
settings
BOT_NAME = 'scrapyspider'
SPIDER_MODULES = ['scrapyspider.spiders']
NEWSPIDER_MODULE = 'scrapyspider.spiders'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 128
COOKIES_ENABLED = False
FEED_EXPORT_ENCODING = 'utf-8'
DOWNLOAD_DELAY = 0.01
CLOSESPIDER_ITEMCOUNT = 50100
DOWNLOAD_TIMEOUT = 10
COOKIES_ENABLED = False
Disable cookies, to reduce the chance of getting banned.
DOWNLOAD_DELAY = 0.01
Download delay, to ease the load on the server and further reduce the chance of getting banned.
CLOSESPIDER_ITEMCOUNT = 50100
The number of items to scrape; once this count is reached the spider stops following links, writes out the file, and exits.
DOWNLOAD_TIMEOUT = 10
Timeout for downloading a single page; pages that exceed it are dropped.
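One thing worth stating explicitly: the pipeline shown earlier only runs if it is registered in the settings. Assuming the project layout implied by BOT_NAME = 'scrapyspider' (so the class lives in scrapyspider/pipelines.py), the registration would look roughly like this:
# register the item pipeline; the number is its priority (lower runs earlier)
ITEM_PIPELINES = {
    'scrapyspider.pipelines.ScrapyspiderPipeline': 300,
}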
Launch
import time
import datetime
from scrapy import cmdline
def runnews(h, m):
    '''h is the scheduled hour, m the scheduled minute'''
while True:
        # check whether the scheduled time has been reached, e.g. 0:00
while True:
now = datetime.datetime.now()
if (now.hour == h and now.minute >= m) or (now.hour > h):
break
            # check again after 10 seconds
time.sleep(10)
        cmdline.execute("scrapy crawl 163news -o xxxx.csv".split())
runnews(8, 38)
Scheduled crawling module.
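One caveat about the loop above: scrapy.cmdline.execute() does not return once the crawl finishes (it ends the process), so the outer while loop never actually reaches a second day. A rough alternative sketch that launches each daily crawl as a child process (spider name and output file taken from this project):
import datetime
import subprocess
import time

def run_daily(h, m):
    '''Launch the crawl once per day at roughly h:m.'''
    while True:
        now = datetime.datetime.now()
        if (now.hour, now.minute) >= (h, m):
            subprocess.call(['scrapy', 'crawl', '163news', '-o', 'xxxx.csv'])
            # sleep until shortly before the next day's scheduled time
            time.sleep(23 * 60 * 60)
        time.sleep(10)

# run_daily(8, 38)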