爬取字段信息有:
- 關鍵詞
- 微博ID
- 微博內容信息
- 微博讚的個數
- 微博轉發個數
- 微博評論個數
- 轉發微博的轉發原因
- 微博日期
- 轉發源ID
- 原微博的贊個數
- 原微博的評論個數
- 原微博的轉發個數
- 存入數據庫的ID值(可忽略)
spiders文件夾下的microBlogSpider.py裏這樣寫:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider, Request, FormRequest
from scrapy.selector import Selector
import datetime
import random
from blogSpider.items import microBlogItem, keyWordItem
from blogSpider.items import keyWordItem
import json
class MicroblogspiderSpider(scrapy.Spider):
    """Search weibo.cn for a keyword day by day and yield one item per post.

    For every day in the configured date range, the spider requests the
    "hot" search results for the keyword and parses the first result page.
    Four post layouts are handled, distinguished by the number of child
    <div>s inside a post container:

    * 3 divs                          -- forwarded post with picture
    * 2 divs, first span class ``cmt`` -- forwarded post without picture
    * 2 divs, first span class ``ctt`` -- original post with picture
    * 1 div                           -- original post without picture
    """

    name = 'microBlogSpider'
    allowed_domains = ['weibo.cn']
    search_url = 'https://weibo.cn/search/mblog'
    # Upper bound on result pages per day (currently only page 1 is fetched).
    max_page = 100
    # Paste the Cookie header of a logged-in weibo.cn session here.
    myCookie = 'xxxxxxxxxxx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.cn',
        'Origin': 'https://weibo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }

    def _cookie_dict(self):
        """Turn the raw ``myCookie`` header string into a name->value dict.

        Skips empty fragments and splits on the first ``=`` only, so values
        containing ``=`` survive.  (The previous inline parsing dropped the
        last name=value pair and kept leading spaces in the names.)
        """
        cookie = {}
        for pair in self.myCookie.split(';'):
            pair = pair.strip()
            if '=' in pair:
                key, _, value = pair.partition('=')
                cookie[key] = value
        return cookie

    def start_requests(self):
        """Emit one search request per day in [startTime, endTime]."""
        keyword = '羅志祥'  # fixed: the opening quote was missing
        startTime = '2020-05-01'
        endTime = '2020-05-02'
        start_time = datetime.datetime.strptime(startTime, '%Y-%m-%d')
        end_time = datetime.datetime.strptime(endTime, '%Y-%m-%d')
        cookie = self._cookie_dict()
        while start_time <= end_time:
            day = start_time.strftime('%Y%m%d')  # weibo expects YYYYMMDD
            url = ('{url}?hideSearchFrame=&keyword={keyword}'
                   '&starttime={st}&endtime={et}&sort=hot').format(
                       url=self.search_url, keyword=keyword, st=day, et=day)
            yield scrapy.FormRequest(
                url,
                callback=self.parse_index,
                cookies=cookie,
                headers=self.headers,
                meta={
                    'time': start_time,
                    'keyWord': keyword,
                },
            )
            start_time += datetime.timedelta(days=1)

    def parse_index(self, response):
        """Follow the day's result listing to its first page."""
        if not response.body:
            return
        url = '{url}&page={page}'.format(url=response.url, page=1)
        yield FormRequest(
            url,
            headers=self.headers,
            callback=self.getInfo,
            cookies=self._cookie_dict(),
            meta={
                'time': response.meta['time'],
                'keyWord': response.meta['keyWord'],
            },
        )

    @staticmethod
    def _bracket_count(text):
        """Return the number between '[' and ']', e.g. '贊[8322]' -> '8322'."""
        return text[text.index('[') + 1:text.index(']')]

    @staticmethod
    def _joined_text(fragments):
        """Concatenate text fragments with all whitespace removed."""
        return ''.join(''.join(fragment.split()) for fragment in fragments)

    def _extract_content(self, div, xpath):
        """Whitespace-free post text under *xpath*, minus a leading colon."""
        text = self._joined_text(div.xpath(xpath).extract())
        if text.startswith(':'):
            text = text[1:]
        return text

    def _fill_counts(self, item, fragments):
        """Fill like/comment/forward counts from fragments like '贊[8322]'."""
        for fragment in fragments:
            fragment = ''.join(fragment.split())
            if fragment.startswith('贊['):
                item['numOfSupport'] = self._bracket_count(fragment)
            elif fragment.startswith('評論['):
                item['numOfComments'] = self._bracket_count(fragment)
            elif fragment.startswith('轉發['):
                item['numOfForwarding'] = self._bracket_count(fragment)

    def _parse_forwarded(self, item, div, content_div, stat_div, reason_div):
        """Parse a forwarded post; return False when the source link is absent.

        ``content_div`` holds the forwarded text, ``stat_div`` the original
        post's like/forward/comment counts, ``reason_div`` the forwarding
        reason plus this post's own counts.
        """
        source = div.xpath(
            'div[%d]/span[@class="cmt"]/a/text()' % content_div).extract()
        if not source:
            return False
        item['fromBlogId'] = source[0]
        item['contentInfo'] = self._extract_content(
            div, 'div[%d]/span[@class="ctt"]/text()' % content_div)
        # e.g. ['贊[8322]', '原文轉發[2927]'] -- the last two entries are the
        # like and forward counts of the original post.
        stats = div.xpath(
            'div[%d]/span[@class="cmt"]/text()' % stat_div).extract()
        item['originalBlogSupportNum'] = self._bracket_count(stats[-2])
        item['originalBlogForwardingNum'] = self._bracket_count(stats[-1])
        # e.g. ['原文評論[333]']
        comments = div.xpath('div[%d]/a[last()]/text()' % stat_div).extract()
        item['originalBlogCommentsNum'] = self._bracket_count(comments[0])
        item['forwardingReason'] = self._joined_text(
            div.xpath('div[%d]/text()' % reason_div).extract())
        self._fill_counts(
            item, div.xpath('div[%d]/*/text()' % reason_div).extract())
        return True

    def getInfo(self, response):
        """Yield one microBlogItem per post div (id starting with 'M_')."""
        page = Selector(response)
        for div in page.xpath('//div[starts-with(@id,"M_")]'):
            item = microBlogItem()
            item['keyWord'] = response.meta['keyWord']
            item['blogDate'] = str(response.meta['time']).split(' ')[0]
            item['originalBlogSupportNum'] = ''
            item['originalBlogCommentsNum'] = ''
            item['originalBlogForwardingNum'] = ''
            item['forwardingReason'] = ''
            item['fromBlogId'] = ''
            try:
                item['microBlogId'] = div.css('a.nk ::text').extract()[0]
                child_divs = div.xpath('div')
                if len(child_divs) == 3:
                    # Forwarded post with picture.
                    if not self._parse_forwarded(item, div, 1, 2, 3):
                        continue
                elif len(child_divs) == 2:
                    first_span = div.xpath('div[1]/span[1]/@class').extract()[0]
                    if first_span == 'cmt':
                        # Forwarded post without picture.
                        if not self._parse_forwarded(item, div, 1, 1, 2):
                            continue
                    elif first_span == 'ctt':
                        # Original post with picture.
                        item['contentInfo'] = self._extract_content(
                            div, 'div[1]/span[@class="ctt"]/text()')
                        self._fill_counts(
                            item, div.xpath('div[2]/a/text()').extract())
                elif len(child_divs) == 1:
                    # Original post without picture.
                    item['contentInfo'] = self._extract_content(
                        div, 'div/span[@class="ctt"]/text()')
                    self._fill_counts(
                        item, div.xpath('div/a/text()').extract())
            except (IndexError, ValueError):
                # Malformed post markup: skip this div instead of letting the
                # exception abort parsing of the whole page.
                continue
            yield item
注意:myCookie 填寫你登錄之後的 cookie 信息就行,具體操作步驟如下:
爬蟲網站登錄之後,F12打開瀏覽器網頁詳細界面,按照下面的點擊,然後複製cookie粘貼到爬蟲文件裏即可
items.py裏這樣寫:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class microBlogItem(scrapy.Item):
    '''
    One scraped weibo post (微博信息).
    '''
    # Name of the MongoDB collection the pipeline stores these items in.
    collection = 'microBlogData'
    keyWord = scrapy.Field()                    # search keyword that found the post
    microBlogId = scrapy.Field()                # poster id (text of the a.nk link)
    contentInfo = scrapy.Field()                # post text, whitespace removed
    numOfSupport = scrapy.Field()               # like count of this post
    numOfComments = scrapy.Field()              # comment count of this post
    numOfForwarding = scrapy.Field()            # forward count of this post
    forwardingReason = scrapy.Field()           # reason text when the post is a forward
    blogDate = scrapy.Field()                   # 'YYYY-MM-DD' search date
    fromBlogId = scrapy.Field()                 # id of the forwarded (source) post's author
    originalBlogSupportNum = scrapy.Field()     # like count of the original post
    originalBlogCommentsNum = scrapy.Field()    # comment count of the original post
    originalBlogForwardingNum = scrapy.Field()  # forward count of the original post
    id = scrapy.Field()                         # sequential id assigned by the pipeline
pipelines.py裏這樣寫:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class BlogspiderPipeline(object):
    """Persist scraped items into a local MongoDB database, skipping
    posts that are already stored."""

    def __init__(self):
        # MongoDB connection parameters.
        self.mongoUrl = '127.0.0.1'
        self.mongoPort = 27017
        self.mongoDB = 'SpiderTest'

    def open_spider(self, spider):
        """Connect to MongoDB when the spider starts."""
        self.client = pymongo.MongoClient(self.mongoUrl, self.mongoPort)
        self.db = self.client[self.mongoDB]

    def process_item(self, item, spider):
        """Insert *item* unless an identical post is already stored.

        A post counts as a duplicate when poster id, content and date all
        match an existing document.
        """
        collection = self.db[item.collection]
        query = {
            'microBlogId': item['microBlogId'],
            'contentInfo': item['contentInfo'],
            'blogDate': item['blogDate'],
        }
        # Cursor.count() and Collection.count() were deprecated in pymongo
        # 3.7 and removed in pymongo 4; count_documents is the replacement.
        if collection.count_documents(query) == 0:
            # NOTE(review): count-based ids can collide after deletions or
            # concurrent inserts; MongoDB's own _id would be safer.
            item['id'] = collection.count_documents({}) + 1
            collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()
接下來,執行爬蟲就可以啦!