本文的主要目標是獲取微博評論數據,具體包括微博評論鏈接、總評論數、用戶ID、用戶暱稱、評論時間、評論內容、用戶詳情鏈接等。
實現代碼如下所示:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 16:39:07 2017
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import pymysql.cursors
def _saveComments(rows):
    """Insert a batch of comment rows into the `comment` table.

    Each row is a 7-tuple in column order:
    (commentUrl, commentCounts, userId, userName, commentTime,
    commentText, userProfileUrl).

    One connection and one commit are used for the whole batch, so a
    failing row rolls back the batch instead of leaving a partial page.
    Nothing is opened when ``rows`` is empty.
    """
    if not rows:
        return
    # NOTE(review): credentials are hard-coded here, as in the original
    # script; consider loading them from configuration instead.
    connection = pymysql.connect(host = 'localhost',
                                 user = 'root',
                                 password = '123456',
                                 db = 'weibo',
                                 charset = 'utf8mb4')
    try:
        with connection.cursor() as cursor:
            # Parameterized statement: values are escaped by the driver.
            sql = "insert into `comment` (`commentUrl`,`commentCounts`,`userId`,`userName`,`commentTime`,`commentText`,`userProfileUrl`) values (%s,%s,%s,%s,%s,%s,%s)"
            cursor.executemany(sql, rows)
        # Changes are only persisted once committed.
        connection.commit()
    finally:
        connection.close()


def crawlDetailPage(url,page,i):
    """Fetch one comment-list JSON document, print it and store it in MySQL.

    Args:
        url: Comment API URL of a single post; also stored as ``commentUrl``.
        page: Timeline page number, used only in the progress message.
        i: Zero-based index of the post on that page; printed as ``i+1``.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
        ValueError: if the response body is not valid JSON.
        KeyError: if a comment entry lacks one of the expected fields.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    req = requests.get(url, timeout=10)
    req.raise_for_status()
    data = json.loads(req.text)

    # NOTE(review): a payload without 'total_number' / 'data' is treated as
    # "no comments" rather than aborting the crawl — confirm this matches
    # what the API returns for posts without comments.
    commentCounts = data.get('total_number', 0)
    print("第{}頁第{}條微博的評論數爲{}".format(page,i+1,commentCounts))

    rows = []
    for comment in data.get('data') or []:
        user = comment['user']
        userId = user['id']
        userName = user['screen_name']
        commentTime = comment['created_at']
        commentText = comment['text']
        userProfileUrl = user['profile_url']
        print("用戶{}創建於:{}".format(userName,commentTime))
        print("評論內容爲:{}".format(commentText))
        print("用戶詳情鏈接爲:{}".format(userProfileUrl))
        rows.append((url,commentCounts,userId,userName,commentTime,commentText,userProfileUrl))

    # Persist the whole page through a single connection instead of
    # reconnecting to MySQL once per comment.
    _saveComments(rows)
def crawl(url,page):
    """Fetch one timeline page and crawl the comments of its posts.

    Only the cards at positions 2 through 10 of the page's ``cards`` list
    are visited, as in the original script. Positions beyond the end of a
    short page and cards without an ``mblog`` entry are skipped instead of
    raising.

    Args:
        url: Timeline JSON URL for one page.
        page: Page number, passed through to ``crawlDetailPage``.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
        ValueError: if the response body is not valid JSON.
        KeyError: if the payload has no ``cards`` entry.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    req = requests.get(url, timeout=10)
    req.raise_for_status()
    data = json.loads(req.text)

    content = data['cards']

    # Slicing never raises on a short list; enumerate keeps the original
    # card index so the progress output is unchanged.
    for i, card in enumerate(content[2:11], start=2):
        mblog = card.get('mblog')
        if not mblog:
            # Not a post card, so there are no comments to fetch.
            continue
        commentUrl = "https://m.weibo.cn/api/comments/show?id=" + str(mblog['id'])
        crawlDetailPage(commentUrl,page,i)
        # Randomized pause between consecutive comment requests.
        t = random.randint(11,13)
        print("休眠時間爲:{}s".format(t))
        time.sleep(t)
# Script entry: walk the timeline pages of the Zhihu official Weibo account.
# range(1, 2) covers page 1 only.
for pageNo in range(1, 2):
    print("正在獲取第{}頁微博數據:".format(pageNo))
    # Timeline JSON endpoint; the page number is appended as the last
    # query parameter.
    pageUrl = "https://m.weibo.cn/api/container/getIndex?uid=1939498534&type=uid&value=1939498534&containerid=1076031939498534&page=" + str(pageNo)
    crawl(pageUrl, pageNo)
    # Randomized pause before the next page request.
    pauseSeconds = random.randint(31,33)
    time.sleep(pauseSeconds)
部分運行結果如下圖所示:
MySQL數據庫中的數據存儲如下圖所示: