import scrapy
import json
import re
import datetime
import time
from w3lib.html import remove_tags
import math
from my_project.items import WeiboItem
class WeiboSpider(scrapy.Spider):
    """Crawl a Weibo user's timeline through the m.weibo.cn container API.

    Starting from one seed uid, ``parse`` yields a ``WeiboItem`` per post and
    schedules every further timeline page, then hands the "followed users"
    container to ``parse_info``, which discovers new uids and feeds them back
    into ``parse`` — a breadth-first crawl across the follow graph.
    """

    name = 'weibo'
    allowed_domains = ['weibo.cn']
    start_urls = ['https://m.weibo.cn/api/container/getIndex?type=uid&value=1793285524&containerid=1076031793285524']

    def parse(self, response):
        """Parse one timeline page.

        Yields one ``WeiboItem`` per post card, then schedules the remaining
        timeline pages and the followed-users listing for this author.
        """
        res_dict = json.loads(response.text)
        uid = None  # author uid of the last successfully parsed card; needed after the loop
        # Walk every card on the page and extract the post fields.
        for data in res_dict['data']['cards']:
            try:
                mblog = data['mblog']
                screen_name = mblog['user']['screen_name']
                created_at = mblog['created_at']
                # Normalise relative timestamps like "3小時前" ("3 hours ago")
                # to a plain YYYY-MM-DD date so output is uniform.
                if '小時' in created_at:
                    hours_ago = int(re.match(r'\d+', created_at)[0])
                    now = datetime.datetime.now()
                    if hours_ago > now.hour:
                        # Posted more hours ago than have elapsed today,
                        # so the post date is yesterday.
                        created_at = (now + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
                    else:
                        created_at = now.strftime("%Y-%m-%d")
                uid = mblog['user']['id']
                item = WeiboItem()
                item['name'] = screen_name
                item['time'] = created_at
                item['content'] = mblog['text']
                item['passnum'] = mblog['reposts_count']
                item['comments_count'] = mblog['comments_count']
                item['attitudes_count'] = mblog['attitudes_count']
                yield item
            except (KeyError, TypeError, ValueError) as e:
                # Some cards are ads/placeholders without an 'mblog' payload.
                self.logger.warning('skipping card: %s', e)

        if uid is None:
            # No card parsed successfully: there is no uid to fan out from.
            return

        # Total post count -> number of 10-post timeline pages.
        try:
            count_num = res_dict['data']['cardlistInfo']['total']
            res_num = math.ceil(count_num / 10)
        except (KeyError, TypeError) as e:
            # Original code left res_num unbound here (NameError below);
            # skip paging instead when the count is missing.
            self.logger.warning('missing post count: %s', e)
            res_num = 0

        # Schedule every timeline page of this author.
        for i in range(1, res_num + 1):
            page_url = ('https://m.weibo.cn/api/container/getIndex?type=uid&value='
                        + str(uid) + '&containerid=107603' + str(uid) + '&page=' + str(i))
            yield scrapy.Request(url=page_url, callback=self.parse)

        # Followed-users container for this author; carry the uid along in meta.
        uid_url = ('https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_'
                   + str(uid) + '_-_1042015:tagCategory_050&luicode=10000011&lfid=107603'
                   + str(uid) + '&type=uid&value=' + str(uid))
        yield scrapy.Request(url=uid_url, callback=self.parse_info, meta={'gid': uid})

    def parse_info(self, response):
        """Parse a followed-users listing page.

        Schedules the remaining listing pages for the same user, then a
        timeline request (``parse``) for every followed uid found on this page.
        """
        res_dict = json.loads(response.text)
        # Total follow count -> number of 20-entry listing pages.
        try:
            res_num = math.ceil(res_dict['data']['cardlistInfo']['total'] / 20)
        except (KeyError, TypeError):
            # When the total is absent, fall back to a fixed page count;
            # a handful of pages is enough since discovery is recursive.
            res_num = 5

        uid = response.meta['gid']  # uid carried over from parse()

        # Schedule the remaining listing pages (page 1 is this response).
        for i in range(2, res_num + 1):
            bs_url = ('https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_'
                      + str(uid) + '_-_1042015:tagCategory_050&luicode=10000011&lfid=107603'
                      + str(uid) + '&type=uid&value=' + str(uid) + '&page=' + str(i))
            yield scrapy.Request(url=bs_url, callback=self.parse_info, meta={'gid': uid})

        # The followed-user cards usually sit in cards[3]; some layouts put
        # them in cards[0].
        try:
            card_group = res_dict['data']['cards'][3]['card_group']
        except (KeyError, IndexError):
            card_group = res_dict['data']['cards'][0]['card_group']

        for data in card_group:
            try:
                followed_uid = data['buttons'][0]['actionlog']['oid']
            except (KeyError, IndexError, TypeError) as e:
                # Original code fell through and re-requested the stale uid
                # here, producing duplicate requests; skip the card instead.
                self.logger.warning('no oid in follow card: %s', e)
                continue
            # Hand each discovered uid back to parse() to crawl its timeline.
            uid_url = ('https://m.weibo.cn/api/container/getIndex?type=uid&value='
                       + str(followed_uid) + '&containerid=107603' + str(followed_uid))
            yield scrapy.Request(url=uid_url, callback=self.parse)