# -*- coding: utf-8 -*-
from urllib import request
import urllib
from lxml import etree
import datetime
import time
import random
import os
import pandas as pd
def setSleep(interval=40):  # random pause between requests
    """Sleep for a random number of seconds between two URL requests.

    Sleeping between consecutive requests lowers the chance of the
    crawler being rate-limited or blocked (no login is simulated).

    Args:
        interval: Base interval in seconds (default 40, the original
            hard-coded value); actual sleep time is drawn uniformly
            from [interval - 30, interval - 10].

    Returns:
        0, kept for backward compatibility with existing callers.
    """
    sleep_seconds = random.randint(interval - 30, interval - 10)
    print('sleeping ' + str(sleep_seconds) + ' seconds...')
    time.sleep(sleep_seconds)
    return 0
def getTimescope(start_day, end_day, CMlist):
    """Walk hour-by-hour from start_day to end_day and crawl one keyword.

    For each one-hour window, getWeiboDate() fetches the matching posts
    and appends them to <rootPath>/<year>/<CMlist>.csv; the per-year
    directory is created on demand (including on year rollover).

    Args:
        start_day: Start time, 'YYYY-MM-DD HH:MM:SS'.
        end_day:   End time, same format.
        CMlist:    One search keyword from the keyword list.
    """
    nowTime = datetime.datetime.strptime(start_day, '%Y-%m-%d %H:%M:%S')
    endTime = datetime.datetime.strptime(end_day, '%Y-%m-%d %H:%M:%S')
    priorYear = str(start_day)[0:4]
    timePath = rootPath + priorYear  # one folder per year
    # exist_ok makes this idempotent; replaces the old bare try/except
    os.makedirs(timePath, exist_ok=True)
    fullPath = rootPath + priorYear + "/" + CMlist + ".csv"  # one csv per keyword
    # URL-encode the keyword (e.g. 中醫 -> %E4%B8%AD%E5%8C%BB); [3:] strips 'kw='
    onceCMlist = urllib.parse.urlencode({"kw": CMlist})[3:]
    print("改變類別編碼:" + onceCMlist)
    delta = datetime.timedelta(hours=1)  # window size: one hour
    # Compare parsed datetimes instead of formatted strings: the old
    # string inequality never terminated when end_day was not exactly an
    # hourly step away from start_day.
    while nowTime < endTime:
        priorTime = nowTime
        nowTime = nowTime + delta  # next hour
        # 'YYYY-MM-DD-HH' form expected by s.weibo.com's timescope parameter
        changePriorTime = str(priorTime).replace(' ', '-')[0:13]
        changeNowTime = str(nowTime).replace(' ', '-')[0:13]
        print(changePriorTime)
        print(changeNowTime)
        nowYear = str(changeNowTime)[0:4]
        if priorYear != nowYear:  # rolled over into a new year
            priorYear = nowYear
            timePath = rootPath + priorYear
            os.makedirs(timePath, exist_ok=True)
            fullPath = rootPath + priorYear + "/" + CMlist + ".csv"
        getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist)  # fetch this hour's posts
def chineseMedicineList():
    """Return the list of search keywords to crawl (Chinese-medicine terms)."""
    return ["中醫", "把脈"]  # "中藥" was deliberately left out
def getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist):
    """Fetch one hour of Weibo search results and append them to a CSV.

    Requests the s.weibo.com search page for the encoded keyword
    restricted to the custom timescope [changePriorTime, changeNowTime],
    extracts each post's id, author nickname, text and publish time via
    XPath, and appends one row per post to fullPath.  On any error the
    whole window is retried; after a successful window the crawler
    sleeps to avoid being rate-limited.

    Args:
        changePriorTime: Window start, 'YYYY-MM-DD-HH'.
        changeNowTime:   Window end, 'YYYY-MM-DD-HH'.
        fullPath:        Target CSV path (rows appended, utf-8-sig).
        onceCMlist:      URL-encoded search keyword.
    """
    print("----------------------------")
    # NOTE(review): the original literal contained '×cope=' (U+00D7), an
    # HTML-entity mangling of '&timescope' — the server never saw the time
    # window.  Fixed to the real query parameter.
    url = ("https://s.weibo.com/weibo?q=" + onceCMlist
           + "&timescope=custom:" + changePriorTime + ":" + changeNowTime + "&Refer=g")
    # Retry loop instead of the original unbounded recursion, which could
    # end in RecursionError on a persistently failing window.  'except
    # Exception' (not bare except) so Ctrl-C still stops the crawler.
    while True:
        try:
            res = request.urlopen(url, timeout=12)
            txt = res.read().decode('utf-8')
            page = etree.HTML(txt)
            dls = page.xpath("//div[@mid]")  # each result card carries a @mid attribute
            for dl in dls:
                mid = str(dl.attrib.get('mid'))  # post / comment id
                print("獲取編號:" + mid)
                base = ("//div[@mid=" + mid
                        + "]/div[@class='card']/div[@class='card-feed']/div[@class='content']")
                try:
                    # long posts keep the full text in a hidden <p> ("展開全文")
                    result = page.xpath(base + "/p[@style='display: none']")[0]
                except Exception:
                    # short posts only have the visible <p class='txt'>
                    result = page.xpath(base + "/p[@class='txt']")[0]
                evaluationText = result.xpath('string(.)')
                # strip the "collapse" marker, spaces and newlines from the text
                evaluationText = str(evaluationText).replace('收起全文d', '').replace(" ", "").replace("\n", "")
                print("這是評論內容:" + str(evaluationText))
                urlnickName = page.xpath(base + "/p[@nick-name]")  # author nickname
                nike_name = str(urlnickName[0].attrib.get('nick-name'))
                print("nike_name:" + nike_name)
                thisistime = page.xpath(base + "/p[@class='from']/a/text()")  # publish time
                istime = str(thisistime[0]).replace(" ", "").replace("\n", "")
                print("發表時間:" + istime)
                # append one row: id, nickname, text, publish time
                allinfomation = [(mid, nike_name, evaluationText, istime)]
                pd.DataFrame(allinfomation).to_csv(
                    fullPath, header=False, index=False, mode='a+', encoding='utf_8_sig')
            break  # window processed successfully
        except Exception:
            print("報錯了 報異常解決 再來一遍 ~~~哈哈哈哈哈哈")
            # fall through and retry the same window
    # pause before the next window to avoid being caught
    setSleep()
'''
Output layout for crawled content: rootPath + <crawl year>/ + <keyword>.csv
'''
rootPath = "E:/AApaper/weiboData/"  # root directory for all output

# Crawl the same fixed time window for every search keyword.
chineseMedicineLists = chineseMedicineList()  # keyword list to crawl
for CMlist in chineseMedicineLists:
    startime = "2015-05-09 10:00:00"  # crawl start time
    endtime = "2015-07-01 01:00:00"   # crawl end time
    getTimescope(startime, endtime, CMlist)
    print(CMlist)