# -*- coding: utf-8 -*-
from urllib import request
import urllib
from lxml import etree
import datetime
import time
import random
import os
import pandas as pd
def setSleep(interval=40):  # random pause between requests
    """Sleep for a random number of seconds between two URL requests.

    Sleeping between consecutive requests lowers the chance of the
    crawler being rate-limited or blocked (no login is simulated).

    Args:
        interval: Base interval in seconds (default 40, the original
            hard-coded value); actual sleep time is drawn uniformly
            from [interval - 30, interval - 10].

    Returns:
        0, kept for backward compatibility with existing callers.
    """
    sleep_seconds = random.randint(interval - 30, interval - 10)
    print('sleeping ' + str(sleep_seconds) + ' seconds...')
    time.sleep(sleep_seconds)
    return 0
def getTimescope(start_day, end_day, CMlist):
    """Walk hour-by-hour from start_day to end_day and crawl one keyword.

    For each one-hour window, getWeiboDate() fetches the matching posts
    and appends them to <rootPath>/<year>/<CMlist>.csv; the per-year
    directory is created on demand (including on year rollover).

    Args:
        start_day: Start time, 'YYYY-MM-DD HH:MM:SS'.
        end_day:   End time, same format.
        CMlist:    One search keyword from the keyword list.
    """
    nowTime = datetime.datetime.strptime(start_day, '%Y-%m-%d %H:%M:%S')
    endTime = datetime.datetime.strptime(end_day, '%Y-%m-%d %H:%M:%S')
    priorYear = str(start_day)[0:4]
    timePath = rootPath + priorYear  # one folder per year
    # exist_ok makes this idempotent; replaces the old bare try/except
    os.makedirs(timePath, exist_ok=True)
    fullPath = rootPath + priorYear + "/" + CMlist + ".csv"  # one csv per keyword
    # URL-encode the keyword (e.g. 中醫 -> %E4%B8%AD%E5%8C%BB); [3:] strips 'kw='
    onceCMlist = urllib.parse.urlencode({"kw": CMlist})[3:]
    print("改變類別編碼:" + onceCMlist)
    delta = datetime.timedelta(hours=1)  # window size: one hour
    # Compare parsed datetimes instead of formatted strings: the old
    # string inequality never terminated when end_day was not exactly an
    # hourly step away from start_day.
    while nowTime < endTime:
        priorTime = nowTime
        nowTime = nowTime + delta  # next hour
        # 'YYYY-MM-DD-HH' form expected by s.weibo.com's timescope parameter
        changePriorTime = str(priorTime).replace(' ', '-')[0:13]
        changeNowTime = str(nowTime).replace(' ', '-')[0:13]
        print(changePriorTime)
        print(changeNowTime)
        nowYear = str(changeNowTime)[0:4]
        if priorYear != nowYear:  # rolled over into a new year
            priorYear = nowYear
            timePath = rootPath + priorYear
            os.makedirs(timePath, exist_ok=True)
            fullPath = rootPath + priorYear + "/" + CMlist + ".csv"
        getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist)  # fetch this hour's posts
def chineseMedicineList():
    """Return the list of search keywords to crawl (Chinese-medicine terms)."""
    return ["中醫", "把脈"]  # "中藥" was deliberately left out
def getWeiboDate(changePriorTime, changeNowTime, fullPath, onceCMlist):
    """Fetch one hour of Weibo search results and append them to a CSV.

    Requests the s.weibo.com search page for the encoded keyword
    restricted to the custom timescope [changePriorTime, changeNowTime],
    extracts each post's id, author nickname, text and publish time via
    XPath, and appends one row per post to fullPath.  On any error the
    whole window is retried; after a successful window the crawler
    sleeps to avoid being rate-limited.

    Args:
        changePriorTime: Window start, 'YYYY-MM-DD-HH'.
        changeNowTime:   Window end, 'YYYY-MM-DD-HH'.
        fullPath:        Target CSV path (rows appended, utf-8-sig).
        onceCMlist:      URL-encoded search keyword.
    """
    print("----------------------------")
    # NOTE(review): the original literal contained '×cope=' (U+00D7), an
    # HTML-entity mangling of '&timescope' — the server never saw the time
    # window.  Fixed to the real query parameter.
    url = ("https://s.weibo.com/weibo?q=" + onceCMlist
           + "&timescope=custom:" + changePriorTime + ":" + changeNowTime + "&Refer=g")
    # Retry loop instead of the original unbounded recursion, which could
    # end in RecursionError on a persistently failing window.  'except
    # Exception' (not bare except) so Ctrl-C still stops the crawler.
    while True:
        try:
            res = request.urlopen(url, timeout=12)
            txt = res.read().decode('utf-8')
            page = etree.HTML(txt)
            dls = page.xpath("//div[@mid]")  # each result card carries a @mid attribute
            for dl in dls:
                mid = str(dl.attrib.get('mid'))  # post / comment id
                print("獲取編號:" + mid)
                base = ("//div[@mid=" + mid
                        + "]/div[@class='card']/div[@class='card-feed']/div[@class='content']")
                try:
                    # long posts keep the full text in a hidden <p> ("展開全文")
                    result = page.xpath(base + "/p[@style='display: none']")[0]
                except Exception:
                    # short posts only have the visible <p class='txt'>
                    result = page.xpath(base + "/p[@class='txt']")[0]
                evaluationText = result.xpath('string(.)')
                # strip the "collapse" marker, spaces and newlines from the text
                evaluationText = str(evaluationText).replace('收起全文d', '').replace(" ", "").replace("\n", "")
                print("這是評論內容:" + str(evaluationText))
                urlnickName = page.xpath(base + "/p[@nick-name]")  # author nickname
                nike_name = str(urlnickName[0].attrib.get('nick-name'))
                print("nike_name:" + nike_name)
                thisistime = page.xpath(base + "/p[@class='from']/a/text()")  # publish time
                istime = str(thisistime[0]).replace(" ", "").replace("\n", "")
                print("發表時間:" + istime)
                # append one row: id, nickname, text, publish time
                allinfomation = [(mid, nike_name, evaluationText, istime)]
                pd.DataFrame(allinfomation).to_csv(
                    fullPath, header=False, index=False, mode='a+', encoding='utf_8_sig')
            break  # window processed successfully
        except Exception:
            print("報錯了 報異常解決 再來一遍 ~~~哈哈哈哈哈哈")
            # fall through and retry the same window
    # pause before the next window to avoid being caught
    setSleep()
'''
Output layout for crawled content: rootPath + <crawl year>/ + <keyword>.csv
'''
rootPath = "E:/AApaper/weiboData/"  # root directory for all output

# Crawl the same fixed time window for every search keyword.
chineseMedicineLists = chineseMedicineList()  # keyword list to crawl
for CMlist in chineseMedicineLists:
    startime = "2015-05-09 10:00:00"  # crawl start time
    endtime = "2015-07-01 01:00:00"   # crawl end time
    getTimescope(startime, endtime, CMlist)
    print(CMlist)