利用python3自動在36kr裏查找自己感興趣的內容

最近常常在36kr網站的快訊裏查看自己感興趣內容的即時信息。由於快訊裏的信息更新得比較及時快速,自己也很難一直盯着看,故想着要是寫個腳本讓其自動在後臺掛着,每隔5分鐘查詢一次,有的話就寫入txt檔中並在控制檯打印出來,這樣自己有空時就看一眼,感覺就會方便一些,就是玩玩。腳本如下,供參考:

 

#!/usr/bin/env python3
#-*- coding:utf-8 -*-

import os
import time
import unicodedata
from time import sleep
from urllib.parse import urljoin

import requests
from lxml import etree
   
'''
python           3.6.5
lxml             4.3.3
requests         2.21.0
windows10
'''

class Check36kr:
    """Watch the 36kr newsflash page for headlines containing given keywords.

    Each polling round downloads ``self.url``, extracts headline titles and
    links, keeps the headlines whose title contains one of ``self.keywords``
    and has not been reported before, appends them to a text file on the
    current user's Windows desktop, and prints the titles to the console.
    """

    # Display columns reserved for the title column of the output file; the
    # header written in __init__ ('Title' + 60 spaces) is 65 columns as well.
    TITLE_COLUMN_WIDTH = 65

    # Seconds to wait for the HTTP response, so that a stalled connection
    # cannot block the polling loop forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up state and create the output file with a header if missing."""
        self.hrefList       = list()
        self.titleList      = list()
        # New matches found in the current round: title -> absolute link.
        self.focusInfoDict  = dict()
        # Titles already reported, so that each headline is recorded once.
        self.oldinfoList    = list()
        # Keywords of interest; customise as needed.
        self.keywords       = ('微信', '微博', 'QQ', '騰訊', '阿里', '百度', '多閃', '視頻', '優酷', '愛奇藝', 'AI', '識別')

        # HTTP request settings.
        self.url = 'https://36kr.com/newsflashes'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3833.0 Safari/537.36'}

        # Create the output file with a header line on first use.
        # UTF-8 is requested explicitly: the locale's default code page may
        # be unable to encode some characters found in headlines.
        output_path = self._output_path()
        if not os.path.exists(output_path):
            with open(output_path, 'a', encoding='utf-8') as f:
                f.write('Title' + ' '*60 + 'Link \n')

    @staticmethod
    def _output_path():
        """Return the path of the output txt file on the user's desktop."""
        return "C:\\Users\\" + os.getlogin() + "\\Desktop\\新資訊收集.txt"

    @staticmethod
    def _display_width(text):
        """Return the number of display columns *text* is counted as.

        Characters whose East Asian Width is Wide ('W') or Fullwidth ('F')
        count as two columns, every other character as one.
        """
        return sum(2 if unicodedata.east_asian_width(ch) in ('W', 'F') else 1
                   for ch in text)

    @classmethod
    def _pad_title(cls, title, width=None):
        """Right-pad *title* with spaces up to *width* display columns.

        *width* defaults to ``TITLE_COLUMN_WIDTH``. Titles that are already
        at least that wide are returned unchanged.
        """
        if width is None:
            width = cls.TITLE_COLUMN_WIDTH
        return title + ' ' * max(0, width - cls._display_width(title))

    def getpagecontent(self):
        """Download the newsflash page and return ``{title: href}``.

        Clears the matches of the previous round first. Returns an empty
        dict when the request fails or the response cannot be parsed, so
        that a transient error does not terminate the polling loop.
        """
        self.focusInfoDict.clear()
        self.hrefList.clear()
        self.titleList.clear()

        try:
            self.r = requests.get(self.url, headers=self.headers,
                                  timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('Failed to fetch {}: {}'.format(self.url, exc))
            return {}

        # Nothing to parse in an empty body.
        if not self.r.text.strip():
            return {}
        self.html = etree.HTML(self.r.text)
        # Defensive: skip the round if the parser produced no document.
        if self.html is None:
            return {}

        self.titleList   = self.html.xpath('//div[1]/div[2]/div/div[1]/div/div[2]/div/a/text()')
        self.hrefList    = self.html.xpath('//div[1]/div[2]/div/div[1]/div/div[2]/div/a/@href')
        return dict(zip(self.titleList, self.hrefList))

    def _collect_new_matches(self, page, keywords):
        """Record not-yet-seen headlines of *page* that contain a keyword.

        *page* maps title -> href. Matching titles are appended to
        ``self.oldinfoList`` and stored in ``self.focusInfoDict`` with the
        href resolved against ``self.url``. Returns True when
        ``self.focusInfoDict`` is non-empty afterwards.
        """
        for title, href in page.items():
            if title in self.oldinfoList:
                continue
            if any(keyword in title for keyword in keywords):
                self.oldinfoList.append(title)
                # urljoin handles root-relative as well as absolute hrefs.
                self.focusInfoDict[title] = urljoin(self.url, href)
        return bool(self.focusInfoDict)

    def isExistsOfKeyword(self, keywords):
        """Fetch the page and return True if new matching headlines exist.

        The matches are left in ``self.focusInfoDict``.
        """
        return self._collect_new_matches(self.getpagecontent(), keywords)

    def printInfo(self):
        """Run one polling round: save and print every new matching headline."""
        if self.isExistsOfKeyword(self.keywords):
            for title, href in self.focusInfoDict.items():
                self.savedata(title, href)
                print(title)

    def savedata(self, title, href):
        """Append one ``title  link`` line to the output file.

        The title is padded to ``TITLE_COLUMN_WIDTH`` display columns so the
        link column lines up when the file is viewed in a monospaced font.
        """
        with open(self._output_path(), 'a', encoding='utf-8') as f:
            f.write(self._pad_title(title))
            f.write(href + '\n')

    def run(self, interval=300):
        """Poll forever, sleeping *interval* seconds between rounds."""
        while True:
            self.printInfo()
            sleep(interval)
                
if __name__ == "__main__":
    # Script entry point: build the watcher, then poll until interrupted.
    check36kr = Check36kr()
    check36kr.run()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章