Writing a small Python crawler to monitor someone's CSDN article stats

I wrote a simple little crawler with python / pandas + mysql to scrape the basic stats of my own CSDN articles. The Python code is below. If you don't want to use MySQL, swapping the database read/write parts for pandas read_csv / to_csv works just as well.
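A rough sketch of that CSV-only variant, reusing the article_df frame that read_a_page builds in the script below (the csv_path name here is made up for illustration):

import os.path

csv_path = 'D:/crawler/output_csdn/csdn_my_article.csv'  # hypothetical output file

def save_page_to_csv(article_df):
    # append this page's rows; write the header only if the file does not exist yet
    article_df.to_csv(csv_path, mode='a', index=False, encoding='utf_8_sig',
                      header=not os.path.exists(csv_path))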

If you want to monitor some expert, or a whole group of CSDN ids, the same approach works; it just needs a little extra code on top of this script.
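A minimal sketch of that, calling the read_all_pages function defined in the script below (the id list is made up for illustration):

# crawl a small group of CSDN ids instead of a single one
csdn_ids = ['qiaoanlu', 'another_csdn_id']  # hypothetical list of ids to monitor
for cid in csdn_ids:
    read_all_pages(pages=5, csdn_id=cid)  # rows for every id end up in the same table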

If you want to keep collecting data on a schedule, the operating system's built-in task scheduler can handle it (see: how to set up a scheduled task on Windows 10 to run a Python script automatically).

Once a decent amount of data has accumulated, you can use pandas to do further statistical analysis, depending on what you set out to track in the first place.
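For example, here is a minimal sketch of the kind of follow-up analysis I mean, assuming the daily CSV files that output_file below writes (column names follow the script's own schema):

import glob
import pandas as pd

# stack all the daily snapshot files into one frame
files = glob.glob('D:/crawler/output_csdn/csdn_my_article*.csv')
history = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
history['update_time'] = pd.to_datetime(history['update_time'])

# total reads per snapshot day, to see how the numbers grow over time
daily_reads = (history.assign(date=history['update_time'].dt.date)
                      .groupby('date')['read_count'].sum())
print(daily_reads)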


import datetime
import re
import urllib.request
import pandas as pd
import random
import pymysql
import os.path

from sqlalchemy import create_engine
conn = create_engine('mysql+pymysql://root:password@localhost:3306/zhihuclawer',encoding='utf8')  

csdn_path = 'D:/crawler/output_csdn/'
wf_log = open(csdn_path + 'log_csdn_my_article.txt', 'at')

# fetch a page and return the HTML as a str with newlines, tabs and spaces removed, ready for the regex matching below
def read_url(url):
    # pool of request headers; one is picked at random, which helps a little against anti-crawling measures
    headers_list = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.112 Safari/537.38"),
    ]
    opener = urllib.request.build_opener()
    # attach one randomly chosen header to the opener
    opener.addheaders = [random.choice(headers_list)]
    data = opener.open(url).read()
    html_text = str(data,encoding='utf-8')
    html_text = clean_str(html_text)
    return html_text

def clean_str(str_obj):
    str_obj = str_obj.replace('\n','')
    str_obj = str_obj.replace('\t','')
    str_obj = str_obj.replace(' ','')
    str_obj = str_obj.replace(' ','')
    str_obj = str_obj.replace('\r','')
    return str_obj

def read_a_page(url):
    html_text = read_url(url)
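    # regex over the whitespace-stripped article-list HTML (clean_str removed all spaces); brittle if CSDN changes its markup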
    attn = r'<h4class=""><ahref="(.*?)"target="_blank"><spanclass="article-typetype-(.*?)float-none">(.*?)</span>(.*?)</a></h4><pclass="content"><ahref="(.*?)"target="_blank">(.*?)</a></p><divclass="info-boxd-flexalign-content-center"><p><spanclass="date">(.*?)</span></p><pclass="point"></p><p><spanclass="read-num">閱讀數<spanclass="num">(.*?)</span></span></p><pclass="point"></p><p><spanclass="read-num">評論數<spanclass="num">(.*?)</span></span></p></div></div>'
    result = re.findall(attn, html_text, re.S)
    article_df = None  # stays None when the page yields no matches
    if result:
        article_df = pd.DataFrame(result, columns=['articlt_url','articlt_itype','articlt_type','articlt_title','articlt_url_2','articlt_desct','press_time','read_count','reply_count'])
        article_df['update_time'] = str(datetime.datetime.now())
        # append this page's rows to the MySQL table
        article_df.to_sql("csdn_my_article", conn, if_exists='append', index=False)
        print(datetime.datetime.now(), url, 'is done.')
        wf_log.write(str(datetime.datetime.now()) + ' ' + str(url) + ' is done.\n')
    return article_df

def read_all_pages(pages=5,csdn_id='qiaoanlu'):
    print(datetime.datetime.now(),'read_all_pages ...')
    wf_log.write(str(datetime.datetime.now()) + 'read_all_pages ... args:' + str(pages) + ', ' + csdn_id+'\n')
    for i in range(1,pages+1):
        url = 'https://blog.csdn.net/' + csdn_id + '/article/list/' + str(i) + '?'
        try:
            read_a_page(url)
        except Exception as err:
            print(datetime.datetime.now(), 'some error happened on page', i, err)
            wf_log.write(str(datetime.datetime.now()) + ' page ' + str(i) + ' some error happened\n')
            continue
    print(datetime.datetime.now(),'read_all_pages is done.')
    wf_log.write(str(datetime.datetime.now()) +'read_all_pages is done.\n')

def output_file():
    # pull everything back out of MySQL with a plain pymysql connection
    conn = pymysql.connect(host='localhost', user='root', password='789351', database='zhihuclawer')
    sql_search = 'select * from csdn_my_article;'
    csdn_my_article = pd.read_sql(sql_search, conn)
    conn.close()

    rlt_url = csdn_path + 'csdn_my_article'+str(datetime.datetime.now())[:10]+'.csv'
    csdn_my_article.to_csv(rlt_url, encoding='utf_8_sig', index=False)

    print('file generated:', rlt_url)
    wf_log.write(str(datetime.datetime.now()) + ' output_file is done.' + '\n')
    return csdn_my_article

def main():
    read_all_pages(pages=5,csdn_id='qiaoanlu')
    output_file()

if __name__ == "__main__":
    main()

If any experienced folks happen to pass by, I'd appreciate any pointers on how this little crawler script could be improved. Many thanks.
