python 多線程代理刷csdn瀏覽量

環境:python3.7

首先拉取自己的所有文章鏈接

# coding: UTF-8
from bs4 import BeautifulSoup
import urllib.request as urlrequest

# NOTE(review): opened at import time and only closed in the __main__ block;
# raises FileNotFoundError if csdnlink.txt does not exist yet — confirm intent.
inFile = open('csdnlink.txt')


def download(url):
    """Fetch *url* and return the raw response body as bytes.

    Returns None when *url* is None or the server answers with a
    non-200 status code.
    """
    if url is None:
        print("鏈接爲空!")
        return None
    response = urlrequest.urlopen(url)
    try:
        if response.getcode() != 200:
            print("訪問失敗!")
            return None
        return response.read()
    finally:
        # original leaked the HTTP response/socket; always release it
        response.close()


class Spider(object):
    """Collects article URLs from a CSDN blog's paginated article list
    and appends them to csdnlink.txt."""

    def __init__(self):
        self.pages = []  # list-page URLs to visit
        self.datas = []
        self.root = "https://blog.csdn.net/qq_40548741"  # replace with your own CSDN blog root

    def claw(self, startpage, endpage):
        """Crawl list pages *startpage* through *endpage* (inclusive)."""
        for i in range(startpage, endpage + 1):
            self.pages.append(self.root + "/article/list/%d?" % i)
        for url in self.pages:
            self.getDatas(url)

    def getDatas(self, url):
        """Parse one list page and append every article link to csdnlink.txt."""
        html_cont = download(url)
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='UTF-8')
        articles = soup.find_all('div', class_='article-item-box csdn-tracking-statistics')
        # context manager so the output file is always closed
        # (the original opened it and never closed the handle)
        with open('csdnlink.txt', 'a') as csdn_file:
            for article in articles:
                link = article.find('h4').find('a')['href']
                csdn_file.write(link + "\n")
        print('共%d' % len(articles) + '篇文章,鏈接寫入完畢')


if __name__ == "__main__":
    # truncate the output file first so repeated runs do not duplicate links;
    # the original left the 'w'-mode handle open
    with open('csdnlink.txt', 'w'):
        pass
    spider = Spider()
    spider.claw(1, 1)  # page range; raise endpage if the blog has more list pages
    inFile.close()

拉取完後
在這裏插入圖片描述
再把ip代理也按上面格式放入到一個ip.txt文件中,這裏就不放入代理ip了
最後放上刷訪問量代碼

# coding=gbk
import time
import threading
import re
import requests

proxy_list = []  # proxy addresses ("ip:port") loaded from ip.txt
link_list = []   # article URLs loaded from csdnlink.txt


# 讀取代理ip列表
# Load the proxy IP list
def get_proxy_list():
    """Load proxy addresses (one "ip:port" per line) from ip.txt into proxy_list.

    Blank lines are skipped instead of terminating the read — the original
    readline loop stopped at the first empty line and silently dropped any
    proxies after it.  Returns the loaded list for convenience.
    """
    global proxy_list
    with open("ip.txt") as f:
        proxy_list = [line.strip() for line in f if line.strip()]
    return proxy_list


# 讀取文章列表
# Load the article link list
def get_link_list():
    """Load article URLs (one per line) from csdnlink.txt into link_list.

    Blank lines are skipped instead of terminating the read — the original
    readline loop stopped at the first empty line.  Returns the loaded list.
    """
    global link_list
    with open("csdnlink.txt") as f:
        link_list = [line.strip() for line in f if line.strip()]
    return link_list


# Compiled once, outside the request loop; raw string avoids the
# DeprecationWarning the original's bare '\d' escape triggers
_READ_COUNT_RE = re.compile(r'<span.*?read-count.*?(\d+).*?</span>')


def process_data(ip):
    """Request every URL in link_list through the proxy *ip* ("ip:port").

    Prints the read count each page reports.  A failure on one URL (proxy
    error, timeout, or a page without the read-count span) is logged or
    skipped and the remaining URLs are still attempted — the original
    crashed the thread with AttributeError when the regex did not match.
    """
    headers = {
        'Referer': 'https://blog.csdn.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    # requests expects one proxy URL per scheme
    proxy = {'https': 'https://' + ip, 'http': 'http://' + ip}
    for url in link_list:
        try:
            response = requests.get(url, headers=headers, proxies=proxy, timeout=15)
            match = _READ_COUNT_RE.search(response.text)
            if match is None:
                # page layout changed or request was blocked; skip this URL
                continue
            read_num = int(match.group(1))
            if read_num:
                print(ip + "-------------代理----->" + url + '當前閱讀量:', read_num)
        except requests.exceptions.RequestException:
            print('代理出問題啦:' + ip)
    time.sleep(1)


def start():
    """Endlessly fan out one worker thread per proxy, wait for the batch
    to finish, then rest 60 seconds before the next round."""
    while True:
        workers = [
            threading.Thread(target=process_data, args=(proxy,))
            for proxy in proxy_list
        ]
        for worker in workers:
            worker.start()
        # block until every worker of this round has finished
        for worker in workers:
            worker.join()
        print("執行完畢,休眠60秒")
        time.sleep(60)


if __name__ == '__main__':
    get_proxy_list()  # load the proxy IP list from ip.txt
    get_link_list()  # load the article URL list from csdnlink.txt
    start()  # loop forever, hitting every article through every proxy

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章