python 多線程代理刷csdn瀏覽量

環境:python3.7

首先拉取自己的所有文章鏈接

# coding: UTF-8
from bs4 import BeautifulSoup
import urllib.request as urlrequest

# NOTE(review): opened at import time and only closed in the __main__ block;
# raises FileNotFoundError if csdnlink.txt does not exist yet — confirm intent.
inFile = open('csdnlink.txt')


def download(url):
    """Fetch *url* and return the raw response body as bytes.

    Returns None when *url* is None or the server answers with a
    non-200 status code.
    """
    if url is None:
        print("鏈接爲空!")
        return None
    response = urlrequest.urlopen(url)
    try:
        if response.getcode() != 200:
            print("訪問失敗!")
            return None
        return response.read()
    finally:
        # original leaked the HTTP response/socket; always release it
        response.close()


class Spider(object):
    """Collects article URLs from a CSDN blog's paginated article list
    and appends them to csdnlink.txt."""

    def __init__(self):
        self.pages = []  # list-page URLs to visit
        self.datas = []
        self.root = "https://blog.csdn.net/qq_40548741"  # replace with your own CSDN blog root

    def claw(self, startpage, endpage):
        """Crawl list pages *startpage* through *endpage* (inclusive)."""
        for i in range(startpage, endpage + 1):
            self.pages.append(self.root + "/article/list/%d?" % i)
        for url in self.pages:
            self.getDatas(url)

    def getDatas(self, url):
        """Parse one list page and append every article link to csdnlink.txt."""
        html_cont = download(url)
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='UTF-8')
        articles = soup.find_all('div', class_='article-item-box csdn-tracking-statistics')
        # context manager so the output file is always closed
        # (the original opened it and never closed the handle)
        with open('csdnlink.txt', 'a') as csdn_file:
            for article in articles:
                link = article.find('h4').find('a')['href']
                csdn_file.write(link + "\n")
        print('共%d' % len(articles) + '篇文章,鏈接寫入完畢')


if __name__ == "__main__":
    # truncate the output file first so repeated runs do not duplicate links;
    # the original left the 'w'-mode handle open
    with open('csdnlink.txt', 'w'):
        pass
    spider = Spider()
    spider.claw(1, 1)  # page range; raise endpage if the blog has more list pages
    inFile.close()

拉取完後
在這裏插入圖片描述
再把ip代理也按上面格式放入到一個ip.txt文件中,這裏就不放入代理ip了
最後放上刷訪問量代碼

# coding=gbk
import time
import threading
import re
import requests

proxy_list = []  # proxy addresses ("ip:port") loaded from ip.txt
link_list = []   # article URLs loaded from csdnlink.txt


# 讀取代理ip列表
# Load the proxy IP list
def get_proxy_list():
    """Load proxy addresses (one "ip:port" per line) from ip.txt into proxy_list.

    Blank lines are skipped instead of terminating the read — the original
    readline loop stopped at the first empty line and silently dropped any
    proxies after it.  Returns the loaded list for convenience.
    """
    global proxy_list
    with open("ip.txt") as f:
        proxy_list = [line.strip() for line in f if line.strip()]
    return proxy_list


# 讀取文章列表
# Load the article link list
def get_link_list():
    """Load article URLs (one per line) from csdnlink.txt into link_list.

    Blank lines are skipped instead of terminating the read — the original
    readline loop stopped at the first empty line.  Returns the loaded list.
    """
    global link_list
    with open("csdnlink.txt") as f:
        link_list = [line.strip() for line in f if line.strip()]
    return link_list


# Compiled once, outside the request loop; raw string avoids the
# DeprecationWarning the original's bare '\d' escape triggers
_READ_COUNT_RE = re.compile(r'<span.*?read-count.*?(\d+).*?</span>')


def process_data(ip):
    """Request every URL in link_list through the proxy *ip* ("ip:port").

    Prints the read count each page reports.  A failure on one URL (proxy
    error, timeout, or a page without the read-count span) is logged or
    skipped and the remaining URLs are still attempted — the original
    crashed the thread with AttributeError when the regex did not match.
    """
    headers = {
        'Referer': 'https://blog.csdn.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    # requests expects one proxy URL per scheme
    proxy = {'https': 'https://' + ip, 'http': 'http://' + ip}
    for url in link_list:
        try:
            response = requests.get(url, headers=headers, proxies=proxy, timeout=15)
            match = _READ_COUNT_RE.search(response.text)
            if match is None:
                # page layout changed or request was blocked; skip this URL
                continue
            read_num = int(match.group(1))
            if read_num:
                print(ip + "-------------代理----->" + url + '當前閱讀量:', read_num)
        except requests.exceptions.RequestException:
            print('代理出問題啦:' + ip)
    time.sleep(1)


def start():
    """Endlessly fan out one worker thread per proxy, wait for the batch
    to finish, then rest 60 seconds before the next round."""
    while True:
        workers = [
            threading.Thread(target=process_data, args=(proxy,))
            for proxy in proxy_list
        ]
        for worker in workers:
            worker.start()
        # block until every worker of this round has finished
        for worker in workers:
            worker.join()
        print("執行完畢,休眠60秒")
        time.sleep(60)


if __name__ == '__main__':
    get_proxy_list()  # load the proxy IP list from ip.txt
    get_link_list()  # load the article URL list from csdnlink.txt
    start()  # loop forever, hitting every article through every proxy

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章