多進程+多線程快速爬取西刺代理建立自己的ip池

  使用mysql來存最後的ip池。這個版本是在已經使用單線程的方式爬取過一定數量的ip之後,爲了加快速度,利用已有的代理ip使用多進程+多線程快速爬取自己的ip池。因爲西刺代理可分爲4個板塊,所以每一個板塊一個進程,每一個進程裏面再開多個線程

數據庫3個字段  id  ip_address port 

可以繼續優化的點:可以再加上一個字段,統計這個代理ip有多少次不能用,要是達到一定次數就直接刪掉這個ip,提高效率

from multiprocessing.context import DefaultContext
from multiprocessing.process import BaseProcess
from typing import Optional, Callable, Any, Tuple, Mapping

import pyspider
import  bs4
import requests
import pymysql
import random
import queue
import traceback
import multiprocessing
#裏面的Process是大寫的P
from concurrent.futures import ThreadPoolExecutor
import time



# Site sections to crawl; one process is spawned per entry, with multiple
# threads inside each process. Keys are human-readable labels only.
urldic = {
    "國內高匿": "https://www.xicidaili.com/nn",
    "國內透明": "https://www.xicidaili.com/nt",
    "https代理": "https://www.xicidaili.com/wn",
    "http代理": "https://www.xicidaili.com/wt",
}

# Probe `url` through the candidate proxy to decide whether ip:port is usable.
def validateIp(url, ip, port):
    """Return True iff `url` answers with HTTP 200 through the proxy ip:port.

    Any failure (timeout, connection error, bad TLS, ...) is treated as
    "proxy unusable" and yields False.
    """
    try:
        endpoint = ip + ":" + str(port)
        proxy = {"https": "https://" + endpoint, "http": "http://" + endpoint}
        response = requests.get(url, proxies=proxy, verify=False, timeout=8,
                                headers=getheaders())
    except Exception as e:
        # A failing request only marks this proxy as bad, not a program error.
        print(e)
        return False
    return response.status_code == requests.codes.ok

# Requests must carry a browser-like User-Agent header.
def getheaders():
    """Return a headers dict with one randomly chosen desktop User-Agent.

    BUGFIX: the original list was missing a comma after the first entry, so
    the first two agent strings were silently concatenated into one invalid
    User-Agent value.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    return {'User-Agent': random.choice(user_agent_list)}

# Parse one listing page and push every working proxy onto the database queue.
def getOnepageIp(response, databasequeue):
    """Extract ip/port pairs from one xicidaili page and enqueue usable ones.

    :param response: requests.Response holding a proxy-list page
    :param databasequeue: queue receiving {"host": ip, "port": port} dicts
    """
    bs = bs4.BeautifulSoup(response.text, "lxml")
    rows = bs.select("table")[0].select("tr")
    rows.pop(0)  # drop the table header row
    for row in rows:
        cells = row.select("td")
        ip = cells[1].text
        print(ip)
        # BUGFIX: int() instead of eval() — the cell is untrusted page text.
        port = int(cells[2].text)
        print(port)
        if validateIp("https://www.cnblogs.com/selol/p/5446965.html", ip, port):
            databasequeue.put({"host": ip, "port": port})



# Crawl every page of one section, fanning per-page work out to a thread pool.
def getIp(url, ipfromdatabaselist, databasequeue):
    """Discover the section's page count, then scrape each page in a thread.

    A proxy from the database pool is validated first and used to fetch the
    section front page; each page number is then handed to `actions` via a
    thread pool, where every thread picks its own proxy.

    :param url: section entry url (e.g. https://www.xicidaili.com/nn)
    :param ipfromdatabaselist: list of {"host":..., "port":...} proxy dicts
    :param databasequeue: queue that collects the newly found proxies
    """
    # Keep drawing random proxies from the pool until one validates.
    ipAndPortdic = random.choice(ipfromdatabaselist)
    userful = validateIp('https://blog.csdn.net/qq_40625030/article/details/79722996',
                         str(ipAndPortdic["host"]), ipAndPortdic["port"])
    while not userful:
        ipAndPortdic = random.choice(ipfromdatabaselist)
        userful = validateIp('https://blog.csdn.net/qq_40625030/article/details/79722996',
                             str(ipAndPortdic["host"]), ipAndPortdic["port"])

    host = str(ipAndPortdic["host"])
    port = str(ipAndPortdic["port"])
    proxy = {"https": "https://" + host + ":" + port,
             "http": "http://" + host + ":" + port}
    try:
        response = requests.get(url, proxies=proxy, verify=False, timeout=8,
                                headers=getheaders())
    except Exception as e:
        # BUGFIX: the original fell through and used an unbound `response`.
        print(e)
        return
    bs = bs4.BeautifulSoup(response.text, "lxml")
    # bs4 css ">" selector needs spaces on both sides
    pagebar = bs.select("div.pagination > a")
    # the second-to-last pagination link holds the total page count
    allpagenum = int(pagebar[len(pagebar) - 2].text)
    threadPool = ThreadPoolExecutor(max_workers=30)
    for i in range(1, allpagenum + 1):
        # BUGFIX: pass the callable plus args; the original invoked
        # actions(...) inline and submitted its None return value.
        threadPool.submit(actions, url, i, ipfromdatabaselist, databasequeue)



# Thread worker: fetch page `i` of `url` and harvest its proxies.
def actions(url, i, ipfromdatabaselist, databasequeue):
    """Validate one pooled proxy, then scrape a single listing page.

    BUGFIX: in the original the page fetch lived inside the retry `while`
    body, so whenever the first random proxy validated the function
    returned without fetching anything.
    """
    # Keep drawing random proxies from the pool until one validates.
    ipAndPortdic = random.choice(ipfromdatabaselist)
    userful = validateIp('https://blog.csdn.net/qq_40625030/article/details/79722996',
                         str(ipAndPortdic["host"]), ipAndPortdic["port"])
    while not userful:
        ipAndPortdic = random.choice(ipfromdatabaselist)
        userful = validateIp('https://blog.csdn.net/qq_40625030/article/details/79722996',
                             str(ipAndPortdic["host"]), ipAndPortdic["port"])

    onepageurl = url + "/" + str(i)
    # NOTE(review): the validated proxy is never actually used for this
    # request — presumably it should be passed via `proxies=`; confirm.
    onepageresponse = requests.get(onepageurl, headers=getheaders())
    getOnepageIp(onepageresponse, databasequeue)


# Consumer loop: drain the queue and persist each proxy into MySQL.
def database(databasequeue):
    """Run forever, upserting every queued {"host","port"} dict into ip.iptable.

    Runs in its own process; the caller terminates it once crawling is done.
    """
    # BUGFIX: open one connection up front instead of leaking a new,
    # never-closed connection on every loop iteration.
    connect = pymysql.Connect(host='127.0.0.1', port=3308, user='root',
                              password='zc19970919', db='ip', charset='utf8')
    # Parameterized query instead of %-string building (SQL-injection safe).
    sql = ("insert into ip.iptable (ip_address,port) values (%s,%s) "
           "on duplicate key update port=%s")
    while True:
        try:
            ipdic = databasequeue.get(block=True)
            with connect.cursor() as cursor:
                cursor.execute(sql, (ipdic["host"], ipdic["port"], ipdic["port"]))
                print(cursor.lastrowid)
            # transactions are committed on the connection, not the cursor
            connect.commit()
        except Exception:
            print(traceback.format_exc())
            connect.rollback()



def getDatabaseIpList():
    sql="select * from ip.iptable"
    connect=pymysql.connect(host='127.0.0.1',port=3308,user='root',password='zc19970919',db='ip',charset='utf8')
    try:
        cursor=connect.cursor()
        cursor.execute(sql)
        # 返回的是一個二維元組
        result=cursor.fetchall()
        resultList=[]
        for each in result:
            resultdic={}
            resultdic["host"]=each[1]
            resultdic["port"]=each[2]
            resultList.append(resultdic)
        return resultList

    except Exception as e:
        traceback.print_exc()

# One crawler process per site section (alternative: multiprocessing.Pool).
class MyProcess(multiprocessing.Process):
    """Process that crawls one xicidaili section through a pooled proxy.

    :param name: diagnostic process name
    :param url: section entry url
    :param iplist: shared list of known-good {"host","port"} proxy dicts
    :param resultqueqeu: shared queue that collects newly found proxies
    """

    def __init__(self, name, url, iplist, resultqueqeu):
        super().__init__()
        self.name = name
        self.url = url
        self.ipfromdatabaselist = iplist
        self.databasequeue = resultqueqeu

    def run(self):
        try:
            # Keep drawing random proxies from the pool until one validates.
            ipAndPortdic = random.choice(self.ipfromdatabaselist)
            userful = validateIp('https://blog.csdn.net/qq_40625030/article/details/79722996',
                                 str(ipAndPortdic["host"]), ipAndPortdic["port"])
            while not userful:
                ipAndPortdic = random.choice(self.ipfromdatabaselist)
                userful = validateIp('https://blog.csdn.net/qq_40625030/article/details/79722996',
                                     str(ipAndPortdic["host"]), ipAndPortdic["port"])
            host = str(ipAndPortdic["host"])
            port = str(ipAndPortdic["port"])
            proxy = {"https": "https://" + host + ":" + port,
                     "http": "http://" + host + ":" + port}
            response = requests.get(self.url, proxies=proxy, verify=False,
                                    timeout=8, headers=getheaders())
            bs = bs4.BeautifulSoup(response.text, "lxml")
            # bs4 css ">" selector needs spaces on both sides
            pagebar = bs.select("div.pagination > a")
            # second-to-last pagination link holds the total page count
            allpagenum = int(pagebar[len(pagebar) - 2].text)
        except Exception as e:
            print(e)
            # BUGFIX: retry and STOP. The original fell through after the
            # recursive retry returned and then used an unbound `allpagenum`.
            self.run()
            return
        threadPool = ThreadPoolExecutor(max_workers=30)
        for i in range(1, allpagenum + 1):
            # BUGFIX: pass the callable plus args; the original invoked
            # actions(...) inline and submitted its None return value.
            future = threadPool.submit(actions, self.url, i,
                                       self.ipfromdatabaselist, self.databasequeue)
            print(self.name)
            print(type(future))

# Dedicated process that owns the MySQL writer loop.
class MyDatabaseProcess(multiprocessing.Process):
    """Wrap the blocking `database` consumer so it runs in its own process."""

    def __init__(self, name, databasequeue):
        super().__init__()
        self.databasequeue = databasequeue
        self.name = name

    def run(self):
        # Delegate to the module-level writer loop (blocks forever).
        database(self.databasequeue)


if __name__ == '__main__':
    # Multiprocessing shared objects must be created under the main guard.
    databasequeue = multiprocessing.Manager().Queue()
    # BUGFIX: the original also built a Manager().list() here and immediately
    # rebound the name, discarding it — the plain list is what gets shared.
    ipfromdatabaselist = getDatabaseIpList()

    # One crawler process per site section.
    urllist = list(urldic.values())
    crawlers = [MyProcess(name="process%d" % (idx + 1), url=section,
                          iplist=ipfromdatabaselist, resultqueqeu=databasequeue)
                for idx, section in enumerate(urllist)]
    databaseProcess = MyDatabaseProcess("databaseProcess", databasequeue)

    for idx, proc in enumerate(crawlers, start=1):
        proc.start()
        print("進程%d開始" % idx)
    databaseProcess.start()
    # join so the main process (and the Manager connection) stays alive
    # while the children run
    for idx, proc in enumerate(crawlers, start=1):
        proc.join()
        print("進程%d結束" % idx)
    if not any(proc.is_alive() for proc in crawlers):
        # BUGFIX: close() raises ValueError on a live process, and the writer
        # loop never exits on its own — terminate it instead.
        databaseProcess.terminate()
    databaseProcess.join()
    print("全部結束,停止寫入數據庫!")





 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章