# Scrape Boss Zhipin (zhipin.com) job postings through rotating proxies (用代理的方式爬取boss直聘的信息)

import requests
from bs4 import BeautifulSoup
from get_proxy import GetProxy
from urllib import parse
from day03.pymysql_text import Mysql_text
#Request headers sent with every HTTP call; kept as a module-level global for convenient reuse
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    ),
}
#Collect the Boss Zhipin detail-page URLs from the search results and scrape each one
def get_all_info(my_ip, pages=range(1, 4), max_retries=4):
    """Scrape every job posting linked from the Python search-result pages.

    Each result page is fetched through ``my_ip.proxy``. Every posting link
    found on it is resolved to an absolute URL and passed to
    ``get_boss_info``. Whenever a step raises, the proxy is rotated with
    ``my_ip.updata_proxy()`` and that step alone is retried.

    Args:
        my_ip: Proxy holder exposing a ``proxy`` attribute (handed to
            ``requests`` as ``proxies``) and an ``updata_proxy()`` method.
        pages: Iterable of result-page numbers to visit. Defaults to pages
            1-3, the range that used to be hard-coded.
        max_retries: Maximum attempts per listing page and per detail page.
            Defaults to 4, the count that used to be hard-coded.

    Returns:
        None.
    """
    for page in pages:
        url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=%s&ka=page-%s' % (page, page)
        #None means every attempt at this listing page failed
        detail_urls = None
        for _ in range(max_retries):
            try:
                response = requests.get(url, headers=headers, proxies=my_ip.proxy, timeout=5)
                #An HTTP error status must trigger a retry rather than be parsed as an empty listing
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'lxml')
                #Resolve each posting link against the listing URL to get an absolute detail-page URL
                detail_urls = [
                    parse.urljoin(url, a_ele['href'])
                    for a_ele in soup.select('div.info-primary > h3 > a ')
                ]
            except Exception as e:
                #Any failure here is treated as a dead proxy: report it and switch to a new one
                print(e)
                my_ip.updata_proxy()
            else:
                #Stop retrying once the listing was read, so the page is not fetched and scraped again
                break
        if detail_urls is None:
            #Out of attempts for this page; move on to the next one
            continue
        for info_url in detail_urls:
            print(info_url)
            for _ in range(max_retries):
                try:
                    #Fetch, parse and store one posting
                    get_boss_info(my_ip, info_url)
                except Exception as e:
                    #Rotate the proxy and try the same posting again
                    my_ip.updata_proxy()
                    print(e)
                else:
                    #Success: stop here so no further proxies are used up on this posting
                    break
#Fetch one job detail page, pull out the fields we want and store them in MySQL
def get_boss_info(my_ip,info_url):
    """Download a posting page, extract its fields and insert them into ``boss``.

    The page is requested through ``my_ip.proxy`` with a 5 second timeout.
    The title, the ``span.badge`` text, the first ``div.info-primary p`` text
    and the first ``div.text`` text are extracted, printed as a tuple and
    inserted through ``Mysql_text().sqlzz``.

    Nothing is caught here: an exception from the request, from a missing
    element or from the database call propagates to the caller.
    """
    def _one_line(raw):
        #Drop embedded newlines, then trim surrounding whitespace
        return raw.replace('\n','').strip()

    page = requests.get(info_url,timeout=5,headers=headers,proxies = my_ip.proxy)
    doc = BeautifulSoup(page.text,'lxml')

    #Lookups stay in this order so the first missing element decides which error is raised
    title = doc.select('h1')[0].text
    badge = doc.find('span',class_ = "badge")
    price = _one_line(badge.text)
    primary_paragraphs = doc.select('div.info-primary p')
    address = _one_line(primary_paragraphs[0].text)
    yaoqiu = doc.select('div.text')[0].text

    record = (title,price,address,yaoqiu)
    print(record)
    #Use the project's MySQL wrapper to run a parameterised insert
    insert_sql = 'insert into boss(title,price,address,yaoqiu) VALUES (%s,%s,%s,%s)'
    store = Mysql_text()
    store.sqlzz(insert_sql,record)



if __name__ == '__main__':
    #Create the proxy provider and hand it straight to the crawl entry point
    get_all_info(GetProxy())


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章