import requests
from bs4 import BeautifulSoup
from get_proxy import GetProxy
from urllib import parse
from day03.pymysql_text import Mysql_text
# Request headers shared by every request (module-level so both functions can use them).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
# Walk the Boss Zhipin listing pages and hand every detail-page URL to get_boss_info.
def get_all_info(my_ip):
    """Crawl listing pages 1-3 of the Python job search and scrape each job's detail page.

    my_ip: proxy provider exposing ``.proxy`` (a requests ``proxies`` dict) and
           ``.updata_proxy()`` to rotate to a fresh proxy once the current IP is banned.
           (Name/spelling kept as defined by the GetProxy class.)
    """
    # Outer loop: listing pages 1..3.
    for page in range(1, 4):
        url = ('https://www.zhipin.com/c101010100/h_101010100/'
               '?query=python&page=%s&ka=page-%s' % (page, page))
        # Retry the listing page up to 4 times, rotating the proxy on failure.
        for _page_attempt in range(4):
            try:
                response = requests.get(url, headers=headers,
                                        proxies=my_ip.proxy, timeout=5)
                soup = BeautifulSoup(response.text, 'lxml')
                a_list = soup.select('div.info-primary > h3 > a ')
                for a_ele in a_list:
                    # Join the relative href onto the listing URL to get the detail page.
                    info_url = parse.urljoin(url, a_ele['href'])
                    print(info_url)
                    # Retry each detail page with fresh proxies as well.
                    for _detail_attempt in range(4):
                        try:
                            get_boss_info(my_ip, info_url)
                            # Success: stop retrying so proxies aren't wasted.
                            break
                        except Exception as e:
                            # Current proxy is likely banned — fetch a new one and retry.
                            my_ip.updata_proxy()
                            print(e)
                # BUG FIX: the original had no break here, so every listing page was
                # fetched (and its jobs re-inserted) 4 times. Stop once it succeeded.
                break
            except Exception as e:
                print(e)
                my_ip.updata_proxy()
# Scrape one job detail page and persist the extracted fields to MySQL.
def get_boss_info(my_ip, info_url):
    """Fetch ``info_url`` through the current proxy, extract the job fields,
    and insert one row into the ``boss`` table via the Mysql_text wrapper.
    Raises on any request/parse failure so the caller can rotate the proxy.
    """
    resp = requests.get(info_url, proxies=my_ip.proxy, headers=headers, timeout=5)
    page = BeautifulSoup(resp.text, 'lxml')
    # Job title, salary badge, address line and the requirements paragraph.
    title = page.select('h1')[0].text
    price = page.find('span', class_="badge").text.replace('\n', '').strip()
    address = page.select('div.info-primary p')[0].text.replace('\n', '').strip()
    yaoqiu = page.select('div.text')[0].text
    record = (title, price, address, yaoqiu)
    print(record)
    # Instantiate the previously written MySQL helper and run the parameterized insert.
    db = Mysql_text()
    db.sqlzz('insert into boss(title,price,address,yaoqiu) VALUES (%s,%s,%s,%s)', record)
if __name__ == '__main__':
    # Build the proxy provider, then kick off the crawl with it.
    proxy_source = GetProxy()
    get_all_info(proxy_source)
# Scrape Boss Zhipin job listings through rotating proxies.
# (The original file ended with web-page comment-section residue — "post a comment /
# all comments / no one has commented yet" — which was bare text at module level and
# would raise a SyntaxError; it is removed and the article title kept as this comment.)