最近工作狀態不飽滿,想看看招聘網站上Java工程師都能給多少錢,於是就試試動手寫爬蟲,最開始使用的是requests,結果發現請求一直報403,估計是網站都有反爬蟲手段,目前水平有限,不知道怎麼反爬蟲,看來這條路行不通
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib
import random
#網絡數據爬蟲
class DataParser():
    """Fetch a job-listing page from so.dajie.com via urllib.

    Spoofs a real browser by sending a randomly chosen User-Agent, a
    captured session cookie, and (optionally) routing through an HTTP
    proxy, in an attempt to get past the site's 403 anti-crawler response.
    """

    def __init__(self):
        # Dajie.com job-search landing page. (The site also exposes an
        # AJAX search endpoint under /job/ajax/search/filter — not used here.)
        self.url = 'https://so.dajie.com/job'
        # Session cookie captured from a real browser visit; without it the
        # site tends to answer 403.
        self.cookie = 'DJ_RF=https%3A%2F%2Fwww.cnblogs.com%2F; DJ_EU=http%3A%2F%2Fso.dajie.com%2Fjob%2Fsearch%3Fkeyword%3D%25E7%25A8%258B%25E5%25BA%258F%25E5%2591%2598%26from%3Djob%26clicktype%3Dblank; DJ_UVID=MTU1MDY2Nzk1MjkwMjU0OTk1; _ga=GA1.2.78647553.1550667957; _gid=GA1.2.713733169.1550667957; SO_COOKIE_V2=f8acdg/Q4CyL1+4OXP8FIHPzBRCzKq814NAzk+VrHXMrA8B/YbmReGC/IGPe7WCnlVcB2J+Ievxhp/jyA68sSXLkzo4N6L6MtoFD; Hm_lvt_6822a51ffa95d58bbe562e877f743b4f=1550667957,1550667965,1550669479; Hm_lpvt_6822a51ffa95d58bbe562e877f743b4f=1550669479; _close_autoreg=1550669481723; _close_autoreg_num=3'
        # Pool of desktop browser User-Agent strings; one is picked at
        # random per DataParser instance to vary the request fingerprint.
        user_agents = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        ]
        self.user_agent = random.choice(user_agents)
        # Full browser-like header set. NOTE(review): only the User-Agent is
        # actually sent (via opener.addheaders below); the rest of this dict
        # is built but unused — kept as an attribute for reference/reuse.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': self.cookie,
            'Referer': 'https://so.dajie.com/job/search?keyword=java&from=job&clicktype=blank',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'x-requested-with': 'XMLHttpRequest',
        }
        # Hard-coded HTTPS proxy — presumably a throwaway test proxy; it may
        # well be dead by now. TODO: make configurable.
        proxy = {'https': '218.14.115.211:3128'}
        proxy_support = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPHandler)
        opener.addheaders = [('User-Agent', self.user_agent)]
        # Install the opener globally so plain urlopen() calls use it.
        # NOTE(review): the original assigned install_opener()'s return value
        # (always None) to self.req; that misleading assignment is dropped.
        urllib.request.install_opener(opener)

    def excute(self):
        """Download self.url and print the decoded body, then the HTTP status.

        (Name kept as `excute` — sic — for caller compatibility.)
        """
        # Context manager ensures the response is closed even if read fails.
        with urllib.request.urlopen(self.url) as response:
            body = response.read().decode('utf-8')
            status = response.status
        print(body)
        print(status)
if __name__ == '__main__':
    # Entry point: construct the scraper and fire a single request.
    DataParser().excute()
後來轉換了下思路,模擬瀏覽器行爲,抓取HTML元素是不是也行得通?如是便採用了selenium,讀取到dom節點的元素,進行逐個解析,嘗試了一番,居然真抓取到了數據
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib
import random
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time
#網絡數據爬蟲
#chrome driver下載目錄
#http://chromedriver.storage.googleapis.com/index.html?path=73.0.3683.20/
#網絡數據爬蟲
#chrome driver下載目錄
#http://chromedriver.storage.googleapis.com/index.html?path=73.0.3683.20/
class DataParser():
    """Scrape job listings from so.dajie.com by driving a real Chrome
    browser through Selenium, bypassing the site's anti-bot 403 responses.

    The selector strings ('xxxx', 'xxxxxx', ...) are placeholders the
    author redacted; they must be replaced with the site's real element
    ids / class names / XPaths before this runs end-to-end.
    """

    def __init__(self):
        self.url = 'https://so.dajie.com/job'
        # Search keyword typed into the site's search box.
        self.searchContent = "Java"
        # Path to the ChromeDriver binary matching the installed Chrome.
        # TODO: make configurable instead of hard-coding a local path.
        driverPath = 'E:\\SoftInstall\\Python3.8\\install\\chromedriver.exe'
        self.browser = webdriver.Chrome(executable_path=driverPath)

    def excute(self):
        """Open the site, search for the keyword, and print each result's
        link and title. (Name kept as `excute` — sic — for compatibility.)
        """
        browser = self.browser
        # Open the landing page.
        browser.get(self.url)
        # Dismiss the registration pop-up that covers the page.
        browser.find_element_by_class_name('xxxxxx').click()
        # Type the keyword into the search box and trigger the search.
        browser.find_element_by_id('xxxx').send_keys(self.searchContent)
        browser.find_element_by_id('xxxx').click()
        # Crude wait for the result list to render; an explicit
        # WebDriverWait on the result container would be more reliable.
        time.sleep(5)
        # implicitly_wait is a driver-wide setting: set it once here instead
        # of re-applying it on every loop iteration as the original did.
        browser.implicitly_wait(10)
        # The placeholder XPath addresses the result container's <li> items.
        results = browser.find_elements_by_xpath('xxxxxxxx')
        print(len(results))
        for index, item in enumerate(results):
            print(index)
            job_content = item.find_element_by_xpath('xxxxxx')
            # Posting URL.
            job_href = job_content.get_attribute('xxx')
            print(job_href)
            # Job title.
            job_name = job_content.text
            print(job_name)
            # TODO: also extract salary / location / experience / education
            # via find_element_by_class_name once the class names are known.
            print('***********************************')
if __name__ == '__main__':
    # Entry point: construct the Selenium scraper and run one crawl.
    DataParser().excute()
既然抓取到了數據,那麼後續的工作就是數據存儲和分析啦。不多說了,先研究下MongoDB,把爬到的數據保存下來。