Scraping Job Data from dajie.com with a Python Crawler

Work has been a bit slack lately, so I wanted to see how much the job sites are offering Java engineers and decided to try writing a crawler. I started with requests, but every request came back with 403. The site presumably has anti-scraping measures in place, and at my current level I don't know how to get around them, so that route looked like a dead end.
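For the record, the kind of requests call that kept returning 403 looked roughly like this. It is only a minimal sketch: the endpoint is the ajax search URL that appears commented out in the code further down, and the keyword value is just an example.

import requests

# Illustrative sketch of the first attempt; the endpoint matches the
# commented-out ajax search URL below, the keyword value is only an example
url = 'https://so.dajie.com/job/ajax/search/filter'
params = {'keyword': 'java', 'page': 1, 'ajax': 1}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

resp = requests.get(url, params=params, headers=headers)
print(resp.status_code)  # this kept printing 403 for me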

# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib
import random
# Web data crawler
class DataParser():
    def __init__(self):
        # Dajie.com
        #self.url ='https://so.dajie.com/job/ajax/search/filter?keyword=%E7%A8%8B%E5%BA%8F%E5%91%98&order=0&city=&recruitType=&salary=&experience=&page=1&positionFunction=&_CSRFToken=&ajax=1'
        self.url ='https://so.dajie.com/job'
        #self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        
        self.cookie = 'DJ_RF=https%3A%2F%2Fwww.cnblogs.com%2F; DJ_EU=http%3A%2F%2Fso.dajie.com%2Fjob%2Fsearch%3Fkeyword%3D%25E7%25A8%258B%25E5%25BA%258F%25E5%2591%2598%26from%3Djob%26clicktype%3Dblank; DJ_UVID=MTU1MDY2Nzk1MjkwMjU0OTk1; _ga=GA1.2.78647553.1550667957; _gid=GA1.2.713733169.1550667957; SO_COOKIE_V2=f8acdg/Q4CyL1+4OXP8FIHPzBRCzKq814NAzk+VrHXMrA8B/YbmReGC/IGPe7WCnlVcB2J+Ievxhp/jyA68sSXLkzo4N6L6MtoFD; Hm_lvt_6822a51ffa95d58bbe562e877f743b4f=1550667957,1550667965,1550669479; Hm_lpvt_6822a51ffa95d58bbe562e877f743b4f=1550669479; _close_autoreg=1550669481723; _close_autoreg_num=3'
        #=======================================================================
        user_agents=[
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        ]
        self.user_agent = random.choice(user_agents)
        #=======================================================================
        
        self.headers  = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cache-Control':'max-age=0',
            'Connection':'keep-alive',
            'Cookie':self.cookie,
            'Referer':'https://so.dajie.com/job/search?keyword=java&from=job&clicktype=blank',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent':self.user_agent,
            'x-requested-with': 'XMLHttpRequest'
            }
        
        #proxy = {'https':'10.107.228.126:8080', 'http':'218.14.115.211:3128'}
        
        proxy = {'https':'218.14.115.211:3128'}
        
        proxy_support = urllib.request.ProxyHandler(proxy)
        
        opener =urllib.request.build_opener(proxy_support,urllib.request.HTTPHandler)
        
        opener.addheaders = [('User-Agent',self.user_agent)]
        
        #opener.addheaders = self.headers
        
        # install_opener returns None, so there is nothing useful to assign;
        # it simply makes this opener the global default used by urlopen()
        urllib.request.install_opener(opener)
        #self.req=urllib.request.Request(url=self.url,headers=self.headers)
        
    def execute(self):
        response = urllib.request.urlopen(self.url)
        print(response.read().decode('utf-8'))
        print(response.status)

if __name__ == '__main__':
    parser = DataParser()
    parser.execute()

Then I changed my approach: if I simulate real browser behaviour, maybe grabbing the HTML elements would work? So I switched to Selenium, read the elements from the DOM nodes, and parsed them one by one. After some tinkering it actually pulled the data down.

# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib
import random
import selenium
from selenium  import webdriver
from bs4 import BeautifulSoup
import time
# Web data crawler
# ChromeDriver download page:
# http://chromedriver.storage.googleapis.com/index.html?path=73.0.3683.20/
class DataParser():
    def __init__(self):
        self.url ='https://so.dajie.com/job'
        
        self.searchContent="Java"
        # Path to the local Chrome driver executable
        driverPath = 'E:\\SoftInstall\\Python3.8\\install\\chromedriver.exe'        
        self.browser=webdriver.Chrome(executable_path=driverPath) 
    def execute(self):
        browser = self.browser
        # Open the page in the browser
        browser.get(self.url)     
        # Close the registration pop-up
        register_close_btn = browser.find_element_by_class_name('xxxxxx')
        register_close_btn.click()
        # Enter the search term and trigger the search
        search_input_btn =  browser.find_element_by_id('xxxx')
        search_input_btn.send_keys(self.searchContent)
        browser.find_element_by_id('xxxx').click()
        
        # (Alternative) grab the query results via JavaScript
        #script = 'return document.getElementById("xxxxxxx")'
        #doc = browser.execute_script(script).page_source()
        #print(doc)
        time.sleep(5)
        
        # xxxxx is the container div; ul is the child tag
        # Grab each tag via the XPath copied from the browser
        data = browser.find_elements_by_xpath('xxxxxxxx')
        
        print(len(data))
        # Each result item sits in an li tag
        for i in range(len(data)):
            print(i)
            job_content = data[i].find_element_by_xpath('xxxxxx')            
            # Link to the job posting
            job_href = job_content.get_attribute('xxx')
            print(job_href)
            # Job title
            job_name = job_content.text
            print(job_name)
            browser.implicitly_wait(10)
            # Salary
            #salary = data[i].find_element_by_class_name('xxx').text
            #print(salary) 
            # Location
            #ads = data[i].find_element_by_class_name('xxx').text
            #print(ads)
            # Experience requirement
            #suffer = data[i].find_element_by_class_name('xxx').text
            #print(suffer)
            # Education requirement
            #edu = data[i].find_element_by_class_name('xxx').text
            #print(edu)
            print('***********************************')
        
if __name__ == '__main__':
    parser = DataParser()
    parser.execute()
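One design note on the code above: the fixed time.sleep(5) works but is brittle. If I revisit this, an explicit wait inside execute() would return as soon as the results render. This is only a sketch, reusing the same redacted XPath placeholder:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Inside execute(), in place of time.sleep(5) + find_elements_by_xpath:
# wait up to 10 seconds for the result list to appear, then return the elements
wait = WebDriverWait(browser, 10)
data = wait.until(EC.presence_of_all_elements_located((By.XPATH, 'xxxxxxxx')))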

 

Now that the data is being scraped, the next steps are storing and analysing it. Without going into detail here, I'll first look into MongoDB as a place to save the scraped data.
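A minimal sketch of that MongoDB step with pymongo, assuming a local mongod on the default port; the database/collection names and the document fields are my own placeholders, mirroring what the Selenium loop prints:

from pymongo import MongoClient

# Assumes a local MongoDB instance on the default port 27017
client = MongoClient('mongodb://localhost:27017/')
collection = client['dajie']['jobs']  # hypothetical database and collection names

# One document per scraped job; fields mirror what the Selenium loop prints
job = {
    'name': 'Java工程师',   # job title
    'href': '',              # job posting link
    'salary': '',
    'city': '',
    'experience': '',
    'education': ''
}
collection.insert_one(job)
print(collection.count_documents({}))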
