Python Crawler (Paginated Scraping of the Tencent Recruitment Site)

1. Create the project tencentSpider

scrapy startproject tencentSpider
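
This generates the project skeleton, roughly as follows (middlewares.py appears in Scrapy 1.x and later):

tencentSpider/
    scrapy.cfg
    tencentSpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py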

2. Enter the project directory and create the spider

scrapy genspider -t crawl tencent hr.tencent.com
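
This writes a CrawlSpider skeleton to spiders/tencent.py; the exact template varies by Scrapy version, but it looks roughly like this:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TencentSpider(CrawlSpider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item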

3. Edit items.py and define the item class with the fields to extract

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # position name
    positionname = scrapy.Field()
    # position detail link
    positionlink = scrapy.Field()
    # position type
    positionType = scrapy.Field()
    # number of people needed
    peopleNum = scrapy.Field()
    # work location
    worklocation = scrapy.Field()
    # publish time
    publishTime = scrapy.Field()
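
A scrapy.Item behaves like a dict restricted to the declared fields; a quick sanity check from a Python shell inside the project (assuming the package is named tencentSpider):

from tencentSpider.items import TencentspiderItem

item = TencentspiderItem()
item['positionname'] = 'Backend Engineer'  # fields are set like dict keys
print(dict(item))  # {'positionname': 'Backend Engineer'}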

4. Write the pipeline file to process the data

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentspiderPipeline(object):
    # create and open the json output file
    def __init__(self):
        # open with encoding='utf-8', otherwise non-ASCII text comes out garbled
        self.filename = open("tencent.json", "w", encoding='utf-8')

    # process_item is required; it handles each scraped item
    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text)
        return item

    # close_spider is optional; it is called when the spider finishes
    def close_spider(self, spider):
        self.filename.close()
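
The same result can be had with Scrapy's built-in JsonLinesItemExporter instead of hand-rolled json.dumps; a sketch (the class name TencentJsonLinesPipeline is ours, not part of the tutorial):

from scrapy.exporters import JsonLinesItemExporter


class TencentJsonLinesPipeline(object):
    def open_spider(self, spider):
        # the exporter writes bytes, so open the file in binary mode
        self.file = open("tencent.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()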

5. Modify the settings file (two changes)

# default request headers
DEFAULT_REQUEST_HEADERS = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  # 'Accept-Language': 'en',
}
# item pipeline: processes the downloaded data
ITEM_PIPELINES = {
   'tencentSpider.pipelines.TencentspiderPipeline': 300,
}

If the scraped data scrolls past too quickly in the console and you want to keep it locally, you can also add:

# file name for saved log messages
LOG_FILE = "tencentlog.log"
# log level: messages at or above this level are saved
LOG_LEVEL = "DEBUG"
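
Two optional settings worth considering while debugging (our suggestion, not part of the original steps):

# pause between requests so the crawl does not hammer the server
DOWNLOAD_DELAY = 1
# Scrapy obeys robots.txt by default; disable only if you accept the implications
ROBOTSTXT_OBEY = False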

6. Write the spider

# -*- coding: utf-8 -*-
import scrapy
# import LinkExtractor, which extracts the links that match a rule
from scrapy.linkextractors import LinkExtractor
# import the CrawlSpider class and Rule
from scrapy.spiders import CrawlSpider, Rule
from ..items import TencentspiderItem

class TencentSpider(CrawlSpider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?start=0#a']

    # link-extraction rule applied to each response: matches the pagination links
    pagelink = LinkExtractor(allow=(r"start=\d+",))
    # send a request for every extracted link, keep following new pages,
    # and call the specified callback on each response
    rules = [
        Rule(pagelink, callback='parseTencent', follow=True),
    ]
    # callback: parse each job row in the listing table
    def parseTencent(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            # position name
            item['positionname'] = each.xpath("./td[1]/a/text()").extract_first()
            # position detail link
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract_first()
            # position type
            item['positionType'] = each.xpath("./td[2]/text()").extract_first()
            # number of people to hire
            item['peopleNum'] = each.xpath("./td[3]/text()").extract_first()
            # work location
            item['worklocation'] = each.xpath("./td[4]/text()").extract_first()
            # publish time
            item['publishTime'] = each.xpath("./td[5]/text()").extract_first()
            yield item
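
Before running the full crawl, the XPath expressions can be tried interactively in scrapy shell:

scrapy shell "https://hr.tencent.com/position.php?start=0#a"

>>> rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
>>> rows[0].xpath("./td[1]/a/text()").extract_first()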

7. Run the spider

scrapy crawl tencent
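
Alternatively, Scrapy's built-in feed exports can write the file without the custom pipeline (the name tencent_feed.json is chosen here to avoid clashing with the pipeline's output):

scrapy crawl tencent -o tencent_feed.json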

8. The tencent.json file appears; its contents are the scraped data.
