1.創建項目tencentSpider
scrapy startproject tencentSpider
2.進入項目下創建爬蟲
scrapy genspider -t crawl tencent hr.tencent.com
3.編輯items.py 寫入提取字段模塊類
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class TencentspiderItem(scrapy.Item):
    """One job posting scraped from the Tencent HR listing table."""

    # Job title.
    positionname = scrapy.Field()
    # URL of the job-detail page.
    positionlink = scrapy.Field()
    # Job category.
    positionType = scrapy.Field()
    # Number of openings.
    peopleNum = scrapy.Field()
    # Work location.
    worklocation = scrapy.Field()
    # Publication date.
    publishTime = scrapy.Field()
4.編寫管道文件 pipelines.py,對爬取到的數據進行處理
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class TencentspiderPipeline(object):
    """Item pipeline that appends every scraped item to tencent.json,
    one JSON object per line (JSON Lines format)."""

    def __init__(self):
        # Open the output file once for the whole crawl.  utf-8 keeps the
        # Chinese text readable in the file instead of escaped/garbled.
        self.filename = open("tencent.json", "w", encoding='utf-8')

    def process_item(self, item, spider):
        """Required hook: serialise one item as a JSON line, then pass the
        item on unchanged so later pipelines can still see it."""
        json.dump(dict(item), self.filename, ensure_ascii=False)
        self.filename.write("\n")
        return item

    def close_spider(self, spider):
        """Optional hook: close the output file when the crawl finishes."""
        self.filename.close()
5.修改配置文件 settings.py(需要修改兩處)
# Default headers sent with every request.  A desktop Chrome User-Agent
# is supplied so the site serves the normal page instead of blocking the
# default Scrapy agent.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/55.0.2883.87 Safari/537.36"),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Enable the item pipeline that writes scraped items to tencent.json.
# BUG FIX: the original registered 'mySpider.pipelines.ItcastPipeline',
# a dotted path copied from a different project, so Scrapy would fail to
# load the pipeline and no output file would be produced.  This project's
# pipeline class is tencentSpider.pipelines.TencentspiderPipeline.
ITEM_PIPELINES = {
    'tencentSpider.pipelines.TencentspiderPipeline': 300,
}
(如果爬取下來的數據在界面上顯示得太快,想保存到本地查看)也可以添加
# Write log output to this file instead of the console.
LOG_FILE = "tencentlog.log"
# Minimum severity recorded; messages at this level or above are kept.
LOG_LEVEL = "DEBUG"
6.編寫爬蟲文件
# -*- coding: utf-8 -*-
import scrapy
#導入CrawlSpider 類和Rule
from scrapy.linkextractors import LinkExtractor
#導入連接規則匹配類,用來提取符合規則的鏈接
from scrapy.spiders import CrawlSpider, Rule
from ..items import TencentspiderItem
class TencentSpider(CrawlSpider):
    """Crawl spider that walks the paginated Tencent HR job listings and
    yields one TencentspiderItem per job row."""

    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?start=0#a']

    # Pagination link rule: any URL whose query string matches
    # "start=<digits>" is another listing page.  Raw string avoids the
    # invalid-escape-sequence warning for "\d" on modern Python.
    pagelink = LinkExtractor(allow=(r"start=\d+",))

    # Follow every extracted listing page and parse it with parseTencent.
    # NOTE: the callback must not be named "parse" -- CrawlSpider uses
    # that method internally.
    rules = [
        Rule(pagelink, callback='parseTencent', follow=True),
    ]

    def parseTencent(self, response):
        """Parse one listing page; job rows alternate class 'even'/'odd'."""
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            # Job title.
            # BUG FIX: the original assigned item['sitionname'], which is
            # not a declared field of TencentspiderItem and raises
            # KeyError on every row; the declared field is 'positionname'.
            item['positionname'] = each.xpath("./td[1]/a/text()").extract_first()
            # Link to the job-detail page.
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract_first()
            # Job category.
            item['positionType'] = each.xpath("./td[2]/text()").extract_first()
            # Number of openings.
            item['peopleNum'] = each.xpath("./td[3]/text()").extract_first()
            # Work location.
            item['worklocation'] = each.xpath("./td[4]/text()").extract_first()
            # Publication date.
            item['publishTime'] = each.xpath("./td[5]/text()").extract_first()
            yield item
7.執行爬蟲
scrapy crawl tencent
8.出現tencent.json文件,即是所需要的內容