Scraping Tencent job postings with Scrapy

Spider code: mmiao.py

The main code is as follows:

# -*- coding: utf-8 -*-
import scrapy
from miao.items import MiaoItem


class MmiaoSpider(scrapy.Spider):
    name = 'mmiao'
    offset = 0
    allowed_domains = ["tencent.com"]
    url = 'http://hr.tencent.com/position.php?&start='
    start_urls = ['http://hr.tencent.com/position.php?&start=' + str(offset)]
    addurl = 'https://hr.tencent.com/'

    def parse(self, response):
        # Each job posting is a table row with class 'even' or 'odd'
        for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
            item = MiaoItem()
            item['positionname'] = each.xpath('./td[1]/a/text()').extract()[0]
            # The href is relative, so prepend the site root
            item['positionlink'] = self.addurl + each.xpath('./td[1]/a/@href').extract()[0]
            # Some rows lack a category cell; default to an empty string so
            # the pipeline can still read the field without a KeyError
            item['positiontype'] = each.xpath('./td[2]/text()').extract_first(default='')
            item['peoplenum'] = each.xpath('./td[3]/text()').extract()[0]
            item['worklocation'] = each.xpath('./td[4]/text()').extract()[0]
            # Publish date
            item['publishtime'] = each.xpath('./td[5]/text()').extract()[0]
            yield item
        # Page through the listing, 10 postings per page, up to offset 1680;
        # yielding only inside the guard stops the crawl at the last page
        if self.offset < 1680:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
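
To try the spider without the scrapy CLI, you can drive it from a plain Python script. A minimal sketch, assuming the module path miao.spiders.mmiao implied by the file names above (run it from the project root):

# Minimal runner sketch -- assumes the spider module lives at
# miao/spiders/mmiao.py as in this project.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from miao.spiders.mmiao import MmiaoSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(MmiaoSpider)
process.start()  # blocks until the crawl finishes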

Item definitions: items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MiaoItem(scrapy.Item):
    positionname = scrapy.Field()
    positionlink = scrapy.Field()
    positiontype = scrapy.Field()
    peoplenum = scrapy.Field()
    worklocation = scrapy.Field()
    publishtime = scrapy.Field()
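
A MiaoItem behaves like a dict, except that only the declared fields can be set, which catches typos early. A quick illustration (the sample value is made up):

from miao.items import MiaoItem

item = MiaoItem()
item['positionname'] = 'Backend Engineer'  # sample value, made up
print(item['positionname'])
# item['salary'] = '20k'  # would raise KeyError: undeclared field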

Pipeline: pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class MiaoPipeline(object):
    def __init__(self):
        # Open the output file once; an explicit encoding avoids
        # UnicodeEncodeError when writing Chinese text
        self.f = open('123.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write each item as one '___'-separated record
        self.f.write(item['positionname'] + '___')
        self.f.write(item['positionlink'] + '___')
        self.f.write(item['positiontype'] + '___')
        self.f.write(item['peoplenum'] + '___')
        self.f.write(item['worklocation'] + '___')
        self.f.write(item['publishtime'] + '\n\n\n')
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider (note the spelling) when the crawl ends
        self.f.close()
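
If you would rather have machine-readable output, a JSON-lines variant of the pipeline is a small change. A sketch, with the output file name items.jl being my choice; it would replace (or sit alongside) MiaoPipeline in ITEM_PIPELINES:

import json


class JsonMiaoPipeline(object):
    def open_spider(self, spider):
        # open_spider/close_spider are each called once per crawl by Scrapy
        self.f = open('items.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line; ensure_ascii=False keeps Chinese readable
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()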

Settings: settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for miao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'miao'

SPIDER_MODULES = ['miao.spiders']
NEWSPIDER_MODULE = 'miao.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'miao (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
#HTTPERROR_ALLOWED_CODES = [403]  # if the crawl reports 403, add 403 here
#USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'miao.middlewares.MiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'miao.middlewares.MiaoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'miao.pipelines.MiaoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
FEED_EXPORT_ENCODING = 'utf-8'
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
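
Because FEED_EXPORT_ENCODING is already set to utf-8 above, Scrapy's built-in feed exporter can also dump the items directly, with no custom pipeline at all. A sketch of the two extra settings this would take (the output file name is my choice):

# Hypothetical additions to settings.py: let Scrapy's feed exporter
# write every yielded item as JSON lines, honoring FEED_EXPORT_ENCODING.
FEED_URI = 'tencent_jobs.jl'   # output path, chosen for this example
FEED_FORMAT = 'jsonlines'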

The result: each posting ends up in 123.txt as one '___'-separated record.

That's all for this blog post!

Full project download: https://download.csdn.net/download/qq_38162763/10586993

Bye!
