spider 代碼:mmiao.py
主要代碼如下:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Selector
from miao.items import MiaoItem
class MmiaoSpider(scrapy.Spider):
    """Crawl the paginated job-listing table on hr.tencent.com.

    Each table row with class ``even`` or ``odd`` is turned into one
    ``MiaoItem``. After a page has been processed, the next page is requested
    by advancing ``offset`` in steps of ``page_step`` until ``max_offset`` is
    reached.
    """

    name = 'mmiao'
    offset = 0
    allowed_domains = ["tencent.com"]
    url = 'http://hr.tencent.com/position.php?&start='
    start_urls = ['http://hr.tencent.com/position.php?&start=' + str(offset)]
    addurl = 'https://hr.tencent.com/'
    # Pagination bounds; kept as class attributes so they can be overridden
    # without editing ``parse``.
    max_offset = 1680
    page_step = 10

    @staticmethod
    def _first(row, query):
        """Return the first match of *query* under *row*, or '' if none."""
        return row.xpath(query).extract_first(default='')

    def parse(self, response):
        """Yield one ``MiaoItem`` per listing row, then the next-page request.

        Cells that are missing or empty yield an empty string instead of
        raising, so a single incomplete row cannot abort the page (and with
        it the pagination request).
        """
        for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
            item = MiaoItem()
            item['positionname'] = self._first(each, './td[1]/a/text()')
            href = self._first(each, './td[1]/a/@href')
            # The href is appended to ``addurl`` as-is; keep the link empty
            # when the row has no anchor rather than emitting a bare base URL.
            item['positionlink'] = self.addurl + href if href else ''
            # Always set this field (possibly to '') so downstream consumers
            # can rely on every key being present.
            item['positiontype'] = self._first(each, './td[2]/text()')
            item['peoplenum'] = self._first(each, './td[3]/text()')
            item['worklocation'] = self._first(each, './td[4]/text()')
            # Publish time
            item['publishtime'] = self._first(each, './td[5]/text()')
            yield item

        # Request the following page once per response, after all rows.
        if self.offset < self.max_offset:
            self.offset += self.page_step
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
提取信息文件:items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MiaoItem(scrapy.Item):
    """Container for one scraped job listing.

    Declares six fields; each is a plain ``scrapy.Field()`` with no extra
    metadata.
    """

    # Link text and target of the first-column anchor.
    positionname = scrapy.Field()
    positionlink = scrapy.Field()

    # Remaining table columns, in page order.
    positiontype = scrapy.Field()
    peoplenum = scrapy.Field()
    worklocation = scrapy.Field()
    publishtime = scrapy.Field()
管道文件 pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class MiaoPipeline(object):
    """Append every scraped item to ``123.txt`` as one text record.

    A record is the six item fields joined by ``___`` and terminated by three
    newlines. The output file is opened when the pipeline is constructed and
    closed by ``close_spider``.
    """

    # Field names in the order they are written to each record.
    _FIELDS = (
        'positionname',
        'positionlink',
        'positiontype',
        'peoplenum',
        'worklocation',
        'publishtime',
    )

    def __init__(self):
        # Explicit UTF-8: the scraped text is Chinese, and the platform
        # default encoding is not guaranteed to be able to represent it.
        self.f = open('123.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write *item* as one record and return it unchanged.

        Fields that are absent or ``None`` are written as empty strings, so
        an incomplete item still produces a well-formed record instead of
        raising ``KeyError``.
        """
        values = [item.get(field) or '' for field in self._FIELDS]
        self.f.write('___'.join(values) + '\n\n\n')
        return item

    def close_spider(self, spider):
        """Close the output file; Scrapy calls this hook when the spider ends."""
        self.f.close()

    # Earlier revisions exposed this hook under a misspelled name (which
    # Scrapy never invoked); keep it as an alias for any direct callers.
    close_sider = close_spider
設置文件 settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for miao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# --- Project identity -------------------------------------------------------
BOT_NAME = "miao"
SPIDER_MODULES = ["miao.spiders"]
NEWSPIDER_MODULE = "miao.spiders"

# --- Politeness / identification ---------------------------------------------
# Template default for identifying the crawler (left disabled):
#USER_AGENT = 'miao (+http://www.yourdomain.com)'

# robots.txt rules are not obeyed by this project.
ROBOTSTXT_OBEY = False

# Disabled experiment: the response reported earlier was a 403, so 403 would
# be added to the allowed codes here.
#HTTPERROR_ALLOWED_CODES = [403]
#USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'

# Headers sent with every request: a desktop Chrome User-Agent string plus an
# HTML-first Accept header.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# --- Concurrency and throttling (template defaults, all disabled) ------------
# Maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Delay for requests to the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also the autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies (enabled by default)
#COOKIES_ENABLED = False

# Telnet console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Template alternative for the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# --- Middlewares and extensions (template defaults, all disabled) ------------
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'miao.middlewares.MiaoSpiderMiddleware': 543,
#}
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'miao.middlewares.MiaoDownloaderMiddleware': 543,
#}
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# --- Item pipelines ----------------------------------------------------------
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "miao.pipelines.MiaoPipeline": 300,
}

# --- AutoThrottle (disabled by default) --------------------------------------
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# --- Feed export -------------------------------------------------------------
FEED_EXPORT_ENCODING = "utf-8"

# --- HTTP caching (disabled by default) --------------------------------------
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
結果如下(原文此處為運行結果截圖;抓取到的數據會寫入 123.txt,各字段以「___」分隔):
本次博客記錄到此結束!
完整項目工程下載地址: https://download.csdn.net/download/qq_38162763/10586993
拜拜